Diffstat (limited to 'tools/perf/Documentation')
53 files changed, 5537 insertions, 681 deletions
diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index 3766886c4bca..83dc87c662b6 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -71,3 +71,31 @@ supported by GCC. UBSan detects undefined behaviors of programs at runtime. $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a If UBSan detects any problem at runtime, it outputs a “runtime error:” message. + +4) Cross compilation +==================== +As Multiarch is commonly supported in Linux distributions, we can install +libraries for multiple architectures on the same system and then cross-compile +Linux perf. For example, Aarch64 libraries and toolchains can be installed on +an x86_64 machine, allowing us to compile perf for an Aarch64 target. + +Below is the command for building the perf with dynamic linking. + + $ cd /path/to/Linux + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +For static linking, the option `LDFLAGS="-static"` is required. + + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ + LDFLAGS="-static" -C tools/perf + +In the embedded system world, a use case is to explicitly specify the package +configuration paths for cross building: + + $ PKG_CONFIG_SYSROOT_DIR="/path/to/cross/build/sysroot" \ + PKG_CONFIG_LIBDIR="/usr/lib/:/usr/local/lib" \ + make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the +variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to +the library paths for cross compilation. diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 6e54979c2124..4407b106d977 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -2,6 +2,10 @@ include ../../scripts/Makefile.include include ../../scripts/utilities.mak +ARTICLES = +# with their own formatting rules. +SP_ARTICLES = + MAN1_TXT= \ $(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \ $(wildcard perf-*.txt)) \ @@ -16,13 +20,6 @@ _MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT)) MAN_XML=$(addprefix $(OUTPUT),$(_MAN_XML)) MAN_HTML=$(addprefix $(OUTPUT),$(_MAN_HTML)) -ARTICLES = -# with their own formatting rules. 
-SP_ARTICLES = -API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt))) -SP_ARTICLES += $(API_DOCS) -SP_ARTICLES += technical/api-index - _DOC_HTML = $(_MAN_HTML) _DOC_HTML+=$(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES)) DOC_HTML=$(addprefix $(OUTPUT),$(_DOC_HTML)) @@ -173,7 +170,7 @@ ifneq ($(V),1) endif endif -all: html man +all: html man info html: $(DOC_HTML) @@ -186,8 +183,6 @@ man7: $(DOC_MAN7) info: $(OUTPUT)perf.info $(OUTPUT)perfman.info -pdf: $(OUTPUT)user-manual.pdf - install: install-man check-man-tools: @@ -225,11 +220,6 @@ install-info: info echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \ fi -install-pdf: pdf - $(call QUIET_INSTALL, Documentation-pdf) \ - $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir); \ - $(INSTALL) -m 644 $(OUTPUT)user-manual.pdf $(DESTDIR)$(pdfdir) - #install-html: html # '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) @@ -244,33 +234,13 @@ $(OUTPUT)doc.dep : $(wildcard *.txt) build-docdep.perl -include $(OUTPUT)doc.dep -_cmds_txt = cmds-ancillaryinterrogators.txt \ - cmds-ancillarymanipulators.txt \ - cmds-mainporcelain.txt \ - cmds-plumbinginterrogators.txt \ - cmds-plumbingmanipulators.txt \ - cmds-synchingrepositories.txt \ - cmds-synchelpers.txt \ - cmds-purehelpers.txt \ - cmds-foreignscminterface.txt -cmds_txt=$(addprefix $(OUTPUT),$(_cmds_txt)) - -$(cmds_txt): $(OUTPUT)cmd-list.made - -$(OUTPUT)cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT) - $(QUIET_GEN)$(RM) $@ && \ - $(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \ - date >$@ - CLEAN_FILES = \ $(MAN_XML) $(addsuffix +,$(MAN_XML)) \ $(MAN_HTML) $(addsuffix +,$(MAN_HTML)) \ $(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7) \ $(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++ \ - $(OUTPUT)perf.info $(OUTPUT)perfman.info \ - $(OUTPUT)howto-index.txt $(OUTPUT)howto/*.html $(OUTPUT)doc.dep \ - $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt \ - $(cmds_txt) $(OUTPUT)*.made + $(OUTPUT)perf.info $(OUTPUT)perfman.info $(OUTPUT)doc.dep \ + $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt clean: $(call QUIET_CLEAN, Documentation) $(RM) $(CLEAN_FILES) @@ -280,11 +250,20 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ mv $@+ $@ +# Generate date from either KBUILD_BUILD_TIMESTAMP or git log of +# the doc input file +PERF_DATE = $(strip \ + $(if $(KBUILD_BUILD_TIMESTAMP), \ + $(shell date -u -d '$(KBUILD_BUILD_TIMESTAMP)' +%Y-%m-%d), \ + $(shell git log -1 --pretty="format:%cd" \ + --date=short --no-show-signature $<))) + ifdef USE_ASCIIDOCTOR $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b manpage -d manpage \ - $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ + $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ + -adocdate=$(PERF_DATE) -o $@+ $< && \ mv $@+ $@ endif @@ -296,32 +275,12 @@ $(OUTPUT)%.xml : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b docbook -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ - -aperf_date=$(shell git log -1 --pretty="format:%cd" \ - --date=short $<) \ - -o $@+ $< && \ + -aperf_date=$(PERF_DATE) -o $@+ $< && \ mv $@+ $@ XSLT = docbook.xsl XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css -$(OUTPUT)user-manual.html: $(OUTPUT)user-manual.xml - $(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $< - -$(OUTPUT)perf.info: $(OUTPUT)user-manual.texi - 
$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ $(OUTPUT)user-manual.texi - -$(OUTPUT)user-manual.texi: $(OUTPUT)user-manual.xml - $(QUIET_DB2TEXI)$(RM) $@+ $@ && \ - $(DOCBOOK2X_TEXI) $(OUTPUT)user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \ - $(PERL_PATH) fix-texi.perl <$@++ >$@+ && \ - rm $@++ && \ - mv $@+ $@ - -$(OUTPUT)user-manual.pdf: $(OUTPUT)user-manual.xml - $(QUIET_DBLATEX)$(RM) $@+ $@ && \ - $(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \ - mv $@+ $@ - $(OUTPUT)perfman.texi: $(MAN_XML) cat-texi.perl $(QUIET_DB2TEXI)$(RM) $@+ $@ && \ ($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \ @@ -331,28 +290,18 @@ $(OUTPUT)perfman.texi: $(MAN_XML) cat-texi.perl mv $@+ $@ $(OUTPUT)perfman.info: $(OUTPUT)perfman.texi - $(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi + $(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate -o $@ $*.texi $(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml $(QUIET_DB2TEXI)$(RM) $@+ $@ && \ $(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \ mv $@+ $@ -howto-index.txt: howto-index.sh $(wildcard howto/*.txt) - $(QUIET_GEN)$(RM) $@+ $@ && \ - '$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \ - mv $@+ $@ - $(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt $(QUIET_ASCIIDOC)$(ASCIIDOC) -b $(ASCIIDOC_HTML) $*.txt WEBDOC_DEST = /pub/software/tools/perf/docs -$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt - $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ - sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b $(ASCIIDOC_HTML) - >$@+ && \ - mv $@+ $@ - # UNIMPLEMENTED #install-webdoc : html # '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST) diff --git a/tools/perf/Documentation/arm-coresight.txt b/tools/perf/Documentation/arm-coresight.txt new file mode 100644 index 000000000000..c117fc50a2a9 --- /dev/null +++ b/tools/perf/Documentation/arm-coresight.txt @@ -0,0 +1,5 @@ +Arm CoreSight Support +===================== + +For full documentation, see Documentation/trace/coresight/coresight-perf.rst +in the kernel tree. diff --git a/tools/perf/Documentation/build-docdep.perl b/tools/perf/Documentation/build-docdep.perl new file mode 100755 index 000000000000..ba4205e0302a --- /dev/null +++ b/tools/perf/Documentation/build-docdep.perl @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +my %include = (); +my %included = (); + +for my $text (<*.txt>) { + open I, '<', $text || die "cannot read: $text"; + while (<I>) { + if (/^include::/) { + chomp; + s/^include::\s*//; + s/\[\]//; + $include{$text}{$_} = 1; + $included{$_} = 1; + } + } + close I; +} + +# Do we care about chained includes??? +my $changed = 1; +while ($changed) { + $changed = 0; + while (my ($text, $included) = each %include) { + for my $i (keys %$included) { + # $text has include::$i; if $i includes $j + # $text indirectly includes $j. + if (exists $include{$i}) { + for my $j (keys %{$include{$i}}) { + if (!exists $include{$text}{$j}) { + $include{$text}{$j} = 1; + $included{$j} = 1; + $changed = 1; + } + } + } + } + } +} + +while (my ($text, $included) = each %include) { + if (! 
exists $included{$text} && + (my $base = $text) =~ s/\.txt$//) { + print "$base.html $base.xml : ", join(" ", keys %$included), "\n"; + } +} diff --git a/tools/perf/Documentation/callchain-overhead-calculation.txt b/tools/perf/Documentation/callchain-overhead-calculation.txt index 1a757927195e..e0202bf5bd1a 100644 --- a/tools/perf/Documentation/callchain-overhead-calculation.txt +++ b/tools/perf/Documentation/callchain-overhead-calculation.txt @@ -1,7 +1,8 @@ Overhead calculation -------------------- -The overhead can be shown in two columns as 'Children' and 'Self' when -perf collects callchains. The 'self' overhead is simply calculated by +The CPU overhead can be shown in two columns as 'Children' and 'Self' +when perf collects callchains (and corresponding 'Wall' columns for +wall-clock overhead). The 'self' overhead is simply calculated by adding all period values of the entry - usually a function (symbol). This is the value that perf shows traditionally and sum of all the 'self' overhead values should be 100%. diff --git a/tools/perf/Documentation/cat-texi.perl b/tools/perf/Documentation/cat-texi.perl new file mode 100755 index 000000000000..14d2f8341517 --- /dev/null +++ b/tools/perf/Documentation/cat-texi.perl @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +my @menu = (); +my $output = $ARGV[0]; + +open my $tmp, '>', "$output.tmp"; + +while (<STDIN>) { + next if (/^\\input texinfo/../\@node Top/); + next if (/^\@bye/ || /^\.ft/); + if (s/^\@top (.*)/\@node $1,,,Top/) { + push @menu, $1; + } + s/\(\@pxref\{\[(URLS|REMOTES)\]}\)//; + s/\@anchor\{[^{}]*\}//g; + print $tmp $_; +} +close $tmp; + +print '\input texinfo +@setfilename gitman.info +@documentencoding UTF-8 +@dircategory Development +@direntry +* Git Man Pages: (gitman). Manual pages for Git revision control system +@end direntry +@node Top,,, (dir) +@top Git Manual Pages +@documentlanguage en +@menu +'; + +for (@menu) { + print "* ${_}::\n"; +} +print "\@end menu\n"; +open $tmp, '<', "$output.tmp"; +while (<$tmp>) { + print; +} +close $tmp; +print "\@bye\n"; +unlink "$output.tmp"; diff --git a/tools/perf/Documentation/cpu-and-latency-overheads.txt b/tools/perf/Documentation/cpu-and-latency-overheads.txt new file mode 100644 index 000000000000..3b6d63705465 --- /dev/null +++ b/tools/perf/Documentation/cpu-and-latency-overheads.txt @@ -0,0 +1,85 @@ +CPU and latency overheads +------------------------- +There are two notions of time: wall-clock time and CPU time. +For a single-threaded program, or a program running on a single-core machine, +these notions are the same. However, for a multi-threaded/multi-process program +running on a multi-core machine, these notions are significantly different. +Each second of wall-clock time we have number-of-cores seconds of CPU time. +Perf can measure overhead for both of these times (shown in 'overhead' and +'latency' columns for CPU and wall-clock time correspondingly). + +Optimizing CPU overhead is useful to improve 'throughput', while optimizing +latency overhead is useful to improve 'latency'. It's important to understand +which one is useful in a concrete situation at hand. For example, the former +may be useful to improve max throughput of a CI build server that runs on 100% +CPU utilization, while the latter may be useful to improve user-perceived +latency of a single interactive program build. +These overheads may be significantly different in some cases. 
For example, +consider a program that executes function 'foo' for 9 seconds with 1 thread, +and then executes function 'bar' for 1 second with 128 threads (consumes +128 seconds of CPU time). The CPU overhead is: 'foo' - 6.6%, 'bar' - 93.4%. +While the latency overhead is: 'foo' - 90%, 'bar' - 10%. If we try to optimize +running time of the program looking at the (wrong in this case) CPU overhead, +we would concentrate on the function 'bar', but it can yield only 10% running +time improvement at best. + +By default, perf shows only CPU overhead. To show latency overhead, use +'perf record --latency' and 'perf report': + +----------------------------------- +Overhead Latency Command + 93.88% 25.79% cc1 + 1.90% 39.87% gzip + 0.99% 10.16% dpkg-deb + 0.57% 1.00% as + 0.40% 0.46% sh +----------------------------------- + +To sort by latency overhead, use 'perf report --latency': + +----------------------------------- +Latency Overhead Command + 39.87% 1.90% gzip + 25.79% 93.88% cc1 + 10.16% 0.99% dpkg-deb + 4.17% 0.29% git + 2.81% 0.11% objtool +----------------------------------- + +To get insight into the difference between the overheads, you may check +parallelization histogram with '--sort=latency,parallelism,comm,symbol --hierarchy' +flags. It shows fraction of (wall-clock) time the workload utilizes different +numbers of cores ('Parallelism' column). For example, in the following case +the workload utilizes only 1 core most of the time, but also has some +highly-parallel phases, which explains significant difference between +CPU and wall-clock overheads: + +----------------------------------- + Latency Overhead Parallelism / Command / Symbol ++ 56.98% 2.29% 1 ++ 16.94% 1.36% 2 ++ 4.00% 20.13% 125 ++ 3.66% 18.25% 124 ++ 3.48% 17.66% 126 ++ 3.26% 0.39% 3 ++ 2.61% 12.93% 123 +----------------------------------- + +By expanding corresponding lines, you may see what commands/functions run +at the given parallelism level: + +----------------------------------- + Latency Overhead Parallelism / Command / Symbol +- 56.98% 2.29% 1 + 32.80% 1.32% gzip + 4.46% 0.18% cc1 + 2.81% 0.11% objtool + 2.43% 0.10% dpkg-source + 2.22% 0.09% ld + 2.10% 0.08% dpkg-genchanges +----------------------------------- + +To see the normal function-level profile for particular parallelism levels +(number of threads actively running on CPUs), you may use '--parallelism' +filter. For example, to see the profile only for low parallelism phases +of a workload use '--latency --parallelism=1-2' flags. diff --git a/tools/perf/Documentation/examples.txt b/tools/perf/Documentation/examples.txt index a4e392156488..c0d22fbe9201 100644 --- a/tools/perf/Documentation/examples.txt +++ b/tools/perf/Documentation/examples.txt @@ -3,7 +3,7 @@ ****** perf by examples ****** ------------------------------ -[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ] +[ From an e-mail by Ingo Molnar, https://lore.kernel.org/lkml/20090804195717.GA5998@elte.hu ] First, discovery/enumeration of available counters can be done via diff --git a/tools/perf/Documentation/guest-files.txt b/tools/perf/Documentation/guest-files.txt new file mode 100644 index 000000000000..8cc0b092f996 --- /dev/null +++ b/tools/perf/Documentation/guest-files.txt @@ -0,0 +1,16 @@ +include::guestmount.txt[] + +--guestkallsyms=<path>:: + Guest OS /proc/kallsyms file copy. perf reads it to get guest + kernel symbols. Users copy it out from guest OS. + +--guestmodules=<path>:: + Guest OS /proc/modules file copy. 
perf reads it to get guest + kernel module information. Users copy it out from guest OS. + +--guestvmlinux=<path>:: + Guest OS kernel vmlinux. + +--guest-code:: + Indicate that guest code can be found in the hypervisor process, + which is a common case for KVM test programs. diff --git a/tools/perf/Documentation/guestmount.txt b/tools/perf/Documentation/guestmount.txt new file mode 100644 index 000000000000..6edf12363add --- /dev/null +++ b/tools/perf/Documentation/guestmount.txt @@ -0,0 +1,11 @@ +--guestmount=<path>:: + Guest OS root file system mount directory. Users mount guest OS + root directories under <path> by a specific filesystem access method, + typically, sshfs. + For example, start 2 guest OS, one's pid is 8888 and the other's is 9999: +[verse] + $ mkdir \~/guestmount + $ cd \~/guestmount + $ sshfs -o allow_other,direct_io -p 5551 localhost:/ 8888/ + $ sshfs -o allow_other,direct_io -p 5552 localhost:/ 9999/ + $ perf {GMEXAMPLECMD} --guestmount=~/guestmount {GMEXAMPLESUBCMD} diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt new file mode 100644 index 000000000000..0379903673a4 --- /dev/null +++ b/tools/perf/Documentation/intel-hybrid.txt @@ -0,0 +1,204 @@ +Intel hybrid support +-------------------- +Support for Intel hybrid events within perf tools. + +For some Intel platforms, such as AlderLake, which is hybrid platform and +it consists of atom cpu and core cpu. Each cpu has dedicated event list. +Part of events are available on core cpu, part of events are available +on atom cpu and even part of events are available on both. + +Kernel exports two new cpu pmus via sysfs: +/sys/bus/event_source/devices/cpu_core +/sys/bus/event_source/devices/cpu_atom + +The 'cpus' files are created under the directories. For example, + +cat /sys/bus/event_source/devices/cpu_core/cpus +0-15 + +cat /sys/bus/event_source/devices/cpu_atom/cpus +16-23 + +It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus. + +As before, use perf-list to list the symbolic event. + +perf list + +inst_retired.any + [Fixed Counter: Counts the number of instructions retired. Unit: cpu_atom] +inst_retired.any + [Number of instructions retired. Fixed Counter - architectural event. Unit: cpu_core] + +The 'Unit: xxx' is added to brief description to indicate which pmu +the event is belong to. Same event name but with different pmu can +be supported. + +Enable hybrid event with a specific pmu + +To enable a core only event or atom only event, following syntax is supported: + + cpu_core/<event name>/ +or + cpu_atom/<event name>/ + +For example, count the 'cycles' event on core cpus. + + perf stat -e cpu_core/cycles/ + +Create two events for one hardware event automatically + +When creating one event and the event is available on both atom and core, +two events are created automatically. One is for atom, the other is for +core. Most of hardware events and cache events are available on both +cpu_core and cpu_atom. + +For hardware events, they have pre-defined configs (e.g. 0 for cycles). +But on hybrid platform, kernel needs to know where the event comes from +(from atom or from core). The original perf event type PERF_TYPE_HARDWARE +can't carry pmu information. So now this type is extended to be PMU aware +type. The PMU type ID is stored at attr.config[63:32]. + +PMU type ID is retrieved from sysfs. 
+/sys/bus/event_source/devices/cpu_atom/type +/sys/bus/event_source/devices/cpu_core/type + +The new attr.config layout for PERF_TYPE_HARDWARE: + +PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + AA: hardware event ID + EEEEEEEE: PMU type ID + +Cache event is similar. The type PERF_TYPE_HW_CACHE is extended to be +PMU aware type. The PMU type ID is stored at attr.config[63:32]. + +The new attr.config layout for PERF_TYPE_HW_CACHE: + +PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + BB: hardware cache ID + CC: hardware cache op ID + DD: hardware cache op result ID + EEEEEEEE: PMU type ID + +When enabling a hardware event without specified pmu, such as, +perf stat -e cycles -a (use system-wide in this example), two events +are created automatically. + + ------------------------------------------------------------ + perf_event_attr: + size 120 + config 0x400000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + exclude_guest 1 + ------------------------------------------------------------ + +and + + ------------------------------------------------------------ + perf_event_attr: + size 120 + config 0x800000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + exclude_guest 1 + ------------------------------------------------------------ + +type 0 is PERF_TYPE_HARDWARE. +0x4 in 0x400000000 indicates it's cpu_core pmu. +0x8 in 0x800000000 indicates it's cpu_atom pmu (atom pmu type id is random). + +The kernel creates 'cycles' (0x400000000) on cpu0-cpu15 (core cpus), +and create 'cycles' (0x800000000) on cpu16-cpu23 (atom cpus). + +For perf-stat result, it displays two events: + + Performance counter stats for 'system wide': + + 6,744,979 cpu_core/cycles/ + 1,965,552 cpu_atom/cycles/ + +The first 'cycles' is core event, the second 'cycles' is atom event. + +Thread mode example: + +perf-stat reports the scaled counts for hybrid event and with a percentage +displayed. The percentage is the event's running time/enabling time. + +One example, 'triad_loop' runs on cpu16 (atom core), while we can see the +scaled value for core cycles is 160,444,092 and the percentage is 0.47%. + +perf stat -e cycles \-- taskset -c 16 ./triad_loop + +As previous, two events are created. + +------------------------------------------------------------ +perf_event_attr: + size 120 + config 0x400000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + enable_on_exec 1 + exclude_guest 1 +------------------------------------------------------------ + +and + +------------------------------------------------------------ +perf_event_attr: + size 120 + config 0x800000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + enable_on_exec 1 + exclude_guest 1 +------------------------------------------------------------ + + Performance counter stats for 'taskset -c 16 ./triad_loop': + + 233,066,666 cpu_core/cycles/ (0.43%) + 604,097,080 cpu_atom/cycles/ (99.57%) + +perf-record: + +If there is no '-e' specified in perf record, on hybrid platform, +it creates two default 'cycles' and adds them to event list. One +is for core, the other is for atom. + +perf-stat: + +If there is no '-e' specified in perf stat, on hybrid platform, +besides of software events, following events are created and +added to event list in order. 
+ +cpu_core/cycles/, +cpu_atom/cycles/, +cpu_core/instructions/, +cpu_atom/instructions/, +cpu_core/branches/, +cpu_atom/branches/, +cpu_core/branch-misses/, +cpu_atom/branch-misses/ + +Of course, both perf-stat and perf-record support to enable +hybrid event with a specific pmu. + +e.g. +perf stat -e cpu_core/cycles/ +perf stat -e cpu_atom/cycles/ +perf stat -e cpu_core/r1a/ +perf stat -e cpu_atom/L1-icache-loads/ +perf stat -e cpu_core/cycles/,cpu_atom/instructions/ +perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}' + +But '{cpu_core/cycles/,cpu_atom/instructions/}' will return +warning and disable grouping, because the pmus in group are +not matched (cpu_core vs. cpu_atom). diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index e817179c5027..40476b227f8d 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -1,16 +1,20 @@ i synthesize instructions events - b synthesize branches events (branch misses for Arm SPE) + y synthesize cycles events + b synthesize branches events c synthesize branches events (calls only) r synthesize branches events (returns only) x synthesize transactions events w synthesize ptwrite events - p synthesize power events + p synthesize power events (incl. PSB events for Intel PT) o synthesize other events recorded due to the use of aux-output (refer to perf record) + I synthesize interrupt or similar (asynchronous) events + (e.g. Intel PT Event Trace) e synthesize error events d create a debug log f synthesize first level cache events m synthesize last level cache events + M synthesize memory events t synthesize TLB events a synthesize remote access events g synthesize a call chain (use with i or x) @@ -18,8 +22,12 @@ l synthesize last branch entries (use with i or x) L synthesize last branch entries on existing event records s skip initial number of events + q quicker (less detailed) decoding + A approximate IPC + Z prefer to ignore timestamps (so-called "timeless" decoding) + T use the timestamp trace as kernel time - The default is all events i.e. the same as --itrace=ibxwpe, + The default is all events i.e. the same as --itrace=iybxwpe, except for perf script where it is --itrace=ce In addition, the period (default 100000, except for perf script where it is 1) @@ -47,3 +55,18 @@ --itrace=i0nss1000000 skips the first million instructions. + + The 'e' option may be followed by flags which affect what errors will or + will not be reported. Each flag must be preceded by either '+' or '-'. + The flags are: + o overflow + l trace data lost + + If supported, the 'd' option may be followed by flags which affect what + debug messages will or will not be logged. Each flag must be preceded + by either '+' or '-'. The flags are: + a all perf events + e output only on errors (size configurable - see linkperf:perf-config[1]) + o output to stdout + + If supported, the 'q' option may be repeated to increase the effect. 
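	For example, combining the modifiers above (a sketch; which modifiers are
	accepted depends on the trace type and the perf version):

		--itrace=qqi    decode even more quickly, synthesizing instruction events only
		--itrace=e-o    synthesize error events, but do not report overflow errors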
diff --git a/tools/perf/Documentation/jitdump-specification.txt b/tools/perf/Documentation/jitdump-specification.txt index 52152d156ad9..79936355d819 100644 --- a/tools/perf/Documentation/jitdump-specification.txt +++ b/tools/perf/Documentation/jitdump-specification.txt @@ -164,7 +164,7 @@ const char unwinding_data[n]: an array of unwinding data, consisting of the EH F The EH Frame header follows the Linux Standard Base (LSB) specification as described in the document at https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html -The EH Frame follows the LSB specicfication as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html +The EH Frame follows the LSB specification as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html NOTE: The mapped_size is generally either the same as unwind_data_size (if the unwinding data was mapped in memory by the running process) or zero (if the unwinding data is not mapped by the process). If the unwinding data was not mapped, then only the EH Frame Header will be read, which can be used to specify FP based unwinding for a function which does not have unwinding information. diff --git a/tools/perf/Documentation/perf-amd-ibs.txt b/tools/perf/Documentation/perf-amd-ibs.txt new file mode 100644 index 000000000000..548549935760 --- /dev/null +++ b/tools/perf/Documentation/perf-amd-ibs.txt @@ -0,0 +1,223 @@ +perf-amd-ibs(1) +=============== + +NAME +---- +perf-amd-ibs - Support for AMD Instruction-Based Sampling (IBS) with perf tool + +SYNOPSIS +-------- +[verse] +'perf record' -e ibs_op// +'perf record' -e ibs_fetch// + +DESCRIPTION +----------- + +Instruction-Based Sampling (IBS) provides precise Instruction Pointer (IP) +profiling support on AMD platforms. IBS has two independent components: IBS +Op and IBS Fetch. IBS Op sampling provides information about instruction +execution (micro-op execution to be precise) with details like d-cache +hit/miss, d-TLB hit/miss, cache miss latency, load/store data source, branch +behavior etc. IBS Fetch sampling provides information about instruction fetch +with details like i-cache hit/miss, i-TLB hit/miss, fetch latency etc. IBS is +per-smt-thread i.e. each SMT hardware thread contains standalone IBS units. + +Both, IBS Op and IBS Fetch, are exposed as PMUs by Linux and can be exploited +using the Linux perf utility. The following files will be created at boot time +if IBS is supported by the hardware and kernel. + + /sys/bus/event_source/devices/ibs_op/ + /sys/bus/event_source/devices/ibs_fetch/ + +IBS Op PMU supports two events: cycles and micro ops. IBS Fetch PMU supports +one event: fetch ops. + +IBS PMUs do not have user/kernel filtering capability and thus it requires +CAP_SYS_ADMIN or CAP_PERFMON privilege. + +IBS VS. REGULAR CORE PMU +------------------------ + +IBS gives samples with precise IP, i.e. the IP recorded with IBS sample has +no skid. Whereas the IP recorded by regular core PMU will have some skid +(sample was generated at IP X but perf would record it at IP X+n). Hence, +regular core PMU might not help for profiling with instruction level +precision. Further, IBS provides additional information about the sample in +question. On the other hand, regular core PMU has it's own advantages like +plethora of events, counting mode (less interference), up to 6 parallel +counters, event grouping support, filtering capabilities etc. 
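In practice the same workload can be profiled either way; a minimal sketch
(sampling periods are arbitrary):

 # perf record -e cycles -c 100000 -a      (regular core PMU; recorded IP may skid)
 # perf record -e ibs_op// -c 100000 -a    (IBS Op; precise IP, extra sample details)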
+ +Three regular core PMU events are internally forwarded to IBS Op PMU when +precise_ip attribute is set: + + -e cpu-cycles:p becomes -e ibs_op// + -e r076:p becomes -e ibs_op// + -e r0C1:p becomes -e ibs_op/cnt_ctl=1/ + +EXAMPLES +-------- + +IBS Op PMU +~~~~~~~~~~ + +System-wide profile, cycles event, sampling period: 100000 + + # perf record -e ibs_op// -c 100000 -a + +Per-cpu profile (cpu10), cycles event, sampling period: 100000 + + # perf record -e ibs_op// -c 100000 -C 10 + +Per-cpu profile (cpu10), cycles event, sampling freq: 1000 + + # perf record -e ibs_op// -F 1000 -C 10 + +System-wide profile, uOps event, sampling period: 100000 + + # perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a + +Same command, but also capture IBS register raw dump along with perf sample: + + # perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a --raw-samples + +System-wide profile, uOps event, sampling period: 100000, L3MissOnly (Zen4 onward) + + # perf record -e ibs_op/cnt_ctl=1,l3missonly=1/ -c 100000 -a + +System-wide profile, cycles event, sampling period: 100000, LdLat filtering (Zen5 +onward) + + # perf record -e ibs_op/ldlat=128/ -c 100000 -a + + Supported load latency threshold values are 128 to 2048 (both inclusive). + Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + +Per process(upstream v6.2 onward), uOps event, sampling period: 100000 + + # perf record -e ibs_op/cnt_ctl=1/ -c 100000 -p 1234 + +Per process(upstream v6.2 onward), uOps event, sampling period: 100000 + + # perf record -e ibs_op/cnt_ctl=1/ -c 100000 -- ls + +To analyse recorded profile in aggregate mode + + # perf report + /* Select a line and press 'a' to drill down at instruction level. */ + +To go over each sample + + # perf script + +Raw dump of IBS registers when profiled with --raw-samples + + # perf report -D + /* Look for PERF_RECORD_SAMPLE */ + + Example register raw dump: + + ibs_op_ctl: 000002c30006186a MaxCnt 100000 L3MissOnly 0 En 1 + Val 1 CntCtl 0=cycles CurCnt 707 + IbsOpRip: ffffffff8204aea7 + ibs_op_data: 0000010002550001 CompToRetCtr 1 TagToRetCtr 597 + BrnRet 0 RipInvalid 0 BrnFuse 0 Microcode 1 + ibs_op_data2: 0000000000000013 RmtNode 1 DataSrc 3=DRAM + ibs_op_data3: 0000000031960092 LdOp 0 StOp 1 DcL1TlbMiss 0 + DcL2TlbMiss 0 DcL1TlbHit2M 1 DcL1TlbHit1G 0 DcL2TlbHit2M 0 + DcMiss 1 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0 + DcMissNoMabAlloc 0 DcLinAddrValid 1 DcPhyAddrValid 1 + DcL2TlbHit1G 0 L2Miss 1 SwPf 0 OpMemWidth 32 bytes + OpDcMissOpenMemReqs 12 DcMissLat 0 TlbRefillLat 0 + IbsDCLinAd: ff110008a5398920 + IbsDCPhysAd: 00000008a5398920 + +IBS applied in a real world usecase + + ~90% regression was observed in tbench with specific scheduler hint + which was counter intuitive. IBS profile of good and bad run captured + using perf helped in identifying exact cause of the problem: + + https://lore.kernel.org/r/20220921063638.2489-1-kprateek.nayak@amd.com + +IBS Fetch PMU +~~~~~~~~~~~~~ + +Similar commands can be used with Fetch PMU as well. + +System-wide profile, fetch ops event, sampling period: 100000 + + # perf record -e ibs_fetch// -c 100000 -a + +System-wide profile, fetch ops event, sampling period: 100000, Random enable + + # perf record -e ibs_fetch/rand_en=1/ -c 100000 -a + + Random enable adds small degree of variability to sample period. This + helps in cases like long running loops where PMU is tagging the same + instruction over and over because of fixed sample period. + +etc. 
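Whether the IBS PMUs (and optional capabilities such as ldlat) are present on a
given system can be confirmed from the sysfs nodes listed earlier, for example:

 # ls /sys/bus/event_source/devices/ | grep ibs          (ibs_op and/or ibs_fetch)
 # cat /sys/bus/event_source/devices/ibs_op/caps/ldlat   (contains '1' if ldlat filtering is supported)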
+ +PERF MEM AND PERF C2C +--------------------- + +perf mem is a memory access profiler tool and perf c2c is a shared data +cacheline analyser tool. Both of them internally uses IBS Op PMU on AMD. +Below is a simple example of the perf mem tool. + + # perf mem record -c 100000 -- make + # perf mem report + +A normal perf mem report output will provide detailed memory access profile. +New output fields will show related access info together. For example: + + # perf mem report -F overhead,cache,snoop,comm + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # + # ---------- Cache ----------- --- Snoop ---- + # Overhead L1 L2 L1-buf Other HitM Other Command + # ........ ............................ .............. .......... + # + 76.07% 5.8% 35.7% 0.0% 34.6% 23.3% 52.8% cc1 + 5.79% 0.2% 0.0% 0.0% 5.6% 0.1% 5.7% make + 5.78% 0.1% 4.4% 0.0% 1.2% 0.5% 5.3% gcc + 5.33% 0.3% 3.9% 0.0% 1.1% 0.2% 5.2% as + 5.00% 0.1% 3.8% 0.0% 1.0% 0.3% 4.7% sh + 1.56% 0.1% 0.1% 0.0% 1.4% 0.6% 0.9% ld + 0.28% 0.1% 0.0% 0.0% 0.2% 0.1% 0.2% pkg-config + 0.09% 0.0% 0.0% 0.0% 0.1% 0.0% 0.1% git + 0.03% 0.0% 0.0% 0.0% 0.0% 0.0% 0.0% rm + ... + +Also, it can be aggregated based on various memory access info using the +sort keys. For example: + + # perf mem report -s mem,snoop + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # Sort order : mem,snoop + # + # Overhead Samples Memory access Snoop + # ........ ............ ....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A + +Please refer to their man page for more detail. + +SEE ALSO +-------- + +linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1], +linkperf:perf-mem[1], linkperf:perf-c2c[1] diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 1b5042f134a8..46090c5b42b4 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -41,7 +41,7 @@ OPTIONS -q:: --quiet:: - Do not show any message. (Suppress -v) + Do not show any warnings or messages. (Suppress -v) -n:: --show-nr-samples:: @@ -58,6 +58,13 @@ OPTIONS --ignore-vmlinux:: Ignore vmlinux files. +--itrace:: + Options for decoding instruction tracing data. The options are: + +include::itrace.txt[] + + To disable decoding entirely, use --no-itrace. + -m:: --modules:: Load module symbols. WARNING: use only with -k and LIVE kernel. @@ -109,6 +116,9 @@ OPTIONS -M:: --disassembler-style=:: Set disassembler style for objdump. +--addr2line=<path>:: + Path to addr2line binary. + --objdump=<path>:: Path to objdump binary. @@ -124,6 +134,13 @@ OPTIONS --group:: Show event group information together +--demangle:: + Demangle symbol names to human readable form. It's enabled by default, + disable with --no-demangle. + +--demangle-kernel:: + Demangle kernel symbol names to human readable form (for C++ kernels). + --percent-type:: Set annotation percent type from following choices: global-period, local-period, global-hits, local-hits @@ -133,6 +150,29 @@ OPTIONS The period/hits keywords set the base the percentage is computed on - the samples period or the number of samples (hits). +--percent-limit:: + Do not show functions which have an overhead under that percent on + stdio or stdio2 (Default: 0). 
Note that this is about selection of + functions to display, not about lines within the function. + +--data-type[=TYPE_NAME]:: + Display data type annotation instead of code. It infers data type of + samples (if they are memory accessing instructions) using DWARF debug + information. It can take an optional argument of data type name. In + that case it'd show annotation for the type only, otherwise it'd show + all data types it finds. + +--type-stat:: + Show stats for the data type annotation. + +--skip-empty:: + Do not display empty (or dummy) events. + +--code-with-type:: + Show data type info in code annotation (for memory instructions only). + Currently it only works with --stdio option. + + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt new file mode 100644 index 000000000000..37afade4f1b2 --- /dev/null +++ b/tools/perf/Documentation/perf-arm-spe.txt @@ -0,0 +1,254 @@ +perf-arm-spe(1) +================ + +NAME +---- +perf-arm-spe - Support for Arm Statistical Profiling Extension within Perf tools + +SYNOPSIS +-------- +[verse] +'perf record' -e arm_spe// + +DESCRIPTION +----------- + +The SPE (Statistical Profiling Extension) feature provides accurate attribution of latencies and + events down to individual instructions. Rather than being interrupt-driven, it picks an +instruction to sample and then captures data for it during execution. Data includes execution time +in cycles. For loads and stores it also includes data address, cache miss events, and data origin. + +The sampling has 5 stages: + + 1. Choose an operation + 2. Collect data about the operation + 3. Optionally discard the record based on a filter + 4. Write the record to memory + 5. Interrupt when the buffer is full + +Choose an operation +~~~~~~~~~~~~~~~~~~~ + +This is chosen from a sample population, for SPE this is an IMPLEMENTATION DEFINED choice of all +architectural instructions or all micro-ops. Sampling happens at a programmable interval. The +architecture provides a mechanism for the SPE driver to infer the minimum interval at which it should +sample. This minimum interval is used by the driver if no interval is specified. A pseudo-random +perturbation is also added to the sampling interval by default. + +Collect data about the operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Program counter, PMU events, timings and data addresses related to the operation are recorded. +Sampling ensures there is only one sampled operation is in flight. + +Optionally discard the record based on a filter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Based on programmable criteria, choose whether to keep the record or discard it. If the record is +discarded then the flow stops here for this sample. + +Write the record to memory +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The record is appended to a memory buffer + +Interrupt when the buffer is full +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When the buffer fills, an interrupt is sent and the driver signals Perf to collect the records. +Perf saves the raw data in the perf.data file. + +Opening the file +---------------- + +Up until this point no decoding of the SPE data was done by either the kernel or Perf. Only when the +recorded file is opened with 'perf report' or 'perf script' does the decoding happen. When decoding +the data, Perf generates "synthetic samples" as if these were generated at the time of the +recording. 
These samples are the same as if normal sampling was done by Perf without using SPE, +although they may have more attributes associated with them. For example a normal sample may have +just the instruction pointer, but an SPE sample can have data addresses and latency attributes. + +Why Sampling? +------------- + + - Sampling, rather than tracing, cuts down the profiling problem to something more manageable for + hardware. Only one sampled operation is in flight at a time. + + - Allows precise attribution data, including: Full PC of instruction, data virtual and physical + addresses. + + - Allows correlation between an instruction and events, such as TLB and cache miss. (Data source + indicates which particular cache was hit, but the meaning is implementation defined because + different implementations can have different cache configurations.) + +However, SPE does not provide any call-graph information, and relies on statistical methods. + +Collisions +---------- + +When an operation is sampled while a previous sampled operation has not finished, a collision +occurs. The new sample is dropped. Collisions affect the integrity of the data, so the sample rate +should be set to avoid collisions. + +The 'sample_collision' PMU event can be used to determine the number of lost samples. Although this +count is based on collisions _before_ filtering occurs. Therefore this can not be used as an exact +number for samples dropped that would have made it through the filter, but can be a rough +guide. + +The effect of microarchitectural sampling +----------------------------------------- + +If an implementation samples micro-operations instead of instructions, the results of sampling must +be weighted accordingly. + +For example, if a given instruction A is always converted into two micro-operations, A0 and A1, it +becomes twice as likely to appear in the sample population. + +The coarse effect of conversions, and, if applicable, sampling of speculative operations, can be +estimated from the 'sample_pop' and 'inst_retired' PMU events. + +Kernel Requirements +------------------- + +The ARM_SPE_PMU config must be set to build as either a module or statically. + +Depending on CPU model, the kernel may need to be booted with page table isolation disabled +(kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer +inaccessible. Try passing 'kpti=off' on the kernel command line". + +For the full criteria that determine whether KPTI needs to be forced off or not, see function +unmap_kernel_at_el0() in the kernel sources. Common cases where it's not required +are on the CPUs in kpti_safe_list, or on Arm v8.5+ where FEAT_E0PD is mandatory. + +The SPE interrupt must also be described by the firmware. If the module is loaded and KPTI is +disabled (or isn't required to be disabled) but the SPE PMU still doesn't show in +/sys/bus/event_source/devices/, then it's possible that the SPE interrupt isn't described by +ACPI or DT. In this case no warning will be printed by the driver. + +Capturing SPE with perf command-line tools +------------------------------------------ + +You can record a session with SPE samples: + + perf record -e arm_spe// -- ./mybench + +The sample period is set from the -c option, and because the minimum interval is used by default +it's recommended to set this to a higher value. The value is written to PMSIRR.INTERVAL. + +Config parameters +~~~~~~~~~~~~~~~~~ + +These are placed between the // in the event and comma separated. 
For example '-e +arm_spe/load_filter=1,min_latency=10/' + + branch_filter=1 - collect branches only (PMSFCR.B) + event_filter=<mask> - filter on specific events (PMSEVFR) - see bitfield description below + jitter=1 - use jitter to avoid resonance when sampling (PMSIRR.RND) + load_filter=1 - collect loads only (PMSFCR.LD) + min_latency=<n> - collect only samples with this latency or higher* (PMSLATFR) + pa_enable=1 - collect physical address (as well as VA) of loads/stores (PMSCR.PA) - requires privilege + pct_enable=1 - collect physical timestamp instead of virtual timestamp (PMSCR.PCT) - requires privilege + store_filter=1 - collect stores only (PMSFCR.ST) + ts_enable=1 - enable timestamping with value of generic timer (PMSCR.TS) + discard=1 - enable SPE PMU events but don't collect sample data - see 'Discard mode' (PMBLIMITR.FM = DISCARD) + ++++*+++ Latency is the total latency from the point at which sampling started on that instruction, rather +than only the execution latency. + +Only some events can be filtered on; these include: + + bit 1 - instruction retired (i.e. omit speculative instructions) + bit 3 - L1D refill + bit 5 - TLB refill + bit 7 - mispredict + bit 11 - misaligned access + +So to sample just retired instructions: + + perf record -e arm_spe/event_filter=2/ -- ./mybench + +or just mispredicted branches: + + perf record -e arm_spe/event_filter=0x80/ -- ./mybench + +Viewing the data +~~~~~~~~~~~~~~~~~ + +By default perf report and perf script will assign samples to separate groups depending on the +attributes/events of the SPE record. Because instructions can have multiple events associated with +them, the samples in these groups are not necessarily unique. For example perf report shows these +groups: + + Available samples + 0 arm_spe// + 0 dummy:u + 21 l1d-miss + 897 l1d-access + 5 llc-miss + 7 llc-access + 2 tlb-miss + 1K tlb-access + 36 branch + 0 remote-access + 900 memory + +The arm_spe// and dummy:u events are implementation details and are expected to be empty. + +To get a full list of unique samples that are not sorted into groups, set the itrace option to +generate 'instruction' samples. The period option is also taken into account, so set it to 1 +instruction unless you want to further downsample the already sampled SPE data: + + perf report --itrace=i1i + +Memory access details are also stored on the samples and this can be viewed with: + + perf report --mem-mode + +Common errors +~~~~~~~~~~~~~ + + - "Cannot find PMU `arm_spe'. Missing kernel support?" + + Module not built or loaded, KPTI not disabled, interrupt not described by firmware, + or running on a VM. See 'Kernel Requirements' above. + + - "Arm SPE CONTEXT packets not found in the traces." + + Root privilege is required to collect context packets. But these only increase the accuracy of + assigning PIDs to kernel samples. For userspace sampling this can be ignored. + + - Excessively large perf.data file size + + Increase sampling interval (see above) + +PMU events +~~~~~~~~~~ + +SPE has events that can be counted on core PMUs. These are prefixed with +SAMPLE_, for example SAMPLE_POP, SAMPLE_FEED, SAMPLE_COLLISION and +SAMPLE_FEED_BR. + +These events will only count when an SPE event is running on the same core that +the PMU event is opened on, otherwise they read as 0. There are various ways to +ensure that the PMU event and SPE event are scheduled together depending on the +way the event is opened. 
For example opening both events as per-process events +on the same process, although it's not guaranteed that the PMU event is enabled +first when context switching. For that reason it may be better to open the PMU +event as a systemwide event and then open SPE on the process of interest. + +Discard mode +~~~~~~~~~~~~ + +SPE related (SAMPLE_* etc) core PMU events can be used without the overhead of +collecting sample data if discard mode is supported (optional from Armv8.6). +First run a system wide SPE session (or on the core of interest) using options +to minimize output. Then run perf stat: + + perf record -e arm_spe/discard/ -a -N -B --no-bpf-event -o - > /dev/null & + perf stat -e SAMPLE_FEED_LD + +SEE ALSO +-------- + +linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1], +linkperf:perf-inject[1] diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index bad16512c48d..8331bd28b10e 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -18,7 +18,7 @@ COMMON OPTIONS -------------- -r:: --repeat=:: -Specify amount of times to repeat the run (default 10). +Specify number of times to repeat the run (default 10). -f:: --format=:: @@ -49,6 +49,9 @@ SUBSYSTEM 'sched':: Scheduler and IPC mechanisms. +'syscall':: + System call performance (throughput). + 'mem':: Memory access performance. @@ -64,6 +67,9 @@ SUBSYSTEM 'internals':: Benchmark internal perf functionality. +'uprobe':: + Benchmark overhead of uprobe + BPF. + 'all':: All benchmark subsystems. @@ -118,6 +124,14 @@ Options of *pipe* --loop=:: Specify number of loops. +-G:: +--cgroups=:: +Names of cgroups for sender and receiver, separated by a comma. +This is useful to check cgroup context switching overhead. +Note that perf doesn't create nor delete the cgroups, so users should +make sure that the cgroups exist and are accessible before use. + + Example of *pipe* ^^^^^^^^^^^^^^^^^ @@ -135,8 +149,27 @@ Example of *pipe* Total time:0.016 sec 16.948000 usecs/op 59004 ops/sec + +% perf bench sched pipe -G AAA,BBB +(executing 1000000 pipe operations between cgroups) +# Running 'sched/pipe' benchmark: +# Executed 1000000 pipe operations between two processes + + Total time: 6.886 [sec] + + 6.886208 usecs/op + 145217 ops/sec + --------------------- +SUITES FOR 'syscall' +~~~~~~~~~~~~~~~~~~ +*basic*:: +Suite for evaluating performance of core system call throughput (both usecs/op and ops/sec metrics). +This uses a single thread simply doing getppid(2), which is a simple syscall where the result is not +cached by glibc. + + SUITES FOR 'mem' ~~~~~~~~~~~~~~~~ *memcpy*:: diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt index f6de0952ff3c..7e44b419d301 100644 --- a/tools/perf/Documentation/perf-buildid-cache.txt +++ b/tools/perf/Documentation/perf-buildid-cache.txt @@ -57,7 +57,7 @@ OPTIONS -u:: --update=:: Update specified file of the cache. Note that this doesn't remove - older entires since those may be still needed for annotating old + older entries since those may be still needed for annotating old (or remote) perf.data. Only if there is already a cache which has exactly same build-id, that is replaced by new one. It can be used to update kallsyms and kernel dso to vmlinux in order to support @@ -74,6 +74,15 @@ OPTIONS used when creating a uprobe for a process that resides in a different mount namespace from the perf(1) utility. 
+--debuginfod[=URLs]:: + Specify debuginfod URL to be used when retrieving perf.data binaries, + it follows the same syntax as the DEBUGINFOD_URLS variable, like: + + buildid-cache.debuginfod=http://192.168.122.174:8002 + + If the URLs is not specified, the value of DEBUGINFOD_URLS + system environment variable is used. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1] diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt index 25c52efcc7f0..e1e8fdbe06b9 100644 --- a/tools/perf/Documentation/perf-buildid-list.txt +++ b/tools/perf/Documentation/perf-buildid-list.txt @@ -33,6 +33,10 @@ OPTIONS -k:: --kernel:: Show running kernel build id. +-m:: +--kernel-maps:: + Show buildid, start/end text address, and path of running kernel and + its modules. -v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index 98efdab5fbd4..f4af2dd6ab31 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -9,7 +9,7 @@ SYNOPSIS -------- [verse] 'perf c2c record' [<options>] <command> -'perf c2c record' [<options>] -- [<record command options>] <command> +'perf c2c record' [<options>] \-- [<record command options>] <command> 'perf c2c report' [<options>] DESCRIPTION @@ -19,9 +19,14 @@ C2C stands for Cache To Cache. The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows you to track down the cacheline contentions. -On x86, the tool is based on load latency and precise store facility events +On Intel, the tool is based on load latency and precise store facility events provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling -with thresholding feature. +with thresholding feature. On AMD, the tool uses IBS op pmu (due to hardware +limitations, perf c2c is not supported on Zen3 cpus). On Arm64 it uses SPE to +sample load and store operations, therefore hardware and kernel support is +required. See linkperf:perf-arm-spe[1] for a setup guide. Due to the +statistical nature of Arm SPE sampling, not every memory operation will be +sampled. These events provide: - memory address of the access @@ -49,7 +54,15 @@ RECORD OPTIONS -l:: --ldlat:: - Configure mem-loads latency. (x86 only) + Configure mem-loads latency. Supported on Intel, Arm64 and some AMD + processors. Ignored on other archs. + + On supported AMD processors: + - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'. + - Supported latency values are 128 to 2048 (both inclusive). + - Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + - Load latency filtering is disabled by default. -k:: --all-kernel:: @@ -109,7 +122,9 @@ REPORT OPTIONS -d:: --display:: - Switch to HITM type (rmt, lcl) to display and sort on. Total HITMs as default. + Switch to HITM type (rmt, lcl) or peer snooping type (peer) to display + and sort on. Total HITMs (tot) as default, except Arm64 uses peer mode + as default. --stitch-lbr:: Show callgraph with stitched LBRs, which may have more complete @@ -117,11 +132,17 @@ REPORT OPTIONS perf c2c record --call-graph lbr. Disabled by default. In common cases with call stack overflows, it can recreate better call stacks than the default lbr call stack - output. But this approach is not full proof. There can be cases + output. But this approach is not foolproof. 
There can be cases where it creates incorrect call stacks from incorrect matches. The known limitations include exception handing such as setjmp/longjmp will have calls/returns not match. +--double-cl:: + Group the detection of shared cacheline events into double cacheline + granularity. Some architectures have an Adjacent Cacheline Prefetch + feature, which causes cacheline sharing to behave like the cacheline + size is doubled. + C2C RECORD ---------- The perf c2c record command setup options related to HITM cacheline analysis @@ -133,11 +154,15 @@ Following perf record options are configured by default: -W,-d,--phys-data,--sample-cpu Unless specified otherwise with '-e' option, following events are monitored by -default on x86: +default on Intel: cpu/mem-loads,ldlat=30/P cpu/mem-stores/P +following on AMD: + + ibs_op// + and following on PowerPC: cpu/mem-loads/ @@ -174,42 +199,57 @@ For each cacheline in the 1) list we display following data: Cacheline - cacheline address (hex number) - Total records - - sum of all cachelines accesses - - Rmt/Lcl Hitm + Rmt/Lcl Hitm (Display with HITM types) - cacheline percentage of all Remote/Local HITM accesses - LLC Load Hitm - Total, Lcl, Rmt - - count of Total/Local/Remote load HITMs + Peer Snoop (Display with peer type) + - cacheline percentage of all peer accesses - Store Reference - Total, L1Hit, L1Miss - Total - all store accesses - L1Hit - store accesses that hit L1 - L1Hit - store accesses that missed L1 + LLC Load Hitm - Total, LclHitm, RmtHitm (For display with HITM types) + - count of Total/Local/Remote load HITMs - Load Dram - - count of local and remote DRAM accesses + Load Peer - Total, Local, Remote (For display with peer type) + - count of Total/Local/Remote load from peer cache or DRAM - LLC Ld Miss - - count of all accesses that missed LLC + Total records + - sum of all cachelines accesses - Total Loads + Total loads - sum of all load accesses + Total stores + - sum of all store accesses + + Store Reference - L1Hit, L1Miss, N/A + L1Hit - store accesses that hit L1 + L1Miss - store accesses that missed L1 + N/A - store accesses with memory level is not available + Core Load Hit - FB, L1, L2 - count of load hits in FB (Fill Buffer), L1 and L2 cache - LLC Load Hit - Llc, Rmt - - count of LLC and Remote load hits + LLC Load Hit - LlcHit, LclHitm + - count of LLC load accesses, includes LLC hits and LLC HITMs + + RMT Load Hit - RmtHit, RmtHitm + - count of remote load accesses, includes remote hits and remote HITMs; + on Arm neoverse cores, RmtHit is used to account remote accesses, + includes remote DRAM or any upward cache level in remote node + + Load Dram - Lcl, Rmt + - count of local and remote DRAM accesses For each offset in the 2) list we display following data: - HITM - Rmt, Lcl + HITM - Rmt, Lcl (Display with HITM types) - % of Remote/Local HITM accesses for given offset within cacheline - Store Refs - L1 Hit, L1 Miss - - % of store accesses that hit/missed L1 for given offset within cacheline + Peer Snoop - Rmt, Lcl (Display with peer type) + - % of Remote/Local peer accesses for given offset within cacheline + + Store Refs - L1 Hit, L1 Miss, N/A + - % of store accesses that hit L1, missed L1 and N/A (no available) memory + level for given offset within cacheline Data address - Offset - offset address @@ -223,9 +263,12 @@ For each offset in the 2) list we display following data: Code address - code address responsible for the accesses - cycles - rmt hitm, lcl hitm, load + cycles - rmt hitm, lcl hitm, load (Display with HITM 
types) - sum of cycles for given accesses - Remote/Local HITM and generic load + cycles - rmt peer, lcl peer, load (Display with peer type) + - sum of cycles for given accesses - Remote/Local peer load and generic load + cpu cnt - number of cpus that participated on the access @@ -247,7 +290,8 @@ The 'Node' field displays nodes that accesses given cacheline offset. Its output comes in 3 flavors: - node IDs separated by ',' - node IDs with stats for each ID, in following format: - Node{cpus %hitms %stores} + Node{cpus %hitms %stores} (Display with HITM types) + Node{cpus %peers %stores} (Display with peer type) - node IDs with list of affected CPUs in following format: Node{cpu list} @@ -259,7 +303,7 @@ COALESCE User can specify how to sort offsets for cacheline. Following fields are available and governs the final -output fields set for caheline offsets output: +output fields set for cacheline offsets output: tid - coalesced by process TIDs pid - coalesced by process PIDs @@ -306,4 +350,4 @@ Check Joe's blog on c2c tool for detailed use case explanation: SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-mem[1] +linkperf:perf-record[1], linkperf:perf-mem[1], linkperf:perf-arm-spe[1] diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt new file mode 100644 index 000000000000..a764a4629220 --- /dev/null +++ b/tools/perf/Documentation/perf-check.txt @@ -0,0 +1,80 @@ +perf-check(1) +=============== + +NAME +---- +perf-check - check if features are present in perf + +SYNOPSIS +-------- +[verse] +'perf check' [<options>] +'perf check' {feature <feature_list>} [<options>] + +DESCRIPTION +----------- +With no subcommands given, 'perf check' command just prints the command +usage on the standard output. + +If the subcommand 'feature' is used, then status of feature is printed +on the standard output (unless '-q' is also passed), ie. whether it is +compiled-in/built-in or not. +Also, 'perf check feature' returns with exit status 0 if the feature +is built-in, otherwise returns with exit status 1. + +SUBCOMMANDS +----------- + +feature:: + + Print whether feature(s) is compiled-in or not, and also returns with an + exit status of 0, if passed feature(s) are compiled-in, else 1. + + It expects a feature list as an argument. There can be a single feature + name/macro, or multiple features can also be passed as a comma-separated + list, in which case the exit status will be 0 only if all of the passed + features are compiled-in. + + The feature names/macros are case-insensitive. 
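Because the exit status reflects whether every listed feature is built in, a
test script can branch on it directly. A minimal sketch (the feature names
below are only illustrative):

    if perf check feature --quiet libtraceevent,bpf; then
            echo "required features are built in"
    else
            echo "required features missing, skipping" >&2
            exit 2
    fi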
+ + Example Usage: + perf check feature libtraceevent + perf check feature HAVE_LIBTRACEEVENT + perf check feature libtraceevent,bpf + + Supported feature names/macro: + aio / HAVE_AIO_SUPPORT + bpf / HAVE_LIBBPF_SUPPORT + bpf_skeletons / HAVE_BPF_SKEL + debuginfod / HAVE_DEBUGINFOD_SUPPORT + dwarf / HAVE_LIBDW_SUPPORT + dwarf_getlocations / HAVE_LIBDW_SUPPORT + dwarf-unwind / HAVE_DWARF_UNWIND_SUPPORT + auxtrace / HAVE_AUXTRACE_SUPPORT + libbfd / HAVE_LIBBFD_SUPPORT + libcapstone / HAVE_LIBCAPSTONE_SUPPORT + libcrypto / HAVE_LIBCRYPTO_SUPPORT + libdw-dwarf-unwind / HAVE_LIBDW_SUPPORT + libelf / HAVE_LIBELF_SUPPORT + libnuma / HAVE_LIBNUMA_SUPPORT + libopencsd / HAVE_CSTRACE_SUPPORT + libperl / HAVE_LIBPERL_SUPPORT + libpfm4 / HAVE_LIBPFM + libpython / HAVE_LIBPYTHON_SUPPORT + libslang / HAVE_SLANG_SUPPORT + libtraceevent / HAVE_LIBTRACEEVENT + libunwind / HAVE_LIBUNWIND_SUPPORT + lzma / HAVE_LZMA_SUPPORT + numa_num_possible_cpus / HAVE_LIBNUMA_SUPPORT + zlib / HAVE_ZLIB_SUPPORT + zstd / HAVE_ZSTD_SUPPORT + +OPTIONS +------- +-q:: +--quiet:: + Do not print any messages or warnings + + This can be used along with subcommands such as 'perf check feature' + to hide unnecessary output in test scripts, eg. + 'perf check feature --quiet libtraceevent' diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index c7d3df5798e2..c6f335659667 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -40,7 +40,7 @@ The '$HOME/.perfconfig' file is used to store a per-user configuration. The file '$(sysconfdir)/perfconfig' can be used to store a system-wide default configuration. -One an disable reading config files by setting the PERF_CONFIG environment +One can disable reading config files by setting the PERF_CONFIG environment variable to /dev/null, or provide an alternate config file by setting that variable. @@ -123,10 +123,8 @@ Given a $HOME/.perfconfig like this: queue-size = 0 children = true group = true + skip-empty = true - [llvm] - dump-obj = true - clang-opt = -g You can hide source code of annotate feature setting the config to false with @@ -138,7 +136,7 @@ If you want to add or modify several config items, you can do like To modify the sort order of report functionality in user config file(i.e. `~/.perfconfig`), do - % perf config --user report sort-order=srcline + % perf config --user report.sort-order=srcline To change colors of selected line to other foreground and background colors in system config file (i.e. `$(sysconf)/perfconfig`), do @@ -238,10 +236,42 @@ buildid.*:: cache location, or to disable it altogether. If you want to disable it, set buildid.dir to /dev/null. The default is $HOME/.debug +buildid-cache.*:: + buildid-cache.debuginfod=URLs + Specify debuginfod URLs to be used when retrieving perf.data binaries, + it follows the same syntax as the DEBUGINFOD_URLS variable, like: + + buildid-cache.debuginfod=http://192.168.122.174:8002 + annotate.*:: These are in control of addresses, jump function, source code in lines of assembly code from a specific program. + annotate.disassemblers:: + Choose the disassembler to use: "objdump", "llvm", "capstone", + if not specified it will first try, if available, the "llvm" one, + then, if it fails, "capstone", and finally the original "objdump" + based one. + + Choosing a different one is useful when handling some feature that + is known to be best support at some point by one of the options, + to compare the output when in doubt about some bug, etc. 
+ + This can be a list, in order of preference, the first one that works + finishes the process. + + annotate.addr2line:: + addr2line binary to use for file names and line numbers. + + annotate.objdump:: + objdump binary to use for disassembly and annotations, + including in the 'perf test' command. + + annotate.disassembler_style:: + Use this to change the default disassembler style to some other value + supported by binutils, such as "intel", see the '-M' option help in the + 'objdump' man page. + annotate.hide_src_code:: If a program which is analyzed has source code, this option lets 'annotate' print a list of assembly code with the source code. @@ -381,6 +411,12 @@ annotate.*:: This option works with tui, stdio2 browsers. + annotate.demangle:: + Demangle symbol names to human readable form. Default is 'true'. + + annotate.demangle_kernel:: + Demangle kernel symbol names to human readable form. Default is 'true'. + hist.*:: hist.percentage:: This option control the way to calculate overhead of filtered entries - @@ -513,6 +549,10 @@ report.*:: 0.07% 0.00% noploop ld-2.15.so [.] strcmp 0.03% 0.00% noploop [kernel.kallsyms] [k] timerqueue_del + report.skip-empty:: + This option can change default stat behavior with empty results. + If it's set true, 'perf report --stat' will not show 0 stats. + top.*:: top.children:: Same as 'report.children'. So if it is enabled, the output of 'top' @@ -547,11 +587,12 @@ kmem.*:: record.*:: record.build-id:: - This option can be 'cache', 'no-cache' or 'skip'. + This option can be 'cache', 'no-cache', 'skip' or 'mmap'. 'cache' is to post-process data and save/update the binaries into the build-id cache (in ~/.debug). This is the default. But if this option is 'no-cache', it will not update the build-id cache. 'skip' skips post-processing and does not update the cache. + 'mmap' skips post-processing and reads build-ids from MMAP events. record.call-graph:: This is identical to 'call-graph.record-mode', except it is @@ -563,6 +604,15 @@ record.*:: Use 'n' control blocks in asynchronous (Posix AIO) trace writing mode ('n' default: 1, max: 4). + record.debuginfod:: + Specify debuginfod URL to be used when cacheing perf.data binaries, + it follows the same syntax as the DEBUGINFOD_URLS variable, like: + + http://192.168.122.174:8002 + + If the URLs is 'system', the value of DEBUGINFOD_URLS system environment + variable is used. + diff.*:: diff.order:: This option sets the number of columns to sort the result. @@ -614,38 +664,9 @@ trace.*:: ftrace.*:: ftrace.tracer:: - Can be used to select the default tracer. Possible values are - 'function' and 'function_graph'. - -llvm.*:: - llvm.clang-path:: - Path to clang. If omit, search it from $PATH. - - llvm.clang-bpf-cmd-template:: - Cmdline template. Below lines show its default value. Environment - variable is used to pass options. - "$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\ - "-DLINUX_VERSION_CODE=$LINUX_VERSION_CODE " \ - "$CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS " \ - "-Wno-unused-value -Wno-pointer-sign " \ - "-working-directory $WORKING_DIR " \ - "-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE" - - llvm.clang-opt:: - Options passed to clang. - - llvm.kbuild-dir:: - kbuild directory. If not set, use /lib/modules/`uname -r`/build. - If set to "" deliberately, skip kernel header auto-detector. - - llvm.kbuild-opts:: - Options passed to 'make' when detecting kernel header options. - - llvm.dump-obj:: - Enable perf dump BPF object files compiled by LLVM. 
- - llvm.opts:: - Options passed to llc. + Can be used to select the default tracer when neither -G nor + -F option is not specified. Possible values are 'function' and + 'function_graph'. samples.*:: @@ -681,6 +702,16 @@ intel-pt.*:: If set, Intel PT decoder will set the mispred flag on all branches. + intel-pt.max-loops:: + If set and non-zero, the maximum number of unconditional + branches decoded without consuming any trace packets. If + the maximum is exceeded there will be a "Never-ending loop" + error. The default is 100000. + + intel-pt.all-switch-events:: + If the user has permission to do so, always record all context + switch events on all CPUs. + auxtrace.*:: auxtrace.dumpdir:: @@ -689,6 +720,26 @@ auxtrace.*:: If the directory does not exist or has the wrong file type, the current directory is used. +itrace.*:: + + debug-log-buffer-size:: + Log size in bytes to output when using the option --itrace=d+e + Refer 'itrace' option of linkperf:perf-script[1] or + linkperf:perf-report[1]. The default is 16384. + +daemon.*:: + + daemon.base:: + Base path for daemon data. All sessions data are stored under + this path. + +session-<NAME>.*:: + + session-<NAME>.run:: + + Defines new record session for daemon. The value is record's + command line without the 'record' keyword. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/Documentation/perf-daemon.txt b/tools/perf/Documentation/perf-daemon.txt new file mode 100644 index 000000000000..f558f8e4bc9b --- /dev/null +++ b/tools/perf/Documentation/perf-daemon.txt @@ -0,0 +1,208 @@ +perf-daemon(1) +============== + + +NAME +---- +perf-daemon - Run record sessions on background + + +SYNOPSIS +-------- +[verse] +'perf daemon' +'perf daemon' [<options>] +'perf daemon start' [<options>] +'perf daemon stop' [<options>] +'perf daemon signal' [<options>] +'perf daemon ping' [<options>] + + +DESCRIPTION +----------- +This command allows to run simple daemon process that starts and +monitors configured record sessions. + +You can imagine 'perf daemon' of background process with several +'perf record' child tasks, like: + + # ps axjf + ... + 1 916507 ... perf daemon start + 916507 916508 ... \_ perf record --control=fifo:control,ack -m 10M -e cycles --overwrite --switch-output -a + 916507 916509 ... \_ perf record --control=fifo:control,ack -m 20M -e sched:* --overwrite --switch-output -a + +Not every 'perf record' session is suitable for running under daemon. +User need perf session that either produces data on query, like the +flight recorder sessions in above example or session that is configured +to produce data periodically, like with --switch-output configuration +for time and size. + +Each session is started with control setup (with perf record --control +options). + +Sessions are configured through config file, see CONFIG FILE section +with EXAMPLES. + + +OPTIONS +------- +-v:: +--verbose:: + Be more verbose. + +--config=<PATH>:: + Config file path. If not provided, perf will check system and default + locations (/etc/perfconfig, $HOME/.perfconfig). + +--base=<PATH>:: + Base directory path. Each daemon instance is running on top + of base directory. Only one instance of server can run on + top of one directory at the time. + +All generic options are available also under commands. + + +START COMMAND +------------- +The start command creates the daemon process. + +-f:: +--foreground:: + Do not put the process in background. + + +STOP COMMAND +------------ +The stop command stops all the session and the daemon process. 
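For illustration, a daemon run against an explicit config file is started and
later stopped with the same --config path (the path below is only an example;
see also the EXAMPLES section):

    # perf daemon start --config /etc/perfconfig-daemon
    # perf daemon stop --config /etc/perfconfig-daemon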
+ + +SIGNAL COMMAND +-------------- +The signal command sends signal to configured sessions. + +--session:: + Send signal to specific session. + + +PING COMMAND +------------ +The ping command sends control ping to configured sessions. + +--session:: + Send ping to specific session. + + +CONFIG FILE +----------- +The daemon is configured within standard perf config file by +following new variables: + +daemon.base: + Base path for daemon data. All sessions data are + stored under this path. + +session-<NAME>.run: + Defines new record session. The value is record's command + line without the 'record' keyword. + +Each perf record session is run in daemon.base/<NAME> directory. + + +EXAMPLES +-------- +Example with 2 record sessions: + + # cat ~/.perfconfig + [daemon] + base=/opt/perfdata + + [session-cycles] + run = -m 10M -e cycles --overwrite --switch-output -a + + [session-sched] + run = -m 20M -e sched:* --overwrite --switch-output -a + + +Starting the daemon: + + # perf daemon start + + +Check sessions: + + # perf daemon + [603349:daemon] base: /opt/perfdata + [603350:cycles] perf record -m 10M -e cycles --overwrite --switch-output -a + [603351:sched] perf record -m 20M -e sched:* --overwrite --switch-output -a + +First line is daemon process info with configured daemon base. + + +Check sessions with more info: + + # perf daemon -v + [603349:daemon] base: /opt/perfdata + output: /opt/perfdata/output + lock: /opt/perfdata/lock + up: 1 minutes + [603350:cycles] perf record -m 10M -e cycles --overwrite --switch-output -a + base: /opt/perfdata/session-cycles + output: /opt/perfdata/session-cycles/output + control: /opt/perfdata/session-cycles/control + ack: /opt/perfdata/session-cycles/ack + up: 1 minutes + [603351:sched] perf record -m 20M -e sched:* --overwrite --switch-output -a + base: /opt/perfdata/session-sched + output: /opt/perfdata/session-sched/output + control: /opt/perfdata/session-sched/control + ack: /opt/perfdata/session-sched/ack + up: 1 minutes + +The 'base' path is daemon/session base. +The 'lock' file is daemon's lock file guarding that no other +daemon is running on top of the base. +The 'output' file is perf record output for specific session. +The 'control' and 'ack' files are perf control files. +The 'up' number shows minutes daemon/session is running. + + +Make sure control session is online: + + # perf daemon ping + OK cycles + OK sched + + +Send USR2 signal to session 'cycles' to generate perf.data file: + + # perf daemon signal --session cycles + signal 12 sent to session 'cycles [603452]' + + # tail -2 /opt/perfdata/session-cycles/output + [ perf record: dump data: Woken up 1 times ] + [ perf record: Dump perf.data.2020123017013149 ] + + +Send USR2 signal to all sessions: + + # perf daemon signal + signal 12 sent to session 'cycles [603452]' + signal 12 sent to session 'sched [603453]' + + # tail -2 /opt/perfdata/session-cycles/output + [ perf record: dump data: Woken up 1 times ] + [ perf record: Dump perf.data.2020123017024689 ] + # tail -2 /opt/perfdata/session-sched/output + [ perf record: dump data: Woken up 1 times ] + [ perf record: Dump perf.data.2020123017024713 ] + + +Stop daemon: + + # perf daemon stop + + +SEE ALSO +-------- +linkperf:perf-record[1], linkperf:perf-config[1] diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index c87180764829..417bf17e265c 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -17,7 +17,7 @@ Data file related processing. 
COMMANDS -------- convert:: - Converts perf data file into another format (only CTF [1] format is support by now). + Converts perf data file into another format. It's possible to set data-convert debug variable to get debug messages from conversion, like: perf --debug data-convert data convert ... @@ -27,6 +27,12 @@ OPTIONS for 'convert' --to-ctf:: Triggers the CTF conversion, specify the path of CTF data directory. +--to-json:: + Triggers JSON conversion. Specify the JSON filename to output. + +--tod:: + Convert time to wall clock time. + -i:: Specify input perf data file path. diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index f50ca0fef0a4..f3067a4af294 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -75,7 +75,7 @@ OPTIONS -q:: --quiet:: - Do not show any message. (Suppress -v) + Do not show any warnings or messages. (Suppress -v) -f:: --force:: @@ -182,6 +182,10 @@ OPTIONS --tid=:: Only diff samples for given thread ID (comma separated list). +--stream:: + Enable hot streams comparison. Stream can be a callchain which is + aggregated by the branch records from samples. + COMPARISON ---------- The comparison is governed by the baseline file. The baseline perf.data diff --git a/tools/perf/Documentation/perf-dlfilter.txt b/tools/perf/Documentation/perf-dlfilter.txt new file mode 100644 index 000000000000..8887cc20a809 --- /dev/null +++ b/tools/perf/Documentation/perf-dlfilter.txt @@ -0,0 +1,299 @@ +perf-dlfilter(1) +================ + +NAME +---- +perf-dlfilter - Filter sample events using a dynamically loaded shared +object file + +SYNOPSIS +-------- +[verse] +'perf script' [--dlfilter file.so ] [ --dlarg arg ]... + +DESCRIPTION +----------- + +This option is used to process data through a custom filter provided by a +dynamically loaded shared object file. Arguments can be passed using --dlarg +and retrieved using perf_dlfilter_fns.args(). + +If 'file.so' does not contain "/", then it will be found either in the current +directory, or perf tools exec path which is ~/libexec/perf-core/dlfilters for +a local build and install (refer perf --exec-path), or the dynamic linker +paths. + +API +--- + +The API for filtering consists of the following: + +[source,c] +---- +#include <perf/perf_dlfilter.h> + +struct perf_dlfilter_fns perf_dlfilter_fns; + +int start(void **data, void *ctx); +int stop(void *data, void *ctx); +int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx); +int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx); +const char *filter_description(const char **long_description); +---- + +If implemented, 'start' will be called at the beginning, before any +calls to 'filter_event' or 'filter_event_early'. Return 0 to indicate success, +or return a negative error code. '*data' can be assigned for use by other +functions. 'ctx' is needed for calls to perf_dlfilter_fns, but most +perf_dlfilter_fns are not valid when called from 'start'. + +If implemented, 'stop' will be called at the end, after any calls to +'filter_event' or 'filter_event_early'. Return 0 to indicate success, or +return a negative error code. 'data' is set by 'start'. 'ctx' is needed +for calls to perf_dlfilter_fns, but most perf_dlfilter_fns are not valid +when called from 'stop'. + +If implemented, 'filter_event' will be called for each sample event. +Return 0 to keep the sample event, 1 to filter it out, or return a negative +error code. 
'data' is set by 'start'. 'ctx' is needed for calls to +'perf_dlfilter_fns'. + +'filter_event_early' is the same as 'filter_event' except it is called before +internal filtering. + +If implemented, 'filter_description' should return a one-line description +of the filter, and optionally a longer description. + +Do not assume the 'sample' argument is valid (dereferenceable) +after 'filter_event' and 'filter_event_early' return. + +Do not assume data referenced by pointers in struct perf_dlfilter_sample +is valid (dereferenceable) after 'filter_event' and 'filter_event_early' return. + +The perf_dlfilter_sample structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +'filter_event' and 'filter_event_early' are passed a perf_dlfilter_sample +structure, which contains the following fields: +[source,c] +---- +/* + * perf sample event information (as per perf script and <linux/perf_event.h>) + */ +struct perf_dlfilter_sample { + __u32 size; /* Size of this structure (for compatibility checking) */ + __u16 ins_lat; /* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */ + __u16 p_stage_cyc; /* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */ + __u64 ip; + __s32 pid; + __s32 tid; + __u64 time; + __u64 addr; + __u64 id; + __u64 stream_id; + __u64 period; + __u64 weight; /* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */ + __u64 transaction; /* Refer PERF_SAMPLE_TRANSACTION in <linux/perf_event.h> */ + __u64 insn_cnt; /* For instructions-per-cycle (IPC) */ + __u64 cyc_cnt; /* For instructions-per-cycle (IPC) */ + __s32 cpu; + __u32 flags; /* Refer PERF_DLFILTER_FLAG_* above */ + __u64 data_src; /* Refer PERF_SAMPLE_DATA_SRC in <linux/perf_event.h> */ + __u64 phys_addr; /* Refer PERF_SAMPLE_PHYS_ADDR in <linux/perf_event.h> */ + __u64 data_page_size; /* Refer PERF_SAMPLE_DATA_PAGE_SIZE in <linux/perf_event.h> */ + __u64 code_page_size; /* Refer PERF_SAMPLE_CODE_PAGE_SIZE in <linux/perf_event.h> */ + __u64 cgroup; /* Refer PERF_SAMPLE_CGROUP in <linux/perf_event.h> */ + __u8 cpumode; /* Refer CPUMODE_MASK etc in <linux/perf_event.h> */ + __u8 addr_correlates_sym; /* True => resolve_addr() can be called */ + __u16 misc; /* Refer perf_event_header in <linux/perf_event.h> */ + __u32 raw_size; /* Refer PERF_SAMPLE_RAW in <linux/perf_event.h> */ + const void *raw_data; /* Refer PERF_SAMPLE_RAW in <linux/perf_event.h> */ + __u64 brstack_nr; /* Number of brstack entries */ + const struct perf_branch_entry *brstack; /* Refer <linux/perf_event.h> */ + __u64 raw_callchain_nr; /* Number of raw_callchain entries */ + const __u64 *raw_callchain; /* Refer <linux/perf_event.h> */ + const char *event; + __s32 machine_pid; + __s32 vcpu; +}; +---- + +Note: 'machine_pid' and 'vcpu' are not original members, but were added together later. +'size' can be used to determine their presence at run time. +PERF_DLFILTER_HAS_MACHINE_PID will be defined if they are present at compile time. +For example: +[source,c] +---- +#include <perf/perf_dlfilter.h> +#include <stddef.h> +#include <stdbool.h> + +static inline bool have_machine_pid(const struct perf_dlfilter_sample *sample) +{ +#ifdef PERF_DLFILTER_HAS_MACHINE_PID + return sample->size >= offsetof(struct perf_dlfilter_sample, vcpu) + sizeof(sample->vcpu); +#else + return false; +#endif +} +---- + +The perf_dlfilter_fns structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 'perf_dlfilter_fns' structure is populated with function pointers when the +file is loaded. The functions can be called by 'filter_event' or +'filter_event_early'. 
+ +[source,c] +---- +struct perf_dlfilter_fns { + const struct perf_dlfilter_al *(*resolve_ip)(void *ctx); + const struct perf_dlfilter_al *(*resolve_addr)(void *ctx); + char **(*args)(void *ctx, int *dlargc); + __s32 (*resolve_address)(void *ctx, __u64 address, struct perf_dlfilter_al *al); + const __u8 *(*insn)(void *ctx, __u32 *length); + const char *(*srcline)(void *ctx, __u32 *line_number); + struct perf_event_attr *(*attr)(void *ctx); + __s32 (*object_code)(void *ctx, __u64 ip, void *buf, __u32 len); + void (*al_cleanup)(void *ctx, struct perf_dlfilter_al *al); + void *(*reserved[119])(void *); +}; +---- + +'resolve_ip' returns information about ip. + +'resolve_addr' returns information about addr (if addr_correlates_sym). + +'args' returns arguments from --dlarg options. + +'resolve_address' provides information about 'address'. al->size must be set +before calling. Returns 0 on success, -1 otherwise. Call al_cleanup() (if present, +see below) when 'al' data is no longer needed. + +'insn' returns instruction bytes and length. + +'srcline' return source file name and line number. + +'attr' returns perf_event_attr, refer <linux/perf_event.h>. + +'object_code' reads object code and returns the number of bytes read. + +'al_cleanup' must be called (if present, so check perf_dlfilter_fns.al_cleanup != NULL) +after resolve_address() to free any associated resources. + +Do not assume pointers obtained via perf_dlfilter_fns are valid (dereferenceable) +after 'filter_event' and 'filter_event_early' return. + +The perf_dlfilter_al structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 'perf_dlfilter_al' structure contains information about an address. + +[source,c] +---- +/* + * Address location (as per perf script) + */ +struct perf_dlfilter_al { + __u32 size; /* Size of this structure (for compatibility checking) */ + __u32 symoff; + const char *sym; + __u64 addr; /* Mapped address (from dso) */ + __u64 sym_start; + __u64 sym_end; + const char *dso; + __u8 sym_binding; /* STB_LOCAL, STB_GLOBAL or STB_WEAK, refer <elf.h> */ + __u8 is_64_bit; /* Only valid if dso is not NULL */ + __u8 is_kernel_ip; /* True if in kernel space */ + __u32 buildid_size; + __u8 *buildid; + /* Below members are only populated by resolve_ip() */ + __u8 filtered; /* true if this sample event will be filtered out */ + const char *comm; + void *priv; /* Private data. Do not change */ +}; +---- + +Do not assume data referenced by pointers in struct perf_dlfilter_al +is valid (dereferenceable) after 'filter_event' and 'filter_event_early' return. + +perf_dlfilter_sample flags +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 'flags' member of 'perf_dlfilter_sample' corresponds with the flags field +of perf script. 
The bits of the flags are as follows: + +[source,c] +---- +/* Definitions for perf_dlfilter_sample flags */ +enum { + PERF_DLFILTER_FLAG_BRANCH = 1ULL << 0, + PERF_DLFILTER_FLAG_CALL = 1ULL << 1, + PERF_DLFILTER_FLAG_RETURN = 1ULL << 2, + PERF_DLFILTER_FLAG_CONDITIONAL = 1ULL << 3, + PERF_DLFILTER_FLAG_SYSCALLRET = 1ULL << 4, + PERF_DLFILTER_FLAG_ASYNC = 1ULL << 5, + PERF_DLFILTER_FLAG_INTERRUPT = 1ULL << 6, + PERF_DLFILTER_FLAG_TX_ABORT = 1ULL << 7, + PERF_DLFILTER_FLAG_TRACE_BEGIN = 1ULL << 8, + PERF_DLFILTER_FLAG_TRACE_END = 1ULL << 9, + PERF_DLFILTER_FLAG_IN_TX = 1ULL << 10, + PERF_DLFILTER_FLAG_VMENTRY = 1ULL << 11, + PERF_DLFILTER_FLAG_VMEXIT = 1ULL << 12, +}; +---- + +EXAMPLE +------- + +Filter out everything except branches from "foo" to "bar": + +[source,c] +---- +#include <perf/perf_dlfilter.h> +#include <string.h> + +struct perf_dlfilter_fns perf_dlfilter_fns; + +int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx) +{ + const struct perf_dlfilter_al *al; + const struct perf_dlfilter_al *addr_al; + + if (!sample->ip || !sample->addr_correlates_sym) + return 1; + + al = perf_dlfilter_fns.resolve_ip(ctx); + if (!al || !al->sym || strcmp(al->sym, "foo")) + return 1; + + addr_al = perf_dlfilter_fns.resolve_addr(ctx); + if (!addr_al || !addr_al->sym || strcmp(addr_al->sym, "bar")) + return 1; + + return 0; +} +---- + +To build the shared object, assuming perf has been installed for the local user +i.e. perf_dlfilter.h is in ~/include/perf : + + gcc -c -I ~/include -fpic dlfilter-example.c + gcc -shared -o dlfilter-example.so dlfilter-example.o + +To use the filter with perf script: + + perf script --dlfilter dlfilter-example.so + +NOTES +----- + +The dlfilter .so file will be dependent on shared libraries. If those change, +it may be necessary to rebuild the .so. Also there may be unexpected results +if the .so uses different versions of the shared libraries that perf uses. +Versions can be checked using the ldd command. + +SEE ALSO +-------- +linkperf:perf-script[1] diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt index c0a66400a960..9af8b8dfb7b6 100644 --- a/tools/perf/Documentation/perf-evlist.txt +++ b/tools/perf/Documentation/perf-evlist.txt @@ -29,7 +29,7 @@ OPTIONS Show just the sample frequency used for each event. -v:: ---verbose=:: +--verbose:: Show all fields. -g:: diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index b80c84307dc9..b77f58c4d2fd 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -9,31 +9,35 @@ perf-ftrace - simple wrapper for kernel's ftrace functionality SYNOPSIS -------- [verse] -'perf ftrace' <command> +'perf ftrace' {trace|latency|profile} <command> DESCRIPTION ----------- -The 'perf ftrace' command is a simple wrapper of kernel's ftrace -functionality. It only supports single thread tracing currently and -just reads trace_pipe in text and then write it to stdout. +The 'perf ftrace' command provides a collection of subcommands which use +kernel's ftrace infrastructure. -The following options apply to perf ftrace. + 'perf ftrace trace' is a simple wrapper of the ftrace. It only supports + single thread tracing currently and just reads trace_pipe in text and then + write it to stdout. -OPTIONS -------- + 'perf ftrace latency' calculates execution latency of a given function + (optionally with BPF) and display it as a histogram. 
--t:: ---tracer=:: - Tracer to use: function_graph or function. + 'perf ftrace profile' show a execution profile for each function including + total, average, max time and the number of calls. --v:: ---verbose=:: - Verbosity level. +The following options apply to perf ftrace. + +COMMON OPTIONS +-------------- -p:: --pid=:: Trace on existing process id (comma separated list). +--tid=:: + Trace on existing thread id (comma separated list). + -a:: --all-cpus:: Force system-wide collection. Scripts run without a <command> @@ -48,39 +52,163 @@ OPTIONS Ranges of CPUs are specified with -: 0-2. Default is to trace on all online CPUs. +-v:: +--verbose:: + Increase the verbosity level. + + +OPTIONS for 'perf ftrace trace' +------------------------------- + +-t:: +--tracer=:: + Tracer to use when neither -G nor -F option is not + specified: function_graph or function. + +-F:: +--funcs:: + List available functions to trace. It accepts a pattern to + only list interested functions. + +-D:: +--delay:: + Time (ms) to wait before starting tracing after program start. + +-m:: +--buffer-size:: + Set the size of per-cpu tracing buffer, <size> is expected to + be a number with appended unit character - B/K/M/G. + +--inherit:: + Trace children processes spawned by our target. + -T:: --trace-funcs=:: - Only trace functions given by the argument. Multiple functions - can be given by using this option more than once. The function - argument also can be a glob pattern. It will be passed to - 'set_ftrace_filter' in tracefs. + Select function tracer and set function filter on the given + function (or a glob pattern). Multiple functions can be given + by using this option more than once. The function argument also + can be a glob pattern. It will be passed to 'set_ftrace_filter' + in tracefs. -N:: --notrace-funcs=:: - Do not trace functions given by the argument. Like -T option, - this can be used more than once to specify multiple functions - (or glob patterns). It will be passed to 'set_ftrace_notrace' + Select function tracer and do not trace functions given by the + argument. Like -T option, this can be used more than once to + specify multiple functions (or glob patterns). It will be + passed to 'set_ftrace_notrace' in tracefs. + +--func-opts:: + List of options allowed to set: + + - call-graph - Display kernel stack trace for function tracer. + - irq-info - Display irq context info for function tracer. + +-G:: +--graph-funcs=:: + Select function_graph tracer and set graph filter on the given + function (or a glob pattern). This is useful to trace for + functions executed from the given function. This can be used more + than once to specify multiple functions. It will be passed to + 'set_graph_function' in tracefs. + +-g:: +--nograph-funcs=:: + Select function_graph tracer and set graph notrace filter on the + given function (or a glob pattern). Like -G option, this is useful + for the function_graph tracer only and disables tracing for function + executed from the given function. This can be used more than once to + specify multiple functions. It will be passed to 'set_graph_notrace' in tracefs. +--graph-opts:: + List of options allowed to set: + + - nosleep-time - Measure on-CPU time only for function_graph tracer. + - noirqs - Ignore functions that happen inside interrupt. + - verbose - Show process names, PIDs, timestamps, etc. + - thresh=<n> - Setup trace duration threshold in microseconds. + - depth=<n> - Set max depth for function graph tracer to follow. + - tail - Print function name at the end. 
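Putting the trace options together, a depth-limited function_graph trace of
scheduler functions might be requested as follows (the pattern and values are
only illustrative):

    # perf ftrace trace -G 'sched*' --graph-opts depth=3 -- sleep 1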
+ + +OPTIONS for 'perf ftrace latency' +--------------------------------- + +-T:: +--trace-funcs=:: + Set the function name to get the histogram. Unlike perf ftrace trace, + it only allows single function to calculate the histogram. + +-b:: +--use-bpf:: + Use BPF to measure function latency instead of using the ftrace (it + uses function_graph tracer internally). + +-n:: +--use-nsec:: + Use nano-second instead of micro-second as a base unit of the histogram. + +--bucket-range=:: + Bucket range in ms or ns (according to -n/--use-nsec), default is log2() mode. + +--min-latency=:: + Minimum latency for the start of the first bucket, in ms or ns (according to + -n/--use-nsec). + +--max-latency=:: + Maximum latency for the start of the last bucket, in ms or ns (according to + -n/--use-nsec). The setting is ignored if the value results in more than + 22 buckets. + +OPTIONS for 'perf ftrace profile' +--------------------------------- + +-T:: +--trace-funcs=:: + Set function filter on the given function (or a glob pattern). + Multiple functions can be given by using this option more than once. + The function argument also can be a glob pattern. It will be passed + to 'set_ftrace_filter' in tracefs. + +-N:: +--notrace-funcs=:: + Do not trace functions given by the argument. Like -T option, this + can be used more than once to specify multiple functions (or glob + patterns). It will be passed to 'set_ftrace_notrace' in tracefs. + -G:: --graph-funcs=:: - Set graph filter on the given function (or a glob pattern). - This is useful for the function_graph tracer only and enables - tracing for functions executed from the given function. - This can be used more than once to specify multiple functions. - It will be passed to 'set_graph_function' in tracefs. + Set graph filter on the given function (or a glob pattern). This is + useful to trace for functions executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_function' in tracefs. -g:: --nograph-funcs=:: Set graph notrace filter on the given function (or a glob pattern). - Like -G option, this is useful for the function_graph tracer only - and disables tracing for function executed from the given function. - This can be used more than once to specify multiple functions. - It will be passed to 'set_graph_notrace' in tracefs. + Like -G option, this is useful for the function_graph tracer only and + disables tracing for function executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_notrace' in tracefs. + +-m:: +--buffer-size:: + Set the size of per-cpu tracing buffer, <size> is expected to + be a number with appended unit character - B/K/M/G. + +-s:: +--sort=:: + Sort the result by the given field. Available values are: + total, avg, max, count, name. Default is 'total'. + +--graph-opts:: + List of options allowed to set: + + - nosleep-time - Measure on-CPU time only for function_graph tracer. + - noirqs - Ignore functions that happen inside interrupt. + - thresh=<n> - Setup trace duration threshold in microseconds. + - depth=<n> - Set max depth for function graph tracer to follow. 
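As a sketch of how these options combine (the function pattern is only
illustrative), a profile of VFS functions sorted by maximum time could be
gathered with:

    # perf ftrace profile -T 'vfs_*' -s max -- sleep 1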
--D:: ---graph-depth=:: - Set max depth for function graph tracer to follow SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index 70969ea73e01..c972032f4ca0 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -24,8 +24,19 @@ information could make use of this facility. OPTIONS ------- -b:: ---build-ids=:: - Inject build-ids into the output stream +--build-ids:: + Inject build-ids of DSOs hit by samples into the output stream. + This means it needs to process all SAMPLE records to find the DSOs. + +--buildid-all:: + Inject build-ids of all DSOs into the output stream regardless of hits + and skip SAMPLE processing. + +--known-build-ids=:: + Override build-ids to inject using these comma-separated pairs of + build-id and path. Understands file://filename to read these pairs + from a file, which can be generated with perf buildid-list. + -v:: --verbose:: Be more verbose. @@ -41,6 +52,13 @@ OPTIONS tasks slept. sched_switch contains a callchain where a task slept and sched_stat contains a timeslice how long a task slept. +-k:: +--vmlinux=<file>:: + vmlinux pathname + +--ignore-vmlinux:: + Ignore vmlinux files. + --kallsyms=<file>:: kallsyms pathname @@ -64,6 +82,37 @@ include::itrace.txt[] --force:: Don't complain, do it. +--vm-time-correlation[=OPTIONS]:: + Some architectures may capture AUX area data which contains timestamps + affected by virtualization. This option will update those timestamps + in place, to correlate with host timestamps. The in-place update means + that an output file is not specified, and instead the input file is + modified. The options are architecture specific, except that they may + start with "dry-run" which will cause the file to be processed but + without updating it. Currently this option is supported only by + Intel PT, refer linkperf:perf-intel-pt[1] + +--guest-data=<path>,<pid>[,<time offset>[,<time scale>]]:: + Insert events from a perf.data file recorded in a virtual machine at + the same time as the input perf.data file was recorded on the host. + The Process ID (PID) of the QEMU hypervisor process must be provided, + and the time offset and time scale (multiplier) will likely be needed + to convert guest time stamps into host time stamps. For example, for + x86 the TSC Offset and Multiplier could be provided for a virtual machine + using Linux command line option no-kvmclock. + Currently only mmap, mmap2, comm, task, context_switch, ksymbol, + and text_poke events are inserted, as well as build ID information. + The QEMU option -name debug-threads=on is needed so that thread names + can be used to determine which thread is running which VCPU. Note + libvirt seems to use this by default. + When using perf record in the guest, option --sample-identifier + should be used, and also --buildid-all and --switch-events may be + useful. + +:GMEXAMPLECMD: inject +:GMEXAMPLESUBCMD: +include::guestmount.txt[] + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1], diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index f4cd49a7fcdb..cc0f37f0fa5a 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -101,16 +101,47 @@ data is available you can use the 'perf script' tool with all itrace sampling options, which will list all the samples. 
perf record -e intel_pt//u ls - perf script --itrace=ibxwpe + perf script --itrace=iybxwpe An interesting field that is not printed by default is 'flags' which can be displayed as follows: - perf script --itrace=ibxwpe -F+flags + perf script --itrace=iybxwpe -F+flags -The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, -system, asynchronous, interrupt, transaction abort, trace begin, trace end, and -in transaction, respectively. +The flags are "bcrosyiABExghDt" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, +in transaction, VM-entry, VM-exit, interrupt disabled, and interrupt disable +toggle respectively. + +perf script also supports higher level ways to dump instruction traces: + + perf script --insn-trace=disasm + +or to use the xed disassembler, which requires installing the xed tool +(see XED below): + + perf script --insn-trace --xed + +Dumping all instructions in a long trace can be fairly slow. It is usually better +to start with higher level decoding, like + + perf script --call-trace + +or + + perf script --call-ret-trace + +and then select a time range of interest. The time range can then be examined +in detail with + + perf script --time starttime,stoptime --insn-trace=disasm + +While examining the trace it's also useful to filter on specific CPUs using +the -C option + + perf script --time starttime,stoptime --insn-trace=disasm -C 1 + +Dump all instructions in time range on CPU 1. Another interesting field that is not printed by default is 'ipc' which can be displayed as follows: @@ -120,16 +151,28 @@ displayed as follows: There are two ways that instructions-per-cycle (IPC) can be calculated depending on the recording. -If the 'cyc' config term (see config terms section below) was used, then IPC is -calculated using the cycle count from CYC packets, otherwise MTC packets are -used - refer to the 'mtc' config term. When MTC is used, however, the values -are less accurate because the timing is less accurate. +If the 'cyc' config term (see <<_config_terms,config terms>> section below) was used, then IPC +and cycle events are calculated using the cycle count from CYC packets, otherwise +MTC packets are used - refer to the 'mtc' config term. When MTC is used, however, +the values are less accurate because the timing is less accurate. Because Intel PT does not update the cycle count on every branch or instruction, the values will often be zero. When there are values, they will be the number of instructions and number of cycles since the last update, and thus represent -the average IPC since the last IPC for that event type. Note IPC for "branches" -events is calculated separately from IPC for "instructions" events. +the average IPC cycle count since the last IPC for that event type. +Note IPC for "branches" events is calculated separately from IPC for "instructions" +events. + +Even with the 'cyc' config term, it is possible to produce IPC information for +every change of timestamp, but at the expense of accuracy. That is selected by +specifying the itrace 'A' option. Due to the granularity of timestamps, the +actual number of cycles increases even though the cycles reported does not. +The number of instructions is known, but if IPC is reported, cycles can be too +low and so IPC is too high. Note that inaccuracy decreases as the period of +sampling increases i.e. 
if the number of cycles is too low by a small amount, +that becomes less significant if the number of cycles is large. It may also be +useful to use the 'A' option in conjunction with dlfilter-show-cycles.so to +provide higher granularity cycle information. Also note that the IPC instruction count may or may not include the current instruction. If the cycle count is associated with an asynchronous branch @@ -148,7 +191,19 @@ Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, and to script exported-sql-viewer.py for an example of using the database. There is also script intel-pt-events.py which provides an example of how to -unpack the raw data for power events and PTWRITE. +unpack the raw data for power events and PTWRITE. The script also displays +branches, and supports 2 additional modes selected by option: + + - --insn-trace - instruction trace + - --src-trace - source trace + +The intel-pt-events.py script also has options: + + - --all-switch-events - display all switch events, not only the last consecutive. + - --interleave [<n>] - interleave sample output for the same timestamp so that + no more than n samples for a CPU are displayed in a row. 'n' defaults to 4. + Note this only affects the order of output, and only when the timestamp is the + same. As mentioned above, it is easy to capture too much data. One way to limit the data captured is to use 'snapshot' mode which is explained further below. @@ -184,7 +239,7 @@ which is the same as -e intel_pt/tsc=1,noretcomp=0/ -Note there are now new config terms - see section 'config terms' further below. +Note there are other config terms - see section <<_config_terms,config terms>> further below. The config terms are listed in /sys/devices/intel_pt/format. They are bit fields within the config member of the struct perf_event_attr which is @@ -225,7 +280,7 @@ Note that, as with all events, the event is suffixed with event modifiers: H host p precise ip -'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. +'h', 'G' and 'H' are for virtualization which are not used by Intel PT. 'p' is also not relevant to Intel PT. So only options 'u' and 'k' are meaningful for Intel PT. @@ -256,192 +311,271 @@ perf_event_attr is displayed if the -vv option is used e.g. config terms ~~~~~~~~~~~~ -The June 2015 version of Intel 64 and IA-32 Architectures Software Developer -Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. -Some of the features are reflect in new config terms. All the config terms are -described below. - -tsc Always supported. Produces TSC timestamp packets to provide - timing information. In some cases it is possible to decode - without timing information, for example a per-thread context - that does not overlap executable memory maps. - - The default config selects tsc (i.e. tsc=1). - -noretcomp Always supported. Disables "return compression" so a TIP packet - is produced when a function returns. Causes more packets to be - produced but might make decoding more reliable. - - The default config does not select noretcomp (i.e. noretcomp=0). - -psb_period Allows the frequency of PSB packets to be specified. - - The PSB packet is a synchronization packet that provides a - starting point for decoding or recovery from errors. - - Support for psb_period is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and "0" - otherwise. 
- - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The psb_period value is converted to the approximate number of - trace bytes between PSB packets as: - - 2 ^ (value + 11) - - e.g. value 3 means 16KiB bytes between PSBs - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/psb_period=15/u uname - Invalid psb_period for intel_pt. Valid values are: 0-5 - - If MTC packets are selected, the default config selects a value - of 3 (i.e. psb_period=3) or the nearest lower value that is - supported (0 is always supported). Otherwise the default is 0. - - If decoding is expected to be reliable and the buffer is large - then a large PSB period can be used. - - Because a TSC packet is produced with PSB, the PSB period can - also affect the granularity to timing information in the absence - of MTC or CYC. - -mtc Produces MTC timing packets. - - MTC packets provide finer grain timestamp information than TSC - packets. MTC packets record time using the hardware crystal - clock (CTC) which is related to TSC packets using a TMA packet. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc - - which contains "1" if the feature is supported and - "0" otherwise. - - The frequency of MTC packets can also be specified - see - mtc_period below. - -mtc_period Specifies how frequently MTC packets are produced - see mtc - above for how to determine if MTC packets are supported. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The mtc_period value is converted to the MTC frequency as: - - CTC-frequency / (2 ^ value) - - e.g. value 3 means one eighth of CTC-frequency - - Where CTC is the hardware crystal clock, the frequency of which - can be related to TSC via values provided in cpuid leaf 0x15. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/mtc_period=15/u uname - Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 - - The default value is 3 or the nearest lower value - that is supported (0 is always supported). - -cyc Produces CYC timing packets. - - CYC packets provide even finer grain timestamp information than - MTC and TSC packets. A CYC packet contains the number of CPU - cycles since the last CYC packet. Unlike MTC and TSC packets, - CYC packets are only sent when another packet is also sent. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and - "0" otherwise. - - The number of CYC packets produced can be reduced by specifying - a threshold - see cyc_thresh below. - -cyc_thresh Specifies how frequently CYC packets are produced - see cyc - above for how to determine if CYC packets are supported. - - Valid cyc_thresh values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The cyc_thresh value represents the minimum number of CPU cycles - that must have passed before a CYC packet can be sent. 
The - number of CPU cycles is: - - 2 ^ (value - 1) - - e.g. value 4 means 8 CPU cycles must pass before a CYC packet - can be sent. Note a CYC packet is still only sent when another - packet is sent, not at, e.g. every 8 CPU cycles. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname - Invalid cyc_thresh for intel_pt. Valid values are: 0-12 - - CYC packets are not requested by default. - -pt Specifies pass-through which enables the 'branch' config term. - - The default config selects 'pt' if it is available, so a user will - never need to specify this term. - -branch Enable branch tracing. Branch tracing is enabled by default so to - disable branch tracing use 'branch=0'. - - The default config selects 'branch' if it is available. - -ptw Enable PTWRITE packets which are produced when a ptwrite instruction - is executed. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/ptwrite - - which contains "1" if the feature is supported and - "0" otherwise. - -fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet - provides the address of the ptwrite instruction. In the absence of - fup_on_ptw, the decoder will use the address of the previous branch - if branch tracing is enabled, otherwise the address will be zero. - Note that fup_on_ptw will work even when branch tracing is disabled. - -pwr_evt Enable power events. The power events provide information about - changes to the CPU C-state. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/power_event_trace - - which contains "1" if the feature is supported and - "0" otherwise. - +Config terms are parameters specified with the -e intel_pt// event option, +for example: + + -e intel_pt/cyc/ + +which selects cycle accurate mode. Each config term can have a value which +defaults to 1, so the above is the same as: + + -e intel_pt/cyc=1/ + +Some terms are set by default, so must be set to 0 to turn them off. For +example, to turn off branch tracing: + + -e intel_pt/branch=0/ + +Multiple config terms are separated by commas, for example: + + -e intel_pt/cyc,mtc_period=9/ + +There are also common config terms, see linkperf:perf-record[1] documentation. + +Intel PT config terms are described below. + +*tsc*:: +Always supported. Produces TSC timestamp packets to provide +timing information. In some cases it is possible to decode +without timing information, for example a per-thread context +that does not overlap executable memory maps. ++ +The default config selects tsc (i.e. tsc=1). + +*noretcomp*:: +Always supported. Disables "return compression" so a TIP packet +is produced when a function returns. Causes more packets to be +produced but might make decoding more reliable. ++ +The default config does not select noretcomp (i.e. noretcomp=0). + +*psb_period*:: +Allows the frequency of PSB packets to be specified. ++ +The PSB packet is a synchronization packet that provides a +starting point for decoding or recovery from errors. ++ +Support for psb_period is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc ++ +which contains "1" if the feature is supported and "0" +otherwise. ++ +Valid values are given by: ++ + /sys/bus/event_source/devices/intel_pt/caps/psb_periods ++ +which contains a hexadecimal value, the bits of which represent +valid values e.g. bit 2 set means value 2 is valid. 
++ +The psb_period value is converted to the approximate number of +trace bytes between PSB packets as: ++ + 2 ^ (value + 11) ++ +e.g. value 3 means 16KiB bytes between PSBs ++ +If an invalid value is entered, the error message +will give a list of valid values e.g. ++ + $ perf record -e intel_pt/psb_period=15/u uname + Invalid psb_period for intel_pt. Valid values are: 0-5 ++ +If MTC packets are selected, the default config selects a value +of 3 (i.e. psb_period=3) or the nearest lower value that is +supported (0 is always supported). Otherwise the default is 0. ++ +If decoding is expected to be reliable and the buffer is large +then a large PSB period can be used. ++ +Because a TSC packet is produced with PSB, the PSB period can +also affect the granularity to timing information in the absence +of MTC or CYC. + +*mtc*:: +Produces MTC timing packets. ++ +MTC packets provide finer grain timestamp information than TSC +packets. MTC packets record time using the hardware crystal +clock (CTC) which is related to TSC packets using a TMA packet. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/mtc ++ +which contains "1" if the feature is supported and +"0" otherwise. ++ +The frequency of MTC packets can also be specified - see +mtc_period below. + +*mtc_period*:: +Specifies how frequently MTC packets are produced - see mtc +above for how to determine if MTC packets are supported. ++ +Valid values are given by: ++ + /sys/bus/event_source/devices/intel_pt/caps/mtc_periods ++ +which contains a hexadecimal value, the bits of which represent +valid values e.g. bit 2 set means value 2 is valid. ++ +The mtc_period value is converted to the MTC frequency as: + + CTC-frequency / (2 ^ value) ++ +e.g. value 3 means one eighth of CTC-frequency ++ +Where CTC is the hardware crystal clock, the frequency of which +can be related to TSC via values provided in cpuid leaf 0x15. ++ +If an invalid value is entered, the error message +will give a list of valid values e.g. ++ + $ perf record -e intel_pt/mtc_period=15/u uname + Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 ++ +The default value is 3 or the nearest lower value +that is supported (0 is always supported). + +*cyc*:: +Produces CYC timing packets. ++ +CYC packets provide even finer grain timestamp information than +MTC and TSC packets. A CYC packet contains the number of CPU +cycles since the last CYC packet. Unlike MTC and TSC packets, +CYC packets are only sent when another packet is also sent. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc ++ +which contains "1" if the feature is supported and +"0" otherwise. ++ +The number of CYC packets produced can be reduced by specifying +a threshold - see cyc_thresh below. + +*cyc_thresh*:: +Specifies how frequently CYC packets are produced - see cyc +above for how to determine if CYC packets are supported. ++ +Valid cyc_thresh values are given by: ++ + /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds ++ +which contains a hexadecimal value, the bits of which represent +valid values e.g. bit 2 set means value 2 is valid. ++ +The cyc_thresh value represents the minimum number of CPU cycles +that must have passed before a CYC packet can be sent. The +number of CPU cycles is: ++ + 2 ^ (value - 1) ++ +e.g. value 4 means 8 CPU cycles must pass before a CYC packet +can be sent. Note a CYC packet is still only sent when another +packet is sent, not at, e.g. every 8 CPU cycles. 
++ +If an invalid value is entered, the error message +will give a list of valid values e.g. ++ + $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname + Invalid cyc_thresh for intel_pt. Valid values are: 0-12 ++ +CYC packets are not requested by default. + +*pt*:: +Specifies pass-through which enables the 'branch' config term. ++ +The default config selects 'pt' if it is available, so a user will +never need to specify this term. + +*branch*:: +Enable branch tracing. Branch tracing is enabled by default so to +disable branch tracing use 'branch=0'. ++ +The default config selects 'branch' if it is available. + +*ptw*:: +Enable PTWRITE packets which are produced when a ptwrite instruction +is executed. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/ptwrite ++ +which contains "1" if the feature is supported and +"0" otherwise. ++ +As an alternative, refer to "Emulated PTWRITE" further below. + +*fup_on_ptw*:: +Enable a FUP packet to follow the PTWRITE packet. The FUP packet +provides the address of the ptwrite instruction. In the absence of +fup_on_ptw, the decoder will use the address of the previous branch +if branch tracing is enabled, otherwise the address will be zero. +Note that fup_on_ptw will work even when branch tracing is disabled. + +*pwr_evt*:: +Enable power events. The power events provide information about +changes to the CPU C-state. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/power_event_trace ++ +which contains "1" if the feature is supported and +"0" otherwise. + +*event*:: +Enable Event Trace. The events provide information about asynchronous +events. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/event_trace ++ +which contains "1" if the feature is supported and +"0" otherwise. + +*notnt*:: +Disable TNT packets. Without TNT packets, it is not possible to walk +executable code to reconstruct control flow, however FUP, TIP, TIP.PGE +and TIP.PGD packets still indicate asynchronous control flow, and (if +return compression is disabled - see noretcomp) return statements. +The advantage of eliminating TNT packets is reducing the size of the +trace and corresponding tracing overhead. ++ +Support for this feature is indicated by: ++ + /sys/bus/event_source/devices/intel_pt/caps/tnt_disable ++ +which contains "1" if the feature is supported and +"0" otherwise. + +*aux-action=start-paused*:: +Start tracing paused, refer to the section <<_pause_or_resume_tracing,Pause or Resume Tracing>> + + +config terms on other events +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some Intel PT features work with other events, features such as AUX area sampling +and PEBS-via-PT. In those cases, the other events can have config terms below: + +*aux-sample-size*:: + Used to set the AUX area sample size, refer to the section + <<_aux_area_sampling_option,AUX area sampling option>> + +*aux-output*:: + Used to select PEBS-via-PT, refer to the + section <<_pebs_via_intel_pt,PEBS via Intel PT>> + +*aux-action*:: + Used to pause or resume tracing, refer to the section + <<_pause_or_resume_tracing,Pause or Resume Tracing>> AUX area sampling option ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -515,7 +649,8 @@ The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap s nor snapshot size is specified, then the default is 4MiB for privileged users (or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. 
If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced as described in the 'new auxtrace mmap size option' section below. +reduced as described in the <<_new_auxtrace_mmap_size_option,new auxtrace mmap size option>> +section below. The snapshot size is displayed if the option -vv is used e.g. @@ -558,7 +693,7 @@ The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g Intel PT modes of operation ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Intel PT can be used in 2 modes: +Intel PT can be used in 3 modes: full-trace mode sample mode snapshot mode @@ -571,7 +706,8 @@ Sample mode attaches a Intel PT sample to other events e.g. perf record --aux-sample -e intel_pt//u -e branch-misses:u -Snapshot mode captures the available data when a signal is sent e.g. +Snapshot mode captures the available data when a signal is sent or "snapshot" +control command is issued. e.g. using a signal perf record -v -e intel_pt//u -S ./loopy 1000000000 & [1] 11435 @@ -582,14 +718,30 @@ Note that the signal sent is SIGUSR2. Note that "Recording AUX area tracing snapshot" is displayed because the -v option is used. -The 2 modes cannot be used together. +The advantage of using "snapshot" control command is that the access is +controlled by access to a FIFO e.g. + + $ mkfifo perf.control + $ mkfifo perf.ack + $ cat perf.ack & + [1] 15235 + $ sudo ~/bin/perf record --control fifo:perf.control,perf.ack -S -e intel_pt//u -- sleep 60 & + [2] 15243 + $ ps -e | grep perf + 15244 pts/1 00:00:00 perf + $ kill -USR2 15244 + bash: kill: (15244) - Operation not permitted + $ echo snapshot > perf.control + ack + +The 3 Intel PT modes of operation cannot be used together. Buffer handling ~~~~~~~~~~~~~~~ There may be buffer limitations (i.e. single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +buffer sizes are limited to powers of 2 up to 4MiB (MAX_PAGE_ORDER). In order to provide other sizes, and in particular an arbitrarily large size, multiple buffers are logically concatenated. However an interrupt must be used to switch between buffers. That has two potential problems: @@ -807,17 +959,20 @@ Having no option is the same as which, in turn, is the same as - --itrace=cepwx + --itrace=cepwxy The letters are: i synthesize "instructions" events + y synthesize "cycles" events b synthesize "branches" events x synthesize "transactions" events w synthesize "ptwrite" events - p synthesize "power" events + p synthesize "power" events (incl. PSB events) c synthesize branches events (calls only) r synthesize branches events (returns only) + o synthesize PEBS-via-PT events + I synthesize Event Trace events e synthesize tracing error events d create a debug log g synthesize a call chain (use with i or x) @@ -825,24 +980,37 @@ The letters are: l synthesize last branch entries (use with i or x) L synthesize last branch entries on existing event records s skip initial number of events + q quicker (less detailed) decoding + A approximate IPC + Z prefer to ignore timestamps (so-called "timeless" decoding) "Instructions" events look like they were recorded by "perf record -e instructions". +"Cycles" events look like they were recorded by "perf record -e cycles" +(ie., the default). 
Note that even with CYC packets enabled and no sampling, +these are not fully accurate, since CYC packets are not emitted for each +instruction, only when some other event (like an indirect branch, or a +TNT packet representing multiple branches) causes a packet to +be emitted. Thus, it is more effective for attributing cycles to functions +(and possibly basic blocks) than to individual instructions, although it +is not perfect even for functions (it becomes better if the noretcomp +option is active). + "Branches" events look like they were recorded by "perf record -e branches". "c" and "r" can be combined to get calls and returns. "Transactions" events correspond to the start or end of transactions. The 'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. +transaction start, commit or abort. -Note that "instructions", "branches" and "transactions" events depend on code -flow packets which can be disabled by using the config term "branch=0". Refer -to the config terms section above. +Note that "instructions", "cycles", "branches" and "transactions" events +depend on code flow packets which can be disabled by using the config term +"branch=0". Refer to the <<_config_terms,config terms>> section above. "ptwrite" events record the payload of the ptwrite instruction and whether "fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are -recorded only if the "ptw" config term was used. Refer to the <<_config_terms,config terms>> section above. perf script "synth" field displays "ptwrite" information like this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was used. @@ -850,15 +1018,18 @@ used. "Power" events correspond to power event packets and CBR (core-to-bus ratio) packets. While CBR packets are always recorded when tracing is enabled, power event packets are recorded only if the "pwr_evt" config term was used. Refer to -the config terms section above. The power events record information about +the <<_config_terms,config terms>> section above. The power events record information about C-state changes, whereas CBR is indicative of CPU frequency. perf script "event,synth" fields display information like this: + cbr: cbr: 22 freq: 2189 MHz (200%) mwait: hints: 0x60 extensions: 0x1 pwre: hw: 0 cstate: 2 sub-cstate: 0 exstop: ip: 1 pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 + Where: + "cbr" includes the frequency and the percentage of maximum non-turbo "mwait" shows mwait hints and extensions "pwre" shows C-state transitions (to a C-state deeper than C0) and @@ -866,16 +1037,43 @@ Where: "exstop" indicates execution stopped and whether the IP was recorded exactly, "pwrx" indicates return to C0 + For more details refer to the Intel 64 and IA-32 Architectures Software Developer Manuals. +PSB events show when a PSB+ occurred and also the byte-offset in the trace. +Emitting a PSB+ can cause the CPU a slight delay. When doing timing analysis +of code with Intel PT, it is useful to know if a timing bubble was caused +by Intel PT or not. + Error events show where the decoder lost the trace. Error events are quite important. Users must know if what they are seeing is a complete -picture or not. +picture or not. The "e" option may be followed by flags which affect what errors +will or will not be reported. Each flag must be preceded by either '+' or '-'.
+The flags supported by Intel PT are: + + -o Suppress overflow errors + -l Suppress trace data lost errors + +For example, for errors but not overflow or data lost errors: + + --itrace=e-o-l The "d" option will cause the creation of a file "intel_pt.log" containing all decoded packets and instructions. Note that this option slows down the decoder -and that the resulting file may be very large. +and that the resulting file may be very large. The "d" option may be followed +by flags which affect what debug messages will or will not be logged. Each flag +must be preceded by either '+' or '-'. The flags supported by Intel PT are: + + -a Suppress logging of perf events + +a Log all perf events + +e Output only on decoding errors (size configurable) + +o Output to stdout instead of "intel_pt.log" + +By default, logged perf events are filtered by any specified time ranges, but +flag +a overrides that. The +e flag can be useful for analyzing errors. By +default, the log size in that case is 16384 bytes, but can be altered by +linkperf:perf-config[1] e.g. perf config itrace.debug-log-buffer-size=30000 In addition, the period of the "instructions" event can be specified. e.g. @@ -956,6 +1154,70 @@ at the beginning. This is useful to ignore initialization code. skips the first million instructions. +The q option changes the way the trace is decoded. The decoding is much faster +but much less detailed. Specifically, with the q option, the decoder does not +decode TNT packets, and does not walk object code, but gets the ip from FUP and +TIP packets. The q option can be used with the b and i options but the period +is not used. The q option decodes more quickly, but is useful only if the +control flow of interest is represented or indicated by FUP, TIP, TIP.PGE, or +TIP.PGD packets (refer below). However, the q option could be used to find time +ranges that could then be decoded fully using the --time option. + +What will *not* be decoded with the (single) q option: + + - direct calls and jmps + - conditional branches + - non-branch instructions + +What *will* be decoded with the (single) q option: + + - asynchronous branches such as interrupts + - indirect branches + - function return target address *if* the noretcomp config term (refer + <<_config_terms,config terms>> section) was used + - start of (control-flow) tracing + - end of (control-flow) tracing, if it is not out of context + - power events, ptwrite, transaction start and abort + - instruction pointer associated with PSB packets + +Note the q option does not specify what events will be synthesized e.g. the p +option must also be used to show power events. + +Repeating the q option (double-q i.e. qq) results in even faster decoding and even +less detail. The decoder decodes only extended PSB (PSB+) packets, getting the +instruction pointer if there is a FUP packet within PSB+ (i.e. between PSB and +PSBEND). Note PSB packets occur regularly in the trace based on the psb_period +config term (refer <<_config_terms,config terms>> section). There will be a FUP packet if the +PSB+ occurs while control flow is being traced. + +What will *not* be decoded with the qq option: + + - everything except instruction pointer associated with PSB packets + +What *will* be decoded with the qq option: + + - instruction pointer associated with PSB packets + +The Z option is equivalent to having recorded a trace without TSC +(i.e. config term tsc=0). It can be useful to avoid timestamp issues when +decoding a trace of a virtual machine.
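+
+As noted above, one use of the q option is to decode quickly in order to find
+a time range of interest, and then decode that range in full. For example (the
+time range below is illustrative):
+
+	perf script --itrace=qb
+	perf script --itrace=ibe --time 1234.567,1234.789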
+ + +dlfilter-show-cycles.so +~~~~~~~~~~~~~~~~~~~~~~~ + +Cycles can be displayed using dlfilter-show-cycles.so in which case the itrace A +option can be useful to provide higher granularity cycle information: + + perf script --itrace=A --call-trace --dlfilter dlfilter-show-cycles.so + +To see a list of dlfilters: + + perf script -v --list-dlfilters + +See also linkperf:perf-dlfilters[1] + + dump option ~~~~~~~~~~~ @@ -1028,12 +1290,746 @@ Recording is selected by using the aux-output config term e.g. perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname -Note that currently, software only supports redirecting at most one PEBS event. +Originally, software only supported redirecting at most one PEBS event because it +was not able to differentiate one event from another. To overcome that, more recent +kernels and perf tools add support for the PERF_RECORD_AUX_OUTPUT_HW_ID side-band event. +To check for the presence of that event in a PEBS-via-PT trace: + + perf script -D --no-itrace | grep PERF_RECORD_AUX_OUTPUT_HW_ID To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. perf script --itrace=oe +XED +--- + +include::build-xed.txt[] + + +Tracing Virtual Machines (kernel only) +-------------------------------------- + +Currently, kernel tracing is supported with either "timeless" decoding +(i.e. no TSC timestamps) or VM Time Correlation. VM Time Correlation is an extra step +using 'perf inject' and requires unchanging VMX TSC Offset and no VMX TSC Scaling. + +Other limitations and caveats + + VMX controls may suppress packets needed for decoding resulting in decoding errors + VMX controls may block the perf NMI to the host potentially resulting in lost trace data + Guest kernel self-modifying code (e.g. jump labels or JIT-compiled eBPF) will result in decoding errors + Guest thread information is unknown + Guest VCPU is unknown but may be able to be inferred from the host thread + Callchains are not supported + +Example using "timeless" decoding + +Start VM + + $ sudo virsh start kubuntu20.04 + Domain kubuntu20.04 started + +Mount the guest file system. Note sshfs needs -o direct_io to enable reading of proc files. root access is needed to read /proc/kcore. + + $ mkdir vm0 + $ sshfs -o direct_io root@vm0:/ vm0 + +Copy the guest /proc/kallsyms, /proc/modules and /proc/kcore + + $ perf buildid-cache -v --kcore vm0/proc/kcore + kcore added to build-id cache directory /home/user/.debug/[kernel.kcore]/9600f316a53a0f54278885e8d9710538ec5f6a08/2021021807494306 + $ KALLSYMS=/home/user/.debug/[kernel.kcore]/9600f316a53a0f54278885e8d9710538ec5f6a08/2021021807494306/kallsyms + +Find the VM process + + $ ps -eLl | grep 'KVM\|PID' + F S UID PID PPID LWP C PRI NI ADDR SZ WCHAN TTY TIME CMD + 3 S 64055 1430 1 1440 1 80 0 - 1921718 - ? 00:02:47 CPU 0/KVM + 3 S 64055 1430 1 1441 1 80 0 - 1921718 - ? 00:02:41 CPU 1/KVM + 3 S 64055 1430 1 1442 1 80 0 - 1921718 - ? 00:02:38 CPU 2/KVM + 3 S 64055 1430 1 1443 2 80 0 - 1921718 - ? 00:03:18 CPU 3/KVM + +Start an open-ended perf record, tracing the VM process, do something on the VM, and then ctrl-C to stop. +TSC is not supported and tsc=0 must be specified. That means mtc is useless, so add mtc=0. +However, IPC can still be determined, hence cyc=1 can be added. +Only kernel decoding is supported, so 'k' must be specified. +Intel PT traces both the host and the guest so --guest and --host need to be specified. +Without timestamps, --per-thread must be specified to distinguish threads. 
+ + $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/tsc=0,mtc=0,cyc=1/k -p 1430 --per-thread + ^C + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 5.829 MB ] + +perf script can be used to provide an instruction trace + + $ perf script --guestkallsyms $KALLSYMS --insn-trace=disasm -F+ipc | grep -C10 vmresume | head -21 + CPU 0/KVM 1440 ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms]) movq 0x48(%rax), %r9 + CPU 0/KVM 1440 ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms]) movq 0x50(%rax), %r10 + CPU 0/KVM 1440 ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms]) movq 0x58(%rax), %r11 + CPU 0/KVM 1440 ffffffff82133ce9 __vmx_vcpu_run+0x49 ([kernel.kallsyms]) movq 0x60(%rax), %r12 + CPU 0/KVM 1440 ffffffff82133ced __vmx_vcpu_run+0x4d ([kernel.kallsyms]) movq 0x68(%rax), %r13 + CPU 0/KVM 1440 ffffffff82133cf1 __vmx_vcpu_run+0x51 ([kernel.kallsyms]) movq 0x70(%rax), %r14 + CPU 0/KVM 1440 ffffffff82133cf5 __vmx_vcpu_run+0x55 ([kernel.kallsyms]) movq 0x78(%rax), %r15 + CPU 0/KVM 1440 ffffffff82133cf9 __vmx_vcpu_run+0x59 ([kernel.kallsyms]) movq (%rax), %rax + CPU 0/KVM 1440 ffffffff82133cfc __vmx_vcpu_run+0x5c ([kernel.kallsyms]) callq 0xffffffff82133c40 + CPU 0/KVM 1440 ffffffff82133c40 vmx_vmenter+0x0 ([kernel.kallsyms]) jz 0xffffffff82133c46 + CPU 0/KVM 1440 ffffffff82133c42 vmx_vmenter+0x2 ([kernel.kallsyms]) vmresume IPC: 0.11 (50/445) + :1440 1440 ffffffffbb678b06 native_write_msr+0x6 ([guest.kernel.kallsyms]) nopl %eax, (%rax,%rax,1) + :1440 1440 ffffffffbb678b0b native_write_msr+0xb ([guest.kernel.kallsyms]) retq IPC: 0.04 (2/41) + :1440 1440 ffffffffbb666646 lapic_next_deadline+0x26 ([guest.kernel.kallsyms]) data16 nop + :1440 1440 ffffffffbb666648 lapic_next_deadline+0x28 ([guest.kernel.kallsyms]) xor %eax, %eax + :1440 1440 ffffffffbb66664a lapic_next_deadline+0x2a ([guest.kernel.kallsyms]) popq %rbp + :1440 1440 ffffffffbb66664b lapic_next_deadline+0x2b ([guest.kernel.kallsyms]) retq IPC: 0.16 (4/25) + :1440 1440 ffffffffbb74607f clockevents_program_event+0x8f ([guest.kernel.kallsyms]) test %eax, %eax + :1440 1440 ffffffffbb746081 clockevents_program_event+0x91 ([guest.kernel.kallsyms]) jz 0xffffffffbb74603c IPC: 0.06 (2/30) + :1440 1440 ffffffffbb74603c clockevents_program_event+0x4c ([guest.kernel.kallsyms]) popq %rbx + :1440 1440 ffffffffbb74603d clockevents_program_event+0x4d ([guest.kernel.kallsyms]) popq %r12 + +Example using VM Time Correlation + +Start VM + + $ sudo virsh start kubuntu20.04 + Domain kubuntu20.04 started + +Mount the guest file system. Note sshfs needs -o direct_io to enable reading of proc files. root access is needed to read /proc/kcore. + + $ mkdir -p vm0 + $ sshfs -o direct_io root@vm0:/ vm0 + +Copy the guest /proc/kallsyms, /proc/modules and /proc/kcore + + $ perf buildid-cache -v --kcore vm0/proc/kcore + same kcore found in /home/user/.debug/[kernel.kcore]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777 + $ KALLSYMS=/home/user/.debug/\[kernel.kcore\]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777/kallsyms + +Find the VM process + + $ ps -eLl | grep 'KVM\|PID' + F S UID PID PPID LWP C PRI NI ADDR SZ WCHAN TTY TIME CMD + 3 S 64055 16998 1 17005 13 80 0 - 1818189 - ? 00:00:16 CPU 0/KVM + 3 S 64055 16998 1 17006 4 80 0 - 1818189 - ? 00:00:05 CPU 1/KVM + 3 S 64055 16998 1 17007 3 80 0 - 1818189 - ? 00:00:04 CPU 2/KVM + 3 S 64055 16998 1 17008 4 80 0 - 1818189 - ? 
00:00:05 CPU 3/KVM + +Start an open-ended perf record, tracing the VM process, do something on the VM, and then ctrl-C to stop. +IPC can be determined, hence cyc=1 can be added. +Only kernel decoding is supported, so 'k' must be specified. +Intel PT traces both the host and the guest so --guest and --host need to be specified. + + $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/cyc=1/k -p 16998 + ^C[ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 9.041 MB perf.data.kvm ] + +Now 'perf inject' can be used to determine the VMX TSC Offset. Note, Intel PT TSC packets are +only 7 bytes, so the TSC Offset might differ from the actual value in the 8th byte. That will +have no effect i.e. the resulting timestamps will be correct anyway. + + $ perf inject -i perf.data.kvm --vm-time-correlation=dry-run + ERROR: Unknown TSC Offset for VMCS 0x1bff6a + VMCS: 0x1bff6a TSC Offset 0xffffe42722c64c41 + ERROR: Unknown TSC Offset for VMCS 0x1cbc08 + VMCS: 0x1cbc08 TSC Offset 0xffffe42722c64c41 + ERROR: Unknown TSC Offset for VMCS 0x1c3ce8 + VMCS: 0x1c3ce8 TSC Offset 0xffffe42722c64c41 + ERROR: Unknown TSC Offset for VMCS 0x1cbce9 + VMCS: 0x1cbce9 TSC Offset 0xffffe42722c64c41 + +Each virtual CPU has a different Virtual Machine Control Structure (VMCS) +shown above with the calculated TSC Offset. For an unchanging TSC Offset +they should all be the same for the same virtual machine. + +Now that the TSC Offset is known, it can be provided to 'perf inject': + + $ perf inject -i perf.data.kvm --vm-time-correlation="dry-run 0xffffe42722c64c41" + +Note the options for 'perf inject' --vm-time-correlation are: + + [ dry-run ] [ <TSC Offset> [ : <VMCS> [ , <VMCS> ]... ] ]... + +So it is possible to specify different TSC Offsets for different VMCS. +The option "dry-run" will cause the file to be processed but without updating it. +Note it is also possible to get an intel_pt.log file by adding option --itrace=d + +There were no errors, so do it for real: + + $ perf inject -i perf.data.kvm --vm-time-correlation=0xffffe42722c64c41 --force + +'perf script' can be used to see if there are any decoder errors: + + $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --itrace=e-o + +There were none.
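+
+Had there been errors, a decoder debug log can help to investigate them. As
+noted above, adding the option --itrace=d produces an intel_pt.log file, e.g.
+using the same TSC Offset as before:
+
+	$ perf inject -i perf.data.kvm --vm-time-correlation="dry-run 0xffffe42722c64c41" --itrace=d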
+ +'perf script' can be used to provide an instruction trace showing timestamps + + $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --insn-trace=disasm -F+ipc | grep -C10 vmresume | head -21 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms]) movq 0x48(%rax), %r9 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms]) movq 0x50(%rax), %r10 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms]) movq 0x58(%rax), %r11 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce9 __vmx_vcpu_run+0x49 ([kernel.kallsyms]) movq 0x60(%rax), %r12 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ced __vmx_vcpu_run+0x4d ([kernel.kallsyms]) movq 0x68(%rax), %r13 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf1 __vmx_vcpu_run+0x51 ([kernel.kallsyms]) movq 0x70(%rax), %r14 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf5 __vmx_vcpu_run+0x55 ([kernel.kallsyms]) movq 0x78(%rax), %r15 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf9 __vmx_vcpu_run+0x59 ([kernel.kallsyms]) movq (%rax), %rax + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cfc __vmx_vcpu_run+0x5c ([kernel.kallsyms]) callq 0xffffffff82133c40 + CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133c40 vmx_vmenter+0x0 ([kernel.kallsyms]) jz 0xffffffff82133c46 + CPU 1/KVM 17006 [001] 11500.262866075: ffffffff82133c42 vmx_vmenter+0x2 ([kernel.kallsyms]) vmresume IPC: 0.05 (40/769) + :17006 17006 [001] 11500.262869216: ffffffff82200cb0 asm_sysvec_apic_timer_interrupt+0x0 ([guest.kernel.kallsyms]) clac + :17006 17006 [001] 11500.262869216: ffffffff82200cb3 asm_sysvec_apic_timer_interrupt+0x3 ([guest.kernel.kallsyms]) pushq $0xffffffffffffffff + :17006 17006 [001] 11500.262869216: ffffffff82200cb5 asm_sysvec_apic_timer_interrupt+0x5 ([guest.kernel.kallsyms]) callq 0xffffffff82201160 + :17006 17006 [001] 11500.262869216: ffffffff82201160 error_entry+0x0 ([guest.kernel.kallsyms]) cld + :17006 17006 [001] 11500.262869216: ffffffff82201161 error_entry+0x1 ([guest.kernel.kallsyms]) pushq %rsi + :17006 17006 [001] 11500.262869216: ffffffff82201162 error_entry+0x2 ([guest.kernel.kallsyms]) movq 0x8(%rsp), %rsi + :17006 17006 [001] 11500.262869216: ffffffff82201167 error_entry+0x7 ([guest.kernel.kallsyms]) movq %rdi, 0x8(%rsp) + :17006 17006 [001] 11500.262869216: ffffffff8220116c error_entry+0xc ([guest.kernel.kallsyms]) pushq %rdx + :17006 17006 [001] 11500.262869216: ffffffff8220116d error_entry+0xd ([guest.kernel.kallsyms]) pushq %rcx + :17006 17006 [001] 11500.262869216: ffffffff8220116e error_entry+0xe ([guest.kernel.kallsyms]) pushq %rax + + +Tracing Virtual Machines (including user space) +----------------------------------------------- + +It is possible to use perf record to record sideband events within a virtual machine, so that an Intel PT trace on the host can be decoded. +Sideband events from the guest perf.data file can be injected into the host perf.data file using perf inject. + +Here is an example of the steps needed: + +On the guest machine: + +Check that no-kvmclock kernel command line option was used to boot: + +Note, this is essential to enable time correlation between host and guest machines. 
+ + $ cat /proc/cmdline + BOOT_IMAGE=/boot/vmlinuz-5.10.0-16-amd64 root=UUID=cb49c910-e573-47e0-bce7-79e293df8e1d ro no-kvmclock + +There is no BPF support at present so, if possible, disable JIT compiling: + + $ echo 0 | sudo tee /proc/sys/net/core/bpf_jit_enable + 0 + +Start perf record to collect sideband events: + + $ sudo perf record -o guest-sideband-testing-guest-perf.data --sample-identifier --buildid-all --switch-events --kcore -a -e dummy + +On the host machine: + +Start perf record to collect Intel PT trace: + +Note, the host trace will get very big, very fast, so the steps from starting to stopping the host trace really need to be done so that they happen in the shortest time possible. + + $ sudo perf record -o guest-sideband-testing-host-perf.data -m,64M --kcore -a -e intel_pt/cyc/ + +On the guest machine: + +Run a small test case, just 'uname' in this example: + + $ uname + Linux + +On the host machine: + +Stop the Intel PT trace: + + ^C + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 76.122 MB guest-sideband-testing-host-perf.data ] + +On the guest machine: + +Stop the Intel PT trace: + + ^C + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 1.247 MB guest-sideband-testing-guest-perf.data ] + +And then copy guest-sideband-testing-guest-perf.data to the host (not shown here). + +On the host machine: + +With the 2 perf.data recordings, and with their ownership changed to the user. + +Identify the TSC Offset: + + $ perf inject -i guest-sideband-testing-host-perf.data --vm-time-correlation=dry-run + VMCS: 0x103fc6 TSC Offset 0xfffffa6ae070cb20 + VMCS: 0x103ff2 TSC Offset 0xfffffa6ae070cb20 + VMCS: 0x10fdaa TSC Offset 0xfffffa6ae070cb20 + VMCS: 0x24d57c TSC Offset 0xfffffa6ae070cb20 + +Correct Intel PT TSC timestamps for the guest machine: + + $ perf inject -i guest-sideband-testing-host-perf.data --vm-time-correlation=0xfffffa6ae070cb20 --force + +Identify the guest machine PID: + + $ perf script -i guest-sideband-testing-host-perf.data --no-itrace --show-task-events | grep KVM + CPU 0/KVM 0 [000] 0.000000: PERF_RECORD_COMM: CPU 0/KVM:13376/13381 + CPU 1/KVM 0 [000] 0.000000: PERF_RECORD_COMM: CPU 1/KVM:13376/13382 + CPU 2/KVM 0 [000] 0.000000: PERF_RECORD_COMM: CPU 2/KVM:13376/13383 + CPU 3/KVM 0 [000] 0.000000: PERF_RECORD_COMM: CPU 3/KVM:13376/13384 + +Note, the QEMU option -name debug-threads=on is needed so that thread names +can be used to determine which thread is running which VCPU as above. libvirt seems to use this by default. + +Create a guestmount, assuming the guest machine is 'vm_to_test': + + $ mkdir -p ~/guestmount/13376 + $ sshfs -o direct_io vm_to_test:/ ~/guestmount/13376 + +Inject the guest perf.data file into the host perf.data file: + +Note, due to the guestmount option, guest object files and debug files will be copied into the build ID cache from the guest machine, with the notable exception of VDSO. +If needed, VDSO can be copied manually in a fashion similar to that used by the perf-archive script. + + $ perf inject -i guest-sideband-testing-host-perf.data -o inj --guestmount ~/guestmount --guest-data=guest-sideband-testing-guest-perf.data,13376,0xfffffa6ae070cb20 + +Show an excerpt from the result. 
In this case the CPU and time range have been to chosen to show interaction between guest and host when 'uname' is starting to run on the guest machine: + +Notes: + + - the CPU displayed, [002] in this case, is always the host CPU + - events happening in the virtual machine start with VM:13376 VCPU:003, which shows the hypervisor PID 13376 and the VCPU number + - only calls and errors are displayed i.e. --itrace=ce + - branches entering and exiting the virtual machine are split, and show as 2 branches to/from "0 [unknown] ([unknown])" + + $ perf script -i inj --itrace=ce -F+machine_pid,+vcpu,+addr,+pid,+tid,-period --ns --time 7919.408803365,7919.408804631 -C 2 + CPU 3/KVM 13376/13384 [002] 7919.408803365: branches: ffffffffc0f8ebe0 vmx_vcpu_enter_exit+0xc0 ([kernel.kallsyms]) => ffffffffc0f8edc0 __vmx_vcpu_run+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803365: branches: ffffffffc0f8edd5 __vmx_vcpu_run+0x15 ([kernel.kallsyms]) => ffffffffc0f8eca0 vmx_update_host_rsp+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803365: branches: ffffffffc0f8ee1b __vmx_vcpu_run+0x5b ([kernel.kallsyms]) => ffffffffc0f8ed60 vmx_vmenter+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803461: branches: ffffffffc0f8ed62 vmx_vmenter+0x2 ([kernel.kallsyms]) => 0 [unknown] ([unknown]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408803461: branches: 0 [unknown] ([unknown]) => 7f851c9b5a5c init_cacheinfo+0x3ac (/usr/lib/x86_64-linux-gnu/libc-2.31.so) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408803567: branches: 7f851c9b5a5a init_cacheinfo+0x3aa (/usr/lib/x86_64-linux-gnu/libc-2.31.so) => 0 [unknown] ([unknown]) + CPU 3/KVM 13376/13384 [002] 7919.408803567: branches: 0 [unknown] ([unknown]) => ffffffffc0f8ed80 vmx_vmexit+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803596: branches: ffffffffc0f6619a vmx_vcpu_run+0x26a ([kernel.kallsyms]) => ffffffffb2255c60 x86_virt_spec_ctrl+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803801: branches: ffffffffc0f66445 vmx_vcpu_run+0x515 ([kernel.kallsyms]) => ffffffffb2290b30 native_write_msr+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803850: branches: ffffffffc0f661f8 vmx_vcpu_run+0x2c8 ([kernel.kallsyms]) => ffffffffc1092300 kvm_load_host_xsave_state+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803850: branches: ffffffffc1092327 kvm_load_host_xsave_state+0x27 ([kernel.kallsyms]) => ffffffffc1092220 kvm_load_host_xsave_state.part.0+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803862: branches: ffffffffc0f662cf vmx_vcpu_run+0x39f ([kernel.kallsyms]) => ffffffffc0f63f90 vmx_recover_nmi_blocking+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803862: branches: ffffffffc0f662e9 vmx_vcpu_run+0x3b9 ([kernel.kallsyms]) => ffffffffc0f619a0 __vmx_complete_interrupts+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803872: branches: ffffffffc109cfb2 vcpu_enter_guest+0x752 ([kernel.kallsyms]) => ffffffffc0f5f570 vmx_handle_exit_irqoff+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803881: branches: ffffffffc109d028 vcpu_enter_guest+0x7c8 ([kernel.kallsyms]) => ffffffffb234f900 __srcu_read_lock+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803897: branches: ffffffffc109d06f vcpu_enter_guest+0x80f ([kernel.kallsyms]) => ffffffffc0f72e30 vmx_handle_exit+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803897: branches: ffffffffc0f72e3d vmx_handle_exit+0xd ([kernel.kallsyms]) => 
ffffffffc0f727c0 __vmx_handle_exit+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803897: branches: ffffffffc0f72b15 __vmx_handle_exit+0x355 ([kernel.kallsyms]) => ffffffffc0f60ae0 vmx_flush_pml_buffer+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803903: branches: ffffffffc0f72994 __vmx_handle_exit+0x1d4 ([kernel.kallsyms]) => ffffffffc10b7090 kvm_emulate_cpuid+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803903: branches: ffffffffc10b70f1 kvm_emulate_cpuid+0x61 ([kernel.kallsyms]) => ffffffffc10b6e10 kvm_cpuid+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803941: branches: ffffffffc10b7125 kvm_emulate_cpuid+0x95 ([kernel.kallsyms]) => ffffffffc1093110 kvm_skip_emulated_instruction+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803941: branches: ffffffffc109311f kvm_skip_emulated_instruction+0xf ([kernel.kallsyms]) => ffffffffc0f5e180 vmx_get_rflags+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803951: branches: ffffffffc109312a kvm_skip_emulated_instruction+0x1a ([kernel.kallsyms]) => ffffffffc0f5fd30 vmx_skip_emulated_instruction+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803951: branches: ffffffffc0f5fd79 vmx_skip_emulated_instruction+0x49 ([kernel.kallsyms]) => ffffffffc0f5fb50 skip_emulated_instruction+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803956: branches: ffffffffc0f5fc68 skip_emulated_instruction+0x118 ([kernel.kallsyms]) => ffffffffc0f6a940 vmx_cache_reg+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803964: branches: ffffffffc0f5fc11 skip_emulated_instruction+0xc1 ([kernel.kallsyms]) => ffffffffc0f5f9e0 vmx_set_interrupt_shadow+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803980: branches: ffffffffc109f8b1 vcpu_run+0x71 ([kernel.kallsyms]) => ffffffffc10ad2f0 kvm_cpu_has_pending_timer+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803980: branches: ffffffffc10ad2fb kvm_cpu_has_pending_timer+0xb ([kernel.kallsyms]) => ffffffffc10b0490 apic_has_pending_timer+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803991: branches: ffffffffc109f899 vcpu_run+0x59 ([kernel.kallsyms]) => ffffffffc109c860 vcpu_enter_guest+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803993: branches: ffffffffc109cd4c vcpu_enter_guest+0x4ec ([kernel.kallsyms]) => ffffffffc0f69140 vmx_prepare_switch_to_guest+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803996: branches: ffffffffc109cd7d vcpu_enter_guest+0x51d ([kernel.kallsyms]) => ffffffffb234f930 __srcu_read_unlock+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803996: branches: ffffffffc109cd9c vcpu_enter_guest+0x53c ([kernel.kallsyms]) => ffffffffc0f609b0 vmx_sync_pir_to_irr+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408803996: branches: ffffffffc0f60a6d vmx_sync_pir_to_irr+0xbd ([kernel.kallsyms]) => ffffffffc10adc20 kvm_lapic_find_highest_irr+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804010: branches: ffffffffc0f60abd vmx_sync_pir_to_irr+0x10d ([kernel.kallsyms]) => ffffffffc0f60820 vmx_set_rvi+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804019: branches: ffffffffc109ceca vcpu_enter_guest+0x66a ([kernel.kallsyms]) => ffffffffb2249840 fpregs_assert_state_consistent+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804021: branches: ffffffffc109cf10 vcpu_enter_guest+0x6b0 ([kernel.kallsyms]) => ffffffffc0f65f30 vmx_vcpu_run+0x0 ([kernel.kallsyms]) + 
CPU 3/KVM 13376/13384 [002] 7919.408804024: branches: ffffffffc0f6603b vmx_vcpu_run+0x10b ([kernel.kallsyms]) => ffffffffb229bed0 __get_current_cr3_fast+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804024: branches: ffffffffc0f66055 vmx_vcpu_run+0x125 ([kernel.kallsyms]) => ffffffffb2253050 cr4_read_shadow+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804030: branches: ffffffffc0f6608d vmx_vcpu_run+0x15d ([kernel.kallsyms]) => ffffffffc10921e0 kvm_load_guest_xsave_state+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804030: branches: ffffffffc1092207 kvm_load_guest_xsave_state+0x27 ([kernel.kallsyms]) => ffffffffc1092110 kvm_load_guest_xsave_state.part.0+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804032: branches: ffffffffc0f660c6 vmx_vcpu_run+0x196 ([kernel.kallsyms]) => ffffffffb22061a0 perf_guest_get_msrs+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804032: branches: ffffffffb22061a9 perf_guest_get_msrs+0x9 ([kernel.kallsyms]) => ffffffffb220cda0 intel_guest_get_msrs+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804039: branches: ffffffffc0f66109 vmx_vcpu_run+0x1d9 ([kernel.kallsyms]) => ffffffffc0f652c0 clear_atomic_switch_msr+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804040: branches: ffffffffc0f66119 vmx_vcpu_run+0x1e9 ([kernel.kallsyms]) => ffffffffc0f73f60 intel_pmu_lbr_is_enabled+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804042: branches: ffffffffc0f73f81 intel_pmu_lbr_is_enabled+0x21 ([kernel.kallsyms]) => ffffffffc10b68e0 kvm_find_cpuid_entry+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804045: branches: ffffffffc0f66454 vmx_vcpu_run+0x524 ([kernel.kallsyms]) => ffffffffc0f61ff0 vmx_update_hv_timer+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804057: branches: ffffffffc0f66142 vmx_vcpu_run+0x212 ([kernel.kallsyms]) => ffffffffc10af100 kvm_wait_lapic_expire+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804057: branches: ffffffffc0f66156 vmx_vcpu_run+0x226 ([kernel.kallsyms]) => ffffffffb2255c60 x86_virt_spec_ctrl+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804057: branches: ffffffffc0f66161 vmx_vcpu_run+0x231 ([kernel.kallsyms]) => ffffffffc0f8eb20 vmx_vcpu_enter_exit+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804057: branches: ffffffffc0f8eb44 vmx_vcpu_enter_exit+0x24 ([kernel.kallsyms]) => ffffffffb2353e10 rcu_note_context_switch+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804057: branches: ffffffffb2353e1c rcu_note_context_switch+0xc ([kernel.kallsyms]) => ffffffffb2353db0 rcu_qs+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804066: branches: ffffffffc0f8ebe0 vmx_vcpu_enter_exit+0xc0 ([kernel.kallsyms]) => ffffffffc0f8edc0 __vmx_vcpu_run+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804066: branches: ffffffffc0f8edd5 __vmx_vcpu_run+0x15 ([kernel.kallsyms]) => ffffffffc0f8eca0 vmx_update_host_rsp+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804066: branches: ffffffffc0f8ee1b __vmx_vcpu_run+0x5b ([kernel.kallsyms]) => ffffffffc0f8ed60 vmx_vmenter+0x0 ([kernel.kallsyms]) + CPU 3/KVM 13376/13384 [002] 7919.408804162: branches: ffffffffc0f8ed62 vmx_vmenter+0x2 ([kernel.kallsyms]) => 0 [unknown] ([unknown]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804162: branches: 0 [unknown] ([unknown]) => 7f851c9b5a5c init_cacheinfo+0x3ac (/usr/lib/x86_64-linux-gnu/libc-2.31.so) + VM:13376 VCPU:003 
uname 3404/3404 [002] 7919.408804273: branches: 7f851cb7c0e4 _dl_init+0x74 (/usr/lib/x86_64-linux-gnu/ld-2.31.so) => 7f851cb7bf50 call_init.part.0+0x0 (/usr/lib/x86_64-linux-gnu/ld-2.31.so) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804526: branches: 55e0c00136f0 _start+0x0 (/usr/bin/uname) => ffffffff83200ac0 asm_exc_page_fault+0x0 ([kernel.kallsyms]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804526: branches: ffffffff83200ac3 asm_exc_page_fault+0x3 ([kernel.kallsyms]) => ffffffff83201290 error_entry+0x0 ([kernel.kallsyms]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804534: branches: ffffffff832012fa error_entry+0x6a ([kernel.kallsyms]) => ffffffff830b59a0 sync_regs+0x0 ([kernel.kallsyms]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804631: branches: ffffffff83200ad9 asm_exc_page_fault+0x19 ([kernel.kallsyms]) => ffffffff830b8210 exc_page_fault+0x0 ([kernel.kallsyms]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804631: branches: ffffffff830b82a4 exc_page_fault+0x94 ([kernel.kallsyms]) => ffffffff830b80e0 __kvm_handle_async_pf+0x0 ([kernel.kallsyms]) + VM:13376 VCPU:003 uname 3404/3404 [002] 7919.408804631: branches: ffffffff830b80ed __kvm_handle_async_pf+0xd ([kernel.kallsyms]) => ffffffff830b80c0 kvm_read_and_reset_apf_flags+0x0 ([kernel.kallsyms]) + + +Tracing Virtual Machines - Guest Code +------------------------------------- + +A common case for KVM test programs is that the test program acts as the +hypervisor, creating, running and destroying the virtual machine, and +providing the guest object code from its own object code. In this case, +the VM is not running an OS, but only the functions loaded into it by the +hypervisor test program, and conveniently, loaded at the same virtual +addresses. To support that, option "--guest-code" has been added to perf script +and perf kvm report. 
+ +Here is an example tracing a test program from the kernel's KVM selftests: + + # perf record --kcore -e intel_pt/cyc/ -- tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.280 MB perf.data ] + # perf script --guest-code --itrace=bep --ns -F-period,+addr,+flags + [SNIP] + tsc_msrs_test 18436 [007] 10897.962087733: branches: call ffffffffc13b2ff5 __vmx_vcpu_run+0x15 (vmlinux) => ffffffffc13b2f50 vmx_update_host_rsp+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962087733: branches: return ffffffffc13b2f5d vmx_update_host_rsp+0xd (vmlinux) => ffffffffc13b2ffa __vmx_vcpu_run+0x1a (vmlinux) + tsc_msrs_test 18436 [007] 10897.962087733: branches: call ffffffffc13b303b __vmx_vcpu_run+0x5b (vmlinux) => ffffffffc13b2f80 vmx_vmenter+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962087836: branches: vmentry ffffffffc13b2f82 vmx_vmenter+0x2 (vmlinux) => 0 [unknown] ([unknown]) + [guest/18436] 18436 [007] 10897.962087836: branches: vmentry 0 [unknown] ([unknown]) => 402c81 guest_code+0x131 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962087836: branches: call 402c81 guest_code+0x131 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962088248: branches: vmexit 40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 0 [unknown] ([unknown]) + tsc_msrs_test 18436 [007] 10897.962088248: branches: vmexit 0 [unknown] ([unknown]) => ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962088248: branches: jmp ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) => ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962088256: branches: return ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) => ffffffffc13b3040 __vmx_vcpu_run+0x60 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962088270: branches: return ffffffffc13b30b6 __vmx_vcpu_run+0xd6 (vmlinux) => ffffffffc13b2f2e vmx_vcpu_enter_exit+0x4e (vmlinux) + [SNIP] + tsc_msrs_test 18436 [007] 10897.962089321: branches: call ffffffffc13b2ff5 __vmx_vcpu_run+0x15 (vmlinux) => ffffffffc13b2f50 vmx_update_host_rsp+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089321: branches: return ffffffffc13b2f5d vmx_update_host_rsp+0xd (vmlinux) => ffffffffc13b2ffa __vmx_vcpu_run+0x1a (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089321: branches: call ffffffffc13b303b __vmx_vcpu_run+0x5b (vmlinux) => ffffffffc13b2f80 vmx_vmenter+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089424: branches: vmentry ffffffffc13b2f82 vmx_vmenter+0x2 (vmlinux) => 0 [unknown] ([unknown]) + [guest/18436] 18436 [007] 10897.962089424: branches: vmentry 0 [unknown] ([unknown]) => 40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962089701: branches: jmp 40dc1b ucall+0x7b (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 40dc39 ucall+0x99 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962089701: branches: jcc 40dc3c ucall+0x9c (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 40dc20 ucall+0x80 
(/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962089701: branches: jcc 40dc3c ucall+0x9c (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 40dc20 ucall+0x80 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962089701: branches: jcc 40dc37 ucall+0x97 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 40dc50 ucall+0xb0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) + [guest/18436] 18436 [007] 10897.962089878: branches: vmexit 40dc55 ucall+0xb5 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) => 0 [unknown] ([unknown]) + tsc_msrs_test 18436 [007] 10897.962089878: branches: vmexit 0 [unknown] ([unknown]) => ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089878: branches: jmp ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) => ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089887: branches: return ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) => ffffffffc13b3040 __vmx_vcpu_run+0x60 (vmlinux) + tsc_msrs_test 18436 [007] 10897.962089901: branches: return ffffffffc13b30b6 __vmx_vcpu_run+0xd6 (vmlinux) => ffffffffc13b2f2e vmx_vcpu_enter_exit+0x4e (vmlinux) + [SNIP] + + # perf kvm --guest-code --guest --host report -i perf.data --stdio | head -20 + + # To display the perf.data header info, please use --header/--header-only options. + # + # + # Total Lost Samples: 0 + # + # Samples: 12 of event 'instructions' + # Event count (approx.): 2274583 + # + # Children Self Command Shared Object Symbol + # ........ ........ ............. .................... ........................................... + # + 54.70% 0.00% tsc_msrs_test [kernel.vmlinux] [k] entry_SYSCALL_64_after_hwframe + | + ---entry_SYSCALL_64_after_hwframe + do_syscall_64 + | + |--29.44%--syscall_exit_to_user_mode + | exit_to_user_mode_prepare + | task_work_run + | __fput + + +Event Trace +----------- + +Event Trace records information about asynchronous events, for example interrupts, +faults, VM exits and entries. The information is recorded in CFE and EVD packets, +and also the Interrupt Flag is recorded on the MODE.Exec packet. The CFE packet +contains a type field to identify one of the following: + + 1 INTR interrupt, fault, exception, NMI + 2 IRET interrupt return + 3 SMI system management interrupt + 4 RSM resume from system management mode + 5 SIPI startup interprocessor interrupt + 6 INIT INIT signal + 7 VMENTRY VM-Entry + 8 VMEXIT VM-Entry + 9 VMEXIT_INTR VM-Exit due to interrupt + 10 SHUTDOWN Shutdown + +For more details, refer to the Intel 64 and IA-32 Architectures Software +Developer Manuals (version 076 or later). + +The capability to do Event Trace is indicated by the +/sys/bus/event_source/devices/intel_pt/caps/event_trace file. + +Event trace is selected for recording using the "event" config term. e.g. + + perf record -e intel_pt/event/u uname + +Event trace events are output using the --itrace I option. e.g. + + perf script --itrace=Ie + +perf script displays events containing CFE type, vector and event data, +in the form: + + evt: hw int (t) cfe: INTR IP: 1 vector: 3 PFA: 0x8877665544332211 + +The IP flag indicates if the event binds to an IP, which includes any case where +flow control packet generation is enabled, as well as when CFE packet IP bit is +set. 
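+
+Whether the processor supports Event Trace can also be confirmed directly by
+reading the capability file named above, which contains "1" if the feature is
+supported e.g.
+
+	$ cat /sys/bus/event_source/devices/intel_pt/caps/event_trace
+	1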
+ +perf script displays events containing changes to the Interrupt Flag in the form: + + iflag: t IFLAG: 1->0 via branch + +where "via branch" indicates a branch (interrupt or return from interrupt) and +"non branch" indicates an instruction such as CFI, STI or POPF). + +In addition, the current state of the interrupt flag is indicated by the presence +or absence of the "D" (interrupt disabled) perf script flag. If the interrupt +flag is changed, then the "t" flag is also included i.e. + + no flag, interrupts enabled IF=1 + t interrupts become disabled IF=1 -> IF=0 + D interrupts are disabled IF=0 + Dt interrupts become enabled IF=0 -> IF=1 + +The intel-pt-events.py script illustrates how to access Event Trace information +using a Python script. + + +TNT Disable +----------- + +TNT packets are disabled using the "notnt" config term. e.g. + + perf record -e intel_pt/notnt/u uname + +In that case the --itrace q option is forced because walking executable code +to reconstruct the control flow is not possible. + + +Emulated PTWRITE +---------------- + +Later perf tools support a method to emulate the ptwrite instruction, which +can be useful if hardware does not support the ptwrite instruction. + +Instead of using the ptwrite instruction, a function is used which produces +a trace that encodes the payload data into TNT packets. Here is an example +of the function: + + #include <stdint.h> + + void perf_emulate_ptwrite(uint64_t x) + __attribute__((externally_visible, noipa, no_instrument_function, naked)); + + #define PERF_EMULATE_PTWRITE_8_BITS \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" \ + "1: shl %rax\n" \ + " jc 1f\n" + + /* Undefined instruction */ + #define PERF_EMULATE_PTWRITE_UD2 ".byte 0x0f, 0x0b\n" + + #define PERF_EMULATE_PTWRITE_MAGIC PERF_EMULATE_PTWRITE_UD2 ".ascii \"perf,ptwrite \"\n" + + void perf_emulate_ptwrite(uint64_t x __attribute__ ((__unused__))) + { + /* Assumes SysV ABI : x passed in rdi */ + __asm__ volatile ( + "jmp 1f\n" + PERF_EMULATE_PTWRITE_MAGIC + "1: mov %rdi, %rax\n" + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + PERF_EMULATE_PTWRITE_8_BITS + "1: ret\n" + ); + } + +For example, a test program with the function above: + + #include <stdio.h> + #include <stdint.h> + #include <stdlib.h> + + #include "perf_emulate_ptwrite.h" + + int main(int argc, char *argv[]) + { + uint64_t x = 0; + + if (argc > 1) + x = strtoull(argv[1], NULL, 0); + perf_emulate_ptwrite(x); + return 0; + } + +Can be compiled and traced: + + $ gcc -Wall -Wextra -O3 -g -o eg_ptw eg_ptw.c + $ perf record -e intel_pt//u ./eg_ptw 0x1234567890abcdef + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.017 MB perf.data ] + $ perf script --itrace=ew + eg_ptw 19875 [007] 8061.235912: ptwrite: IP: 0 payload: 0x1234567890abcdef 55701249a196 perf_emulate_ptwrite+0x16 (/home/user/eg_ptw) + $ + + +Pipe mode +--------- +Pipe mode is a problem for Intel PT and possibly other auxtrace users. +It's not recommended to use a pipe as data output with Intel PT because +of the following reason. + +Essentially the auxtrace buffers do not behave like the regular perf +event buffers. 
That is because the head and tail are updated by +software, but in the auxtrace case the data is written by hardware. +So the head and tail do not get updated as data is written. + +In the Intel PT case, the head and tail are updated only when the trace +is disabled by software, for example: + - full-trace, system wide : when buffer passes watermark + - full-trace, not system-wide : when buffer passes watermark or + context switches + - snapshot mode : as above but also when a snapshot is made + - sample mode : as above but also when a sample is made + +That means finished-round ordering doesn't work. An auxtrace buffer +can turn up that has data that extends back in time, possibly to the +very beginning of tracing. + +For a perf.data file, that problem is solved by going through the trace +and queuing up the auxtrace buffers in advance. + +For pipe mode, the order of events and timestamps can presumably +be messed up. + + +Pause or Resume Tracing +----------------------- + +With newer Kernels, it is possible to use other selected events to pause +or resume Intel PT tracing. This is configured by using the "aux-action" +config term: + +"aux-action=pause" is used with events that are to pause Intel PT tracing. + +"aux-action=resume" is used with events that are to resume Intel PT tracing. + +"aux-action=start-paused" is used with the Intel PT event to start in a +paused state. + +For example, to trace only the uname system call (sys_newuname) when running the +command line utility uname: + + $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.043 MB perf.data ] + $ perf script --call-trace + uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0 + uname 30805 [000] 24001.058784424: psb offs: 0 + uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%) + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read + uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit + uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058785046: 
([kernel.kallsyms]) perf_tp_event + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock + uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock + uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop + uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id + uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr + uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr + uname 30805 [000] 24001.058785639: 0x0 + +The example above uses tracepoints, but any kind of sampled event can be used. + +For example: + + Tracing between arch_cpu_idle_enter() and arch_cpu_idle_exit() using breakpoint events: + + $ sudo cat /proc/kallsyms | sort | grep ' arch_cpu_idle_enter\| arch_cpu_idle_exit' + ffffffffb605bf60 T arch_cpu_idle_enter + ffffffffb614d8a0 W arch_cpu_idle_exit + $ sudo perf record --kcore -a -e intel_pt/aux-action=start-paused/k -e mem:0xffffffffb605bf60:x/aux-action=resume/ -e mem:0xffffffffb614d8a0:x/aux-action=pause/ -- sleep 1 + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 1.387 MB perf.data ] + + Tracing __alloc_pages() using kprobes: + + $ sudo perf probe --add '__alloc_pages order' + Added new event: probe:__alloc_pages (on __alloc_pages with order) + $ sudo perf probe --add __alloc_pages%return + Added new event: probe:__alloc_pages__return (on __alloc_pages%return) + $ sudo perf record --kcore -aR -e intel_pt/aux-action=start-paused/k -e probe:__alloc_pages/aux-action=resume/ -e probe:__alloc_pages__return/aux-action=pause/ -- sleep 1 + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 1.490 MB perf.data ] + + Tracing starting at main() using a uprobe event: + + $ sudo perf probe -x /usr/bin/uname main + Added new event: probe_uname:main (on main in /usr/bin/uname) + $ sudo perf record -e intel_pt/-aux-action=start-paused/u -e probe_uname:main/aux-action=resume/ -- uname + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.031 MB perf.data ] + + Tracing occasionally using cycles events with different periods: + + $ perf record --kcore -a -m,64M -e intel_pt/aux-action=start-paused/k -e cycles/aux-action=pause,period=1000000/Pk -e cycles/aux-action=resume,period=10500000/Pk -- firefox + [ perf record: Woken up 19 times to write data ] + [ perf record: Captured and wrote 16.561 MB perf.data ] + + +EXAMPLE +------- + +Examples can be found on perf wiki page "Perf tools support for Intel® Processor Trace": + +https://perf.wiki.kernel.org/index.php/Perf_tools_support_for_Intel%C2%AE_Processor_Trace + SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index 
000000000000..04d510364384 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=============== + +NAME +---- +perf-iostat - Show I/O performance metrics + +SYNOPSIS +-------- +[verse] +'perf iostat' list +'perf iostat' <ports> \-- <command> [<options>] + +DESCRIPTION +----------- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS +------- +<command>...:: + Any command you can specify in a shell. + +list:: + List all PCIe root ports. + +<ports>:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES +-------- + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<0000:00> + S1-uncore_iio_0<0000:80> + S0-uncore_iio_1<0000:17> + S1-uncore_iio_1<0000:85> + S0-uncore_iio_2<0000:3a> + S1-uncore_iio_2<0000:ae> + S0-uncore_iio_3<0000:5d> + S1-uncore_iio_3<0000:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + + Performance counter stats for 'system wide': + + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) + 0000:00 1 0 2 3 + 0000:80 0 0 0 0 + 0000:17 352552 43 0 21 + 0000:85 0 0 0 0 + 0000:3a 3 0 0 0 + 0000:ae 0 0 0 0 + 0000:5d 0 0 0 0 + 0000:d7 0 0 0 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + + Performance counter stats for 'system wide': + + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) + 0000:17 358559 44 0 22 + 0000:3a 3 2 0 0 + + 197.081983474 seconds time elapsed + +SEE ALSO +-------- +linkperf:perf-stat[1] diff --git a/tools/perf/Documentation/perf-kallsyms.txt b/tools/perf/Documentation/perf-kallsyms.txt index f3c620951f6e..c97527df8ecd 100644 --- a/tools/perf/Documentation/perf-kallsyms.txt +++ b/tools/perf/Documentation/perf-kallsyms.txt @@ -20,5 +20,5 @@ modules). OPTIONS ------- -v:: ---verbose=:: +--verbose:: Increase verbosity level, showing details about symbol table loading, etc. diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt index 85b8ac695c87..f378ac59353d 100644 --- a/tools/perf/Documentation/perf-kmem.txt +++ b/tools/perf/Documentation/perf-kmem.txt @@ -8,22 +8,25 @@ perf-kmem - Tool to trace/measure kernel memory properties SYNOPSIS -------- [verse] -'perf kmem' {record|stat} [<options>] +'perf kmem' [<options>] {record|stat} DESCRIPTION ----------- There are two variants of perf kmem: - 'perf kmem record <command>' to record the kmem events - of an arbitrary workload. + 'perf kmem [<options>] record [<perf-record-options>] <command>' to + record the kmem events of an arbitrary workload. Additional 'perf + record' options may be specified after record, such as '-o' to + change the output file name. - 'perf kmem stat' to report kernel memory statistics. 
+ 'perf kmem [<options>] stat' to report kernel memory statistics. OPTIONS ------- -i <file>:: --input=<file>:: - Select the input file (default: perf.data unless stdin is a fifo) + For stat, select the input file (default: perf.data unless stdin is a + fifo) -f:: --force:: diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt index cf95baef7b61..c26524d38f47 100644 --- a/tools/perf/Documentation/perf-kvm.txt +++ b/tools/perf/Documentation/perf-kvm.txt @@ -58,7 +58,7 @@ There are a couple of variants of perf kvm: events. 'perf kvm stat report' reports statistical data which includes events - handled time, samples, and so on. + handled sample, percent_sample, time, percent_time, max_t, min_t, mean_t. 'perf kvm stat live' reports statistical data in a live mode (similar to record + report but with statistical data updated live at a given display @@ -77,23 +77,13 @@ OPTIONS Collect host side performance profile. --guest:: Collect guest side performance profile. ---guestmount=<path>:: - Guest os root file system mount directory. Users mounts guest os - root directories under <path> by a specific filesystem access method, - typically, sshfs. For example, start 2 guest os. The one's pid is 8888 - and the other's is 9999. - #mkdir ~/guestmount; cd ~/guestmount - #sshfs -o allow_other,direct_io -p 5551 localhost:/ 8888/ - #sshfs -o allow_other,direct_io -p 5552 localhost:/ 9999/ - #perf kvm --host --guest --guestmount=~/guestmount top ---guestkallsyms=<path>:: - Guest os /proc/kallsyms file copy. 'perf' kvm' reads it to get guest - kernel symbols. Users copy it out from guest os. ---guestmodules=<path>:: - Guest os /proc/modules file copy. 'perf' kvm' reads it to get guest - kernel module information. Users copy it out from guest os. ---guestvmlinux=<path>:: - Guest os kernel vmlinux. + +:GMEXAMPLECMD: kvm --host --guest +:GMEXAMPLESUBCMD: top +include::guest-files.txt[] + +--stdio:: Use the stdio interface. + -v:: --verbose:: Be more verbose (show counter open errors, etc). @@ -109,7 +99,10 @@ STAT REPORT OPTIONS -k:: --key=<value>:: Sorting key. Possible values: sample (default, sort by samples - number), time (sort by average time). + number), percent_sample (sort by sample percentage), time + (sort by average time), percent_time (sort by time percentage), + max_t (sort by maximum time), min_t (sort by minimum time), mean_t + (sort by mean time). -p:: --pid=:: Analyze events only for given process ID(s) (comma separated list). @@ -122,9 +115,9 @@ STAT LIVE OPTIONS -m:: --mmap-pages=:: - Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + Number of mmap data pages (must be a power of two) or size + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -a:: --all-cpus:: diff --git a/tools/perf/Documentation/perf-kwork.txt b/tools/perf/Documentation/perf-kwork.txt new file mode 100644 index 000000000000..21e607669d78 --- /dev/null +++ b/tools/perf/Documentation/perf-kwork.txt @@ -0,0 +1,214 @@ +perf-kwork(1) +============= + +NAME +---- +perf-kwork - Tool to trace/measure kernel work properties (latencies) + +SYNOPSIS +-------- +[verse] +'perf kwork' {record|report|latency|timehist|top} + +DESCRIPTION +----------- +There are several variants of 'perf kwork': + + 'perf kwork record <command>' to record the kernel work + of an arbitrary workload.
+ + 'perf kwork report' to report the per kwork runtime. + + 'perf kwork latency' to report the per kwork latencies. + + 'perf kwork timehist' provides an analysis of kernel work events. + + 'perf kwork top' to report the task cpu usage. + + Example usage: + perf kwork record -- sleep 1 + perf kwork report + perf kwork report -b + perf kwork latency + perf kwork latency -b + perf kwork timehist + perf kwork top + perf kwork top -b + + By default it shows the individual work events such as irq, workqueue, + including the run time and delay (time between raise and actually entry): + + Runtime start Runtime end Cpu Kwork name Runtime Delaytime + (TYPE)NAME:NUM (msec) (msec) + ----------------- ----------------- ------ ------------------------- ---------- ---------- + 1811186.976062 1811186.976327 [0000] (s)RCU:9 0.266 0.114 + 1811186.978452 1811186.978547 [0000] (s)SCHED:7 0.095 0.171 + 1811186.980327 1811186.980490 [0000] (s)SCHED:7 0.162 0.083 + 1811186.981221 1811186.981271 [0000] (s)SCHED:7 0.050 0.077 + 1811186.984267 1811186.984318 [0000] (s)SCHED:7 0.051 0.075 + 1811186.987252 1811186.987315 [0000] (s)SCHED:7 0.063 0.081 + 1811186.987785 1811186.987843 [0006] (s)RCU:9 0.058 0.645 + 1811186.988319 1811186.988383 [0000] (s)SCHED:7 0.064 0.143 + 1811186.989404 1811186.989607 [0002] (s)TIMER:1 0.203 0.111 + 1811186.989660 1811186.989732 [0002] (s)SCHED:7 0.072 0.310 + 1811186.991295 1811186.991407 [0002] eth0:10 0.112 + 1811186.991639 1811186.991734 [0002] (s)NET_RX:3 0.095 0.277 + 1811186.989860 1811186.991826 [0002] (w)vmstat_shepherd 1.966 0.345 + ... + + Times are in msec.usec. + +OPTIONS +------- +-D:: +--dump-raw-trace=:: + Display verbose dump of the sched data. + +-f:: +--force:: + Don't complain, do it. + +-k:: +--kwork:: + List of kwork to profile (irq, softirq, workqueue, sched, etc) + +-v:: +--verbose:: + Be more verbose. (show symbol address, etc) + +OPTIONS for 'perf kwork report' +---------------------------- + +-b:: +--use-bpf:: + Use BPF to measure kwork runtime + +-C:: +--cpu:: + Only show events for the given CPU(s) (comma separated list). + +-i:: +--input:: + Input file name. (default: perf.data unless stdin is a fifo) + +-n:: +--name:: + Only show events for the given name. + +-s:: +--sort:: + Sort by key(s): runtime, max, count + +-S:: +--with-summary:: + Show summary with statistics + +--time:: + Only analyze samples within given time window: <start>,<stop>. Times + have the format seconds.microseconds. If start is not given (i.e., time + string is ',x.y') then analysis starts at the beginning of the file. If + stop time is not given (i.e, time string is 'x.y,') then analysis goes + to end of file. + +OPTIONS for 'perf kwork latency' +---------------------------- + +-b:: +--use-bpf:: + Use BPF to measure kwork latency + +-C:: +--cpu:: + Only show events for the given CPU(s) (comma separated list). + +-i:: +--input:: + Input file name. (default: perf.data unless stdin is a fifo) + +-n:: +--name:: + Only show events for the given name. + +-s:: +--sort:: + Sort by key(s): avg, max, count + +--time:: + Only analyze samples within given time window: <start>,<stop>. Times + have the format seconds.microseconds. If start is not given (i.e., time + string is ',x.y') then analysis starts at the beginning of the file. If + stop time is not given (i.e, time string is 'x.y,') then analysis goes + to end of file. + +OPTIONS for 'perf kwork timehist' +--------------------------------- + +-C:: +--cpu:: + Only show events for the given CPU(s) (comma separated list). 
+ +-g:: +--call-graph:: + Display call chains if present (default off). + +-i:: +--input:: + Input file name. (default: perf.data unless stdin is a fifo) + +-k:: +--vmlinux=<file>:: + Vmlinux pathname + +-n:: +--name:: + Only show events for the given name. + +--kallsyms=<file>:: + Kallsyms pathname + +--max-stack:: + Maximum number of functions to display in backtrace, default 5. + +--symfs=<directory>:: + Look for files with symbols relative to this directory. + +--time:: + Only analyze samples within given time window: <start>,<stop>. Times + have the format seconds.microseconds. If start is not given (i.e., time + string is ',x.y') then analysis starts at the beginning of the file. If + stop time is not given (i.e, time string is 'x.y,') then analysis goes + to end of file. + +OPTIONS for 'perf kwork top' +--------------------------------- + +-b:: +--use-bpf:: + Use BPF to measure task cpu usage. + +-C:: +--cpu:: + Only show events for the given CPU(s) (comma separated list). + +-i:: +--input:: + Input file name. (default: perf.data unless stdin is a fifo) + +-n:: +--name:: + Only show events for the given name. + +-s:: +--sort:: + Sort by key(s): rate, runtime, tid + +--time:: + Only analyze samples within given time window: <start>,<stop>. Times + have the format seconds.microseconds. If start is not given (i.e., time + string is ',x.y') then analysis starts at the beginning of the file. If + stop time is not given (i.e, time string is 'x.y,') then analysis goes + to end of file. + +SEE ALSO +-------- +linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 376a50b3452d..ce0735021473 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -8,7 +8,7 @@ perf-list - List all symbolic event types SYNOPSIS -------- [verse] -'perf list' [--no-desc] [--long-desc] +'perf list' [<options>] [hw|sw|cache|tracepoint|pmu|sdt|metric|metricgroup|event_glob] DESCRIPTION @@ -27,7 +27,7 @@ Don't print descriptions. -v:: --long-desc:: -Print longer event descriptions. +Print longer event descriptions and all similar PMUs with alphanumeric suffixes. --debug:: Enable debugging output. @@ -39,6 +39,18 @@ any extra expressions computed by perf stat. --deprecated:: Print deprecated events. By default the deprecated events are hidden. +--unit:: +Print PMU events and metrics limited to the specific PMU name. +(e.g. --unit cpu, --unit msr, --unit cpu_core, --unit cpu_atom) + +-j:: +--json:: +Output in JSON format. + +-o:: +--output=:: + Output file name. By default output is written to stdout. + [[EVENT_MODIFIERS]] EVENT MODIFIERS --------------- @@ -58,6 +70,9 @@ counted. The following modifiers exist: S - read sample value (PERF_SAMPLE_READ) D - pin the event to the PMU W - group is weak and will fallback to non-group if not schedulable, + e - group or event are exclusive and do not share the PMU + b - use BPF aggregration (see perf stat --bpf-counters) + R - retire latency value of the event The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: @@ -72,11 +87,17 @@ For Intel systems precise event sampling is implemented with PEBS which supports up to precise-level 2, and precise level 3 for some special cases -On AMD systems it is implemented using IBS (up to precise-level 2). -The precise modifier works with event types 0x76 (cpu-cycles, CPU -clocks not halted) and 0xC1 (micro-ops retired). 
Both events map to -IBS execution sampling (IBS op) with the IBS Op Counter Control bit -(IbsOpCntCtl) set respectively (see AMD64 Architecture Programmer’s +On AMD systems it is implemented using IBS OP (up to precise-level 2). +Unlike Intel PEBS which provides levels of precision, AMD core pmu is +inherently non-precise and IBS is inherently precise. (i.e. ibs_op//, +ibs_op//p, ibs_op//pp and ibs_op//ppp are all same). The precise modifier +works with event types 0x76 (cpu-cycles, CPU clocks not halted) and 0xC1 +(micro-ops retired). Both events map to IBS execution sampling (IBS op) +with the IBS Op Counter Control bit (IbsOpCntCtl) set respectively (see the +Core Complex (CCX) -> Processor x86 Core -> Instruction Based Sampling (IBS) +section of the [AMD Processor Programming Reference (PPR)] relevant to the +family, model and stepping of the processor being used). + Manual Volume 2: System Programming, 13.3 Instruction-Based Sampling). Examples to use IBS: @@ -89,10 +110,12 @@ RAW HARDWARE EVENT DESCRIPTOR Even when an event is not available in a symbolic form within perf right now, it can be encoded in a per processor specific way. -For instance For x86 CPUs NNN represents the raw register encoding with the +For instance on x86 CPUs, N is a hexadecimal value that represents the raw register encoding with the layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout -of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344, -Figure 13-7 Performance Event-Select Register (PerfEvtSeln)). +of IA32_PERFEVTSELx MSRs) or AMD's PERF_CTL MSRs (see the +Core Complex (CCX) -> Processor x86 Core -> MSR Registers section of the +[AMD Processor Programming Reference (PPR)] relevant to the family, model +and stepping of the processor being used). Note: Only the following bit fields can be set in x86 counter registers: event, umask, edge, inv, cmask. Esp. guest/host only and @@ -119,6 +142,39 @@ It's also possible to use pmu syntax: perf record -e r1a8 -a sleep 1 perf record -e cpu/r1a8/ ... + perf record -e cpu/r0x1a8/ ... + +Some processors, like those from AMD, support event codes and unit masks +larger than a byte. In such cases, the bits corresponding to the event +configuration parameters can be seen with: + + cat /sys/bus/event_source/devices/<pmu>/format/<config> + +Example: + +If the AMD docs for an EPYC 7713 processor describe an event as: + + Event Umask Event Mask + Num. Value Mnemonic Description + + 28FH 03H op_cache_hit_miss.op_cache_hit Counts Op Cache micro-tag + hit events. + +raw encoding of 0x0328F cannot be used since the upper nibble of the +EventSelect bits have to be specified via bits 32-35 as can be seen with: + + cat /sys/bus/event_source/devices/cpu/format/event + +raw encoding of 0x20000038F should be used instead: + + perf stat -e r20000038f -a sleep 1 + perf record -e r20000038f ... + +It's also possible to use pmu syntax: + + perf record -e r20000038f -a sleep 1 + perf record -e cpu/r20000038f/ ... + perf record -e cpu/r0x20000038f/ ... You should refer to the processor specific documentation for getting these details. Some of them are referenced in the SEE ALSO section below. @@ -132,7 +188,7 @@ in the CPU vendor specific documentation. 
The available PMUs and their raw parameters can be listed with - ls /sys/devices/*/format + ls /sys/bus/event_source/devices/*/format For example the raw event "LSD.UOPS" core pmu event above could be specified as @@ -184,9 +240,24 @@ This can be overridden by setting the kernel.perf_event_paranoid sysctl to -1, which allows non root to use these events. For accessing trace point events perf needs to have read access to -/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed +/sys/kernel/tracing, even when perf_event_paranoid is in a relaxed setting. +TOOL/HWMON EVENTS +----------------- + +Some events don't have an associated PMU instead reading values +available to software without perf_event_open. As these events don't +support sampling they can only really be read by tools like perf stat. + +Tool events provide times and certain system parameters. Examples +include duration_time, user_time, system_time and num_cpus_online. + +Hwmon events provide easy access to hwmon sysfs data typically in +/sys/class/hwmon. This information includes temperatures, fan speeds +and energy usage. + + TRACING ------- @@ -218,6 +289,15 @@ Sums up the event counts for all hardware threads in a core, e.g.: perf stat -e cpu/event=0,umask=0x3,percore=1/ +cpu: + +Specifies the CPU to open the event upon. The value may be repeated to +specify opening the event on multiple CPUs: + + + perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1 + perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1 + EVENT GROUPS ------------ @@ -310,4 +390,4 @@ SEE ALSO linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1], http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide], -http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming] +https://bugzilla.kernel.org/show_bug.cgi?id=206537[AMD Processor Programming Reference (PPR)] diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 74d774592196..c17b3e318169 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -8,7 +8,7 @@ perf-lock - Analyze lock events SYNOPSIS -------- [verse] -'perf lock' {record|report|script|info} +'perf lock' {record|report|script|info|contention} DESCRIPTION ----------- @@ -27,6 +27,8 @@ and statistics with this 'perf lock' command. 'perf lock info' shows metadata like threads or addresses of lock instances. + 'perf lock contention' shows contention statistics. + COMMON OPTIONS -------------- @@ -34,17 +36,31 @@ COMMON OPTIONS --input=<file>:: Input file name. (default: perf.data unless stdin is a fifo) +--output=<file>:: + Output file name for perf lock contention and report. + -v:: --verbose:: Be more verbose (show symbol address, etc). +-q:: +--quiet:: + Do not show any warnings or messages. (Suppress -v) + -D:: --dump-raw-trace:: Dump raw trace in ASCII. -f:: --force:: - Don't complan, do it. + Don't complain, do it. + +--vmlinux=<file>:: + vmlinux pathname + +--kallsyms=<file>:: + kallsyms pathname + REPORT OPTIONS -------------- @@ -54,16 +70,167 @@ REPORT OPTIONS Sorting key. Possible values: acquired (default), contended, avg_wait, wait_total, wait_max, wait_min. +-F:: +--field=<value>:: + Output fields. By default it shows all the fields but users can + customize that using this. Possible values: acquired, contended, + avg_wait, wait_total, wait_max, wait_min. 
+ +-c:: +--combine-locks:: + Merge lock instances in the same class (based on name). + +-t:: +--threads:: + The -t option is to show per-thread lock stat like below: + + $ perf lock report -t -F acquired,contended,avg_wait + + Name acquired contended avg wait (ns) + + perf 240569 9 5784 + swapper 106610 19 543 + :15789 17370 2 14538 + ContainerMgr 8981 6 874 + sleep 5275 1 11281 + ContainerThread 4416 4 944 + RootPressureThr 3215 5 1215 + rcu_preempt 2954 0 0 + ContainerMgr 2560 0 0 + unnamed 1873 0 0 + EventManager_De 1845 1 636 + futex-default-S 1609 0 0 + +-E:: +--entries=<value>:: + Display this many entries. + + INFO OPTIONS ------------ -t:: --threads:: - dump thread list in perf.data + dump only the thread list in perf.data -m:: --map:: - dump map of lock instances (address:name table) + dump only the map of lock instances (address:name table) + + +CONTENTION OPTIONS +------------------ + +-k:: +--key=<value>:: + Sorting key. Possible values: contended, wait_total (default), + wait_max, wait_min, avg_wait. + +-F:: +--field=<value>:: + Output fields. By default it shows all but the wait_min fields + and users can customize that using this. Possible values: + contended, wait_total, wait_max, wait_min, avg_wait. + +-t:: +--threads:: + Show per-thread lock contention stat + +-b:: +--use-bpf:: + Use BPF program to collect lock contention stats instead of + using the input data. + +-a:: +--all-cpus:: + System-wide collection from all CPUs. + +-C:: +--cpu=<value>:: + Collect samples only on the list of CPUs provided. Multiple CPUs can be + provided as a comma-separated list with no space: 0,1. Ranges of CPUs + are specified with -: 0-2. Default is to monitor all CPUs. + +-p:: +--pid=<value>:: + Record events on existing process ID (comma separated list). + +--tid=<value>:: + Record events on existing thread ID (comma separated list). + +-M:: +--map-nr-entries=<value>:: + Maximum number of BPF map entries (default: 16384). + This will be aligned to a power of 2. + +--max-stack=<value>:: + Maximum stack depth when collecting lock contention (default: 8). + +--stack-skip=<value>:: + Number of stack depth to skip when finding a lock caller (default: 3). + +-E:: +--entries=<value>:: + Display this many entries. + +-l:: +--lock-addr:: + Show lock contention stat by address + +-o:: +--lock-owner:: + Show lock contention stat by owners. This option can be combined with -t, + which shows owner's per thread lock stats, or -v, which shows owner's + stacktrace. Requires --use-bpf. + +-Y:: +--type-filter=<value>:: + Show lock contention only for given lock types (comma separated list). + Available values are: + semaphore, spinlock, rwlock, rwlock:R, rwlock:W, rwsem, rwsem:R, rwsem:W, + rtmutex, rwlock-rt, rwlock-rt:R, rwlock-rt:W, percpu-rwmem, pcpu-sem, + pcpu-sem:R, pcpu-sem:W, mutex + + Note that RW-variant of locks have :R and :W suffix. Names without the + suffix are shortcuts for the both variants. Ex) rwsem = rwsem:R + rwsem:W. + +-L:: +--lock-filter=<value>:: + Show lock contention only for given lock addresses or names (comma separated list). + +-S:: +--callstack-filter=<value>:: + Show lock contention only if the callstack contains the given string. + Note that it matches the substring so 'rq' would match both 'raw_spin_rq_lock' + and 'irq_enter_rcu'. + +-x:: +--field-separator=<SEP>:: + Show results using a CSV-style output to make it easy to import directly + into spreadsheets. Columns are separated by the string specified in SEP. + +--lock-cgroup:: + Show lock contention stat by cgroup. 
Requires --use-bpf. + +-G:: +--cgroup-filter=<value>:: + Show lock contention only in the given cgroups (comma separated list). + +-J:: +--inject-delay=<time@function>:: + Add delays to the given lock. It's added to the contention-end part so + that the (new) owner of the lock will be delayed. But by slowing down + the owner, the waiters will also be delayed. This works + only with -b/--use-bpf. + + The 'time' is specified in nsec but it can have a unit suffix. Available + units are "ms", "us" and "ns". Currently it accepts up to 10ms of delays + for safety reasons. + + Note that it will busy-wait after it gets the lock. Delaying locks can + have significant consequences including potential kernel crashes. Please + use it at your own risk. + SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 199ea0f0a6c0..4d164836d094 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -21,17 +21,17 @@ and stores are sampled. Use the -t option to limit to loads or stores. Note that on Intel systems the memory latency reported is the use-latency, not the pure load (or store latency). Use latency includes any pipeline -queueing delays in addition to the memory subsystem latency. +queuing delays in addition to the memory subsystem latency. -OPTIONS ------- -<command>...:: - Any command you can specify in a shell. +On Arm64 this uses SPE to sample load and store operations, therefore hardware +and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide. +Due to the statistical nature of SPE sampling, not every memory operation will +be sampled. --i:: ---input=<file>:: - Input file name. +On AMD this uses the IBS Op PMU to sample load-store operations. +COMMON OPTIONS +-------------- -f:: --force:: Don't do ownership validation -t:: --type=<type>:: Select the memory operation type: load or store (default: load,store) --D:: ---dump-raw-samples:: - Dump the raw decoded samples on the screen in a format that is easy to parse with - one sample per line. - --x:: ---field-separator=<separator>:: - Specify the field separator used when dump raw samples (-D option). By default, - The separator is the space character. - --C:: ---cpu=<cpu>:: - Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a - comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default - is to monitor all CPUS. --U:: ---hide-unresolved:: - Only display entries resolved to a symbol. +-v:: +--verbose:: + Be more verbose (show counter open errors, etc) -p:: --phys-data:: Record/Report sample physical addresses +--data-page-size:: + Record/Report sample data address page size + RECORD OPTIONS -------------- +<command>...:: + Any command you can specify in a shell. + -e:: --event <event>:: Event selector. Use 'perf mem record -e list' to list available events. @@ -77,16 +68,144 @@ RECORD OPTIONS --all-user:: Configure all used events to run in user space. --v:: ---verbose:: - Be more verbose (show counter open errors, etc) - --ldlat <n>:: - Specify desired latency for loads event. (x86 only) + Specify desired latency for loads event. Supported on Intel, Arm64 and + some AMD processors. Ignored on other archs. + + On supported AMD processors: + - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'. + - Supported latency values are 128 to 2048 (both inclusive).
+ - Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + - Load latency filtering is disabled by default. + +REPORT OPTIONS +-------------- +-i:: +--input=<file>:: + Input file name. + +-C:: +--cpu=<cpu>:: + Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a + comma-separated list with no space: 0,1. Ranges of CPUs are specified with - + like 0-2. Default is to monitor all CPUS. + +-D:: +--dump-raw-samples:: + Dump the raw decoded samples on the screen in a format that is easy to parse with + one sample per line. + +-s:: +--sort=<key>:: + Group result by given key(s) - multiple keys can be specified + in CSV format. The keys are specific to memory samples are: + symbol_daddr, symbol_iaddr, dso_daddr, locked, tlb, mem, snoop, + dcacheline, phys_daddr, data_page_size, blocked. + + - symbol_daddr: name of data symbol being executed on at the time of sample + - symbol_iaddr: name of code symbol being executed on at the time of sample + - dso_daddr: name of library or module containing the data being executed + on at the time of the sample + - locked: whether the bus was locked at the time of the sample + - tlb: type of tlb access for the data at the time of the sample + - mem: type of memory access for the data at the time of the sample + - snoop: type of snoop (if any) for the data at the time of the sample + - dcacheline: the cacheline the data address is on at the time of the sample + - phys_daddr: physical address of data being executed on at the time of sample + - data_page_size: the data page size of data being executed on at the time of sample + - blocked: reason of blocked load access for the data at the time of the sample + + And the default sort keys are changed to local_weight, mem, sym, dso, + symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat. + +-F:: +--fields=:: + Specify output field - multiple keys can be specified in CSV format. + Please see linkperf:perf-report[1] for details. + + In addition to the default fields, 'perf mem report' will provide the + following fields to break down sample periods. + + - op: operation in the sample instruction (load, store, prefetch, ...) + - cache: location in CPU cache (L1, L2, ...) where the sample hit + - mem: location in memory or other places the sample hit + - dtlb: location in Data TLB (L1, L2) where the sample hit + - snoop: snoop result for the sampled data access + + Please take a look at the OUTPUT FIELD SELECTION section for caveats. + +-T:: +--type-profile:: + Show data-type profile result instead of code symbols. This requires + the debug information and it will change the default sort keys to: + mem, snoop, tlb, type. + +-U:: +--hide-unresolved:: + Only display entries resolved to a symbol. + +-x:: +--field-separator=<separator>:: + Specify the field separator used when dump raw samples (-D option). By default, + The separator is the space character. In addition, for report all perf report options are valid, and for record all perf record options. +OVERHEAD CALCULATION +-------------------- +Unlike linkperf:perf-report[1], which calculates overhead from the actual +sample period, perf-mem overhead is calculated using sample weight. E.g. +there are two samples in perf.data file, both with the same sample period, +but one sample with weight 180 and the other with weight 20: + + $ perf script -F period,data_src,weight,ip,sym + 100000 629080842 |OP LOAD|LVL L3 hit|... 
20 7e69b93ca524 strcmp + 100000 1a29081042 |OP LOAD|LVL RAM hit|... 180 ffffffff82429168 memcpy + + $ perf report -F overhead,symbol + 50% [.] strcmp + 50% [k] memcpy + + $ perf mem report -F overhead,symbol + 90% [k] memcpy + 10% [.] strcmp + +OUTPUT FIELD SELECTION +---------------------- +"perf mem report" adds a number of new output fields specific to data source +information in the sample. Some of them have the same name with the existing +sort keys ("mem" and "snoop"). So unlike other fields and sort keys, they'll +behave differently when it's used by -F/--fields or -s/--sort. + +Using those two as output fields will aggregate samples altogether and show +breakdown. + + $ perf mem report -F mem,snoop + ... + # ------ Memory ------- --- Snoop ---- + # RAM Uncach Other HitM Other + # ..................... .............. + # + 3.5% 0.0% 96.5% 25.1% 74.9% + +But using the same name for sort keys will aggregate samples for each type +separately. + + $ perf mem report -s mem,snoop + # Overhead Samples Memory access Snoop + # ........ ............ ....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A + SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-report[1] +linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-arm-spe[1] diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index ed3ecfa422e1..5c43a6edc0e5 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -57,7 +57,7 @@ OPTIONS -q:: --quiet:: - Be quiet (do not show any messages including errors). + Do not show any warnings or messages. Can not use with -v. -a:: @@ -222,11 +222,11 @@ probe syntax, 'SRC' means the source file path, 'ALN' is start line number, and 'ALN2' is end line number in the file. It is also possible to specify how many lines to show by using 'NUM'. Moreover, 'FUNC@SRC' combination is good for searching a specific function when several functions share same name. -So, "source.c:100-120" shows lines between 100th to l20th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function. +So, "source.c:100-120" shows lines between 100th to 120th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function. LAZY MATCHING ------------- - The lazy line matching is similar to glob matching but ignoring spaces in both of pattern and target. So this accepts wildcards('*', '?') and character classes(e.g. [a-z], [!A-Z]). +The lazy line matching is similar to glob matching but ignoring spaces in both of pattern and target. So this accepts wildcards('*', '?') and character classes(e.g. [a-z], [!A-Z]). e.g. 'a=*' can matches 'a=b', 'a = b', 'a == b' and so on. @@ -235,8 +235,8 @@ This provides some sort of flexibility and robustness to probe point definitions FILTER PATTERN -------------- - The filter pattern is a glob matching pattern(s) to filter variables. - In addition, you can use "!" for specifying filter-out rule. You also can give several rules combined with "&" or "|", and fold those rules as one rule by using "(" ")". +The filter pattern is a glob matching pattern(s) to filter variables. +In addition, you can use "!" for specifying filter-out rule. 
You can also give several rules combined with "&" or "|", and fold those rules as one rule by using "(" ")". e.g. With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar". @@ -295,6 +295,19 @@ Add a probe in a source file using special characters by backslash escape ./perf probe -x /opt/test/a.out 'foo\+bar.c:4' +PERMISSIONS AND SYSCTL +---------------------- +Since perf probe depends on ftrace (tracefs) and kallsyms (/proc/kallsyms), you need to be aware of their permissions and some sysctl knobs. + + - Since tracefs and kallsyms require root or a privileged user for access, the following perf probe commands also require it: --add, --del, --list (except for the --cache option) + + - The system admin can remount the tracefs with 755 (`sudo mount -o remount,mode=755 /sys/kernel/tracing/`) to allow unprivileged users to run the perf probe --list command. + + - /proc/sys/kernel/kptr_restrict = 2 (restrict all users) also prevents perf probe from retrieving the important information from kallsyms. You need to set it to 1 (restrict non CAP_SYSLOG users) for the above commands. Since the user-space probe doesn't need to access kallsyms, this only applies to probing kernel functions (kprobes). + + - Since the perf probe commands read the vmlinux (for kernel) and/or the debuginfo file (including user-space application), you need to ensure that you can read those files. + + SEE ALSO -------- linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index fa8a5fcd27ab..612612fa2d80 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -9,7 +9,7 @@ SYNOPSIS -------- [verse] 'perf record' [-e <EVENT> | --event=EVENT] [-a] <command> -'perf record' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>] +'perf record' [-e <EVENT> | --event=EVENT] [-a] \-- <command> [<options>] DESCRIPTION ----------- @@ -30,8 +30,14 @@ OPTIONS - a symbolic event name (use 'perf list' to list all events) - - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. + - a raw PMU event in the form of rN where N is a hexadecimal value + that represents the raw register encoding with the layout of the + event control registers as described by entries in + /sys/bus/event_source/devices/cpu/format/*. + + - a symbolic or raw PMU event followed by an optional colon + and a list of event modifiers, e.g., cpu-cycles:p. See the + linkperf:perf-list[1] man page for details on event modifiers. - a symbolically formed PMU event like 'pmu/param1=0x3,param2/' where 'param1', 'param2', etc are defined as formats for the PMU in @@ -62,6 +68,10 @@ OPTIONS like this: name=\'CPU_CLK_UNHALTED.THREAD:cmask=0x1\'. - 'aux-output': Generate AUX records instead of events. This requires that an AUX area event is also provided. + - 'aux-action': "pause" or "resume" to pause or resume an AUX + area event (the group leader) when this event occurs. + "start-paused" on an AUX area event itself will + start in a paused state. - 'aux-sample-size': Set sample size for AUX area sampling. If the '--aux-sample' option has been used, set aux-sample-size=0 to disable AUX area sampling for the event. @@ -93,29 +103,18 @@ OPTIONS If you want to profile write accesses in [0x1000~1008), just set 'mem:0x1000/8:w'. - - a BPF source file (ending in .c) or a precompiled object file (ending - in .o) selects one or more BPF events.
- The BPF program can attach to various perf events based on the ELF section - names. - - When processing a '.c' file, perf searches an installed LLVM to compile it - into an object file first. Optional clang options can be passed via the - '--clang-opt' command line option, e.g.: - - perf record --clang-opt "-DLINUX_VERSION_CODE=0x50000" \ - -e tests/bpf-script-example.c - - Note: '--clang-opt' must be placed before '--event/-e'. - - a group of events surrounded by a pair of brace ("{event1,event2,...}"). Each event is separated by commas and the group should be quoted to prevent the shell interpretation. You also need to use --group on "perf report" to view group events together. --filter=<filter>:: - Event filter. This option should follow an event selector (-e) which - selects either tracepoint event(s) or a hardware trace PMU - (e.g. Intel PT or CoreSight). + Event filter. This option should follow an event selector (-e). + If the event is a tracepoint, the filter string will be parsed by + the kernel. If the event is a hardware trace PMU (e.g. Intel PT + or CoreSight), it'll be processed as an address filter. Otherwise + it means a general filter using BPF which can be applied for any + kind of event. - tracepoint filters @@ -170,6 +169,57 @@ OPTIONS Multiple filters can be separated with space or comma. + - bpf filters + + A BPF filter can access the sample data and make a decision based on the + data. Users need to set an appropriate sample type to use the BPF + filter. BPF filters need root privilege. + + The sample data field can be specified in lower case letter. Multiple + filters can be separated with comma. For example, + + --filter 'period > 1000, cpu == 1' + or + --filter 'mem_op == load || mem_op == store, mem_lvl > l1' + + The former filter only accept samples with period greater than 1000 AND + CPU number is 1. The latter one accepts either load and store memory + operations but it should have memory level above the L1. Since the + mem_op and mem_lvl fields come from the (memory) data_source, it'd only + work with some events which set the data_source field. + + Also user should request to collect that information (with -d option in + the above case). Otherwise, the following message will be shown. + + $ sudo perf record -e cycles --filter 'mem_op == load' + Error: cycles event does not have PERF_SAMPLE_DATA_SRC + Hint: please add -d option to perf record. + failed to set filter "BPF" on event cycles with 22 (Invalid argument) + + Essentially the BPF filter expression is: + + <term> <operator> <value> (("," | "||") <term> <operator> <value>)* + + The <term> can be one of: + ip, id, tid, pid, cpu, time, addr, period, txn, weight, phys_addr, + code_pgsz, data_pgsz, weight1, weight2, weight3, ins_lat, retire_lat, + p_stage_cyc, mem_op, mem_lvl, mem_snoop, mem_remote, mem_lock, + mem_dtlb, mem_blk, mem_hops, uid, gid + + The <operator> can be one of: + ==, !=, >, >=, <, <=, & + + The <value> can be one of: + <number> (for any term) + na, load, store, pfetch, exec (for mem_op) + l1, l2, l3, l4, cxl, io, any_cache, lfb, ram, pmem (for mem_lvl) + na, none, hit, miss, hitm, fwd, peer (for mem_snoop) + remote (for mem_remote) + na, locked (for mem_locked) + na, l1_hit, l1_miss, l2_hit, l2_miss, any_hit, any_miss, walk, fault (for mem_dtlb) + na, by_data, by_addr (for mem_blk) + hops0, hops1, hops2, hops3 (for mem_hops) + --exclude-perf:: Don't record events issued by perf itself. This option should follow an event selector (-e) which selects tracepoint event(s). 
It adds a @@ -177,6 +227,10 @@ OPTIONS '--filter' exists, the new filter expression will be combined with them by '&&'. +--latency:: + Enable data collection for latency profiling. + Use perf report --latency for latency-centric profile. + -a:: --all-cpus:: System-wide collection from all CPUs (default if no target is specified). @@ -227,14 +281,11 @@ OPTIONS -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. - Also, by adding a comma, the number of mmap pages for AUX - area tracing can be specified. - ---group:: - Put all events in a single event group. This precedes the --event - option and remains only for backward compatibility. See --event. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. + By adding a comma, an additional parameter with the same + semantics used for the normal mmap areas can be specified for + AUX tracing area. -g:: Enables call-graph (stack chain/backtrace) recording for both @@ -269,9 +320,14 @@ OPTIONS User can change the size by passing the size after comma like "--call-graph dwarf,4096". + When "fp" recording is used, perf tries to save stack entries + up to the number specified in sysctl.kernel.perf_event_max_stack + by default. User can change the number by passing it after comma + like "--call-graph fp,32". + -q:: --quiet:: - Don't print any message, useful for scripting. + Don't print any warnings or messages, useful for scripting. -v:: --verbose:: @@ -284,11 +340,17 @@ OPTIONS -d:: --data:: - Record the sample virtual addresses. + Record the sample virtual addresses. Implies --sample-mem-info. --phys-data:: Record the sample physical addresses. +--data-page-size:: + Record the sampled data address data page size. + +--code-page-size:: + Record the sampled code address (ip) page size + -T:: --timestamp:: Record the sample timestamps. Use it with 'perf report -D' to see the @@ -301,6 +363,16 @@ OPTIONS --sample-cpu:: Record the sample cpu. +--sample-identifier:: + Record the sample identifier i.e. PERF_SAMPLE_IDENTIFIER bit set in + the sample_type member of the struct perf_event_attr argument to the + perf_event_open system call. + +--sample-mem-info:: + Record the sample data source information for memory operations. + It requires hardware supports and may work on specific events only. + Please consider using 'perf mem record' instead if you're not sure. + -n:: --no-samples:: Don't sample. @@ -316,6 +388,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0- In per-thread mode with inheritance mode on (default), samples are captured only when the thread executes on the designated CPUs. Default is to monitor all CPUs. +User space tasks can migrate between CPUs, so when tracing selected CPUs, +a dummy event is created to track sideband for all CPUs. + -B:: --no-buildid:: Do not save the build ids of binaries in the perf.data files. 
This skips @@ -366,6 +441,7 @@ following filters are defined: - any_call: any function call or system call - any_ret: any function return or system call return - ind_call: any indirect branch + - ind_jmp: any indirect jump - call: direct calls, including far (to/from kernel) calls - u: only when the branch target is at the user level - k: only when the branch target is in the kernel @@ -374,7 +450,19 @@ following filters are defined: - no_tx: only when the target is not in a hardware transaction - abort_tx: only when the target is a hardware transaction abort - cond: conditional branches + - call_stack: save call stack + - no_flags: don't save branch flags e.g prediction, misprediction etc + - no_cycles: don't save branch cycles + - hw_index: save branch hardware index - save_type: save branch type during sampling in case binary is not available later + For the platforms with Intel Arch LBR support (12th-Gen+ client or + 4th-Gen Xeon+ server), the save branch type is unconditionally enabled + when the taken branch stack sampling is enabled. + - priv: save privilege state during sampling in case binary is not available later + - counter: save occurrences of the event since the last branch entry. Currently, the + feature is only supported by a newer CPU, e.g., Intel Sierra Forest and + later platforms. An error out is expected if it's used on the unsupported + kernel or CPUs. + The option requires at least one branch type among any, any_call, any_ret, ind_call, cond. @@ -385,6 +473,7 @@ is enabled for all the sampling events. The sampled branch type is the same for The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k Note that this feature may not be available on all processors. +-W:: --weight:: Enable weightened sampling. An additional weight is recorded per sample and can be displayed with the weight and local_weight sort keys. This currently works for TSX @@ -407,8 +496,11 @@ if combined with -a or -C options. -D:: --delay=:: -After starting the program, wait msecs before measuring. This is useful to -filter out the startup phase of the program, which is often very different. +After starting the program, wait msecs before measuring (-1: start with events +disabled), or enable events only for specified ranges of msecs (e.g. +-D 10-20,30-40 means wait 10 msecs, enable for 10 msecs, wait 10 msecs, enable +for 10 msecs, then stop). Note, delaying enabling of events is useful to filter +out the startup phase of the program, which is often very different. -I:: --intr-regs:: @@ -438,9 +530,10 @@ CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI. Select AUX area tracing Snapshot Mode. This option is valid only with an AUX area tracing event. Optionally, certain snapshot capturing parameters can be specified in a string that follows this option: - 'e': take one last snapshot on exit; guarantees that there is at least one + + - 'e': take one last snapshot on exit; guarantees that there is at least one snapshot in the output file; - <size>: if the PMU supports this, specify the desired snapshot size. + - <size>: if the PMU supports this, specify the desired snapshot size. In Snapshot Mode trace data is captured only when signal SIGUSR2 is received and on exit if the above 'e' option is given. @@ -458,18 +551,10 @@ This option sets the time out limit. The default value is 500 ms. --switch-events:: Record context switch events i.e. events of type PERF_RECORD_SWITCH or -PERF_RECORD_SWITCH_CPU_WIDE. In some cases (e.g. 
Intel PT or CoreSight) +PERF_RECORD_SWITCH_CPU_WIDE. In some cases (e.g. Intel PT, CoreSight or Arm SPE) switch events will be enabled automatically, which can be suppressed by by the option --no-switch-events. ---clang-path=PATH:: -Path to clang binary to use for compiling BPF scriptlets. -(enabled when BPF support is on) - ---clang-opt=OPTIONS:: -Options passed to clang when compiling BPF scriptlets. -(enabled when BPF support is on) - --vmlinux=PATH:: Specify vmlinux path which has debuginfo. (enabled when BPF prologue is on) @@ -477,6 +562,9 @@ Specify vmlinux path which has debuginfo. --buildid-all:: Record build-id of all DSOs regardless whether it's actually hit or not. +--buildid-mmap:: +Record build ids in mmap2 events, disables build id cache (implies --no-buildid). + --aio[=n]:: Use <n> control blocks in asynchronous (Posix AIO) trace writing mode (default: 1, max: 4). Asynchronous mode is supported only when linking Perf tool with libc library @@ -484,8 +572,9 @@ providing implementation for Posix AIO API. --affinity=mode:: Set affinity mask of trace reading thread according to the policy defined by 'mode' value: - node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer - cpu - thread affinity mask is set to cpu of the processed mmap buffer + + - node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer + - cpu - thread affinity mask is set to cpu of the processed mmap buffer --mmap-flush=number:: @@ -537,16 +626,17 @@ Record timestamp boundary (time of first/last samples). --switch-output[=mode]:: Generate multiple perf.data files, timestamp prefixed, switching to a new one based on 'mode' value: - "signal" - when receiving a SIGUSR2 (default value) or - <size> - when reaching the size threshold, size is expected to - be a number with appended unit character - B/K/M/G - <time> - when reaching the time threshold, size is expected to - be a number with appended unit character - s/m/h/d - Note: the precision of the size threshold hugely depends - on your configuration - the number and size of your ring - buffers (-m). It is generally more precise for higher sizes - (like >5M), for lower values expect different sizes. + - "signal" - when receiving a SIGUSR2 (default value) or + - <size> - when reaching the size threshold, size is expected to + be a number with appended unit character - B/K/M/G + - <time> - when reaching the time threshold, size is expected to + be a number with appended unit character - s/m/h/d + + Note: the precision of the size threshold hugely depends + on your configuration - the number and size of your ring + buffers (-m). It is generally more precise for higher sizes + (like >5M), for lower values expect different sizes. A possible use case is to, given an external event, slice the perf.data file that gets then processed, possibly via a perf script, to decide if that @@ -582,6 +672,23 @@ options. 'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj in config file is set to true. +--synth=TYPE:: +Collect and synthesize given type of events (comma separated). Note that +this option controls the synthesis from the /proc filesystem which represent +task status for pre-existing threads. + +Kernel (and some other) events are recorded regardless of the +choice in this option. For example, --synth=no would have MMAP events for +kernel and modules. 
+ +Available types are: + + - 'task' - synthesize FORK and COMM events for each task + - 'mmap' - synthesize MMAP events for each process (implies 'task') + - 'cgroup' - synthesize CGROUP events for each cgroup + - 'all' - synthesize all events (default) + - 'no' - do not synthesize any of the above events + --tail-synthesize:: Instead of collecting non-sample events (for example, fork, comm, mmap) at the beginning of record, collect them during finalizing an output file. @@ -626,6 +733,131 @@ option. The -e option and this one can be mixed and matched. Events can be grouped using the {} notation. endif::HAVE_LIBPFM[] +--control=fifo:ctl-fifo[,ack-fifo]:: +--control=fd:ctl-fd[,ack-fd]:: +ctl-fifo / ack-fifo are opened and used as ctl-fd / ack-fd as follows. +Listen on ctl-fd descriptor for command to control measurement. + +Available commands: + + - 'enable' : enable events + - 'disable' : disable events + - 'enable name' : enable event 'name' + - 'disable name' : disable event 'name' + - 'snapshot' : AUX area tracing snapshot). + - 'stop' : stop perf record + - 'ping' : ping + - 'evlist [-v|-g|-F] : display all events + + -F Show just the sample frequency used for each event. + -v Show all fields. + -g Show event group information. + +Measurements can be started with events disabled using --delay=-1 option. Optionally +send control command completion ('ack\n') to ack-fd descriptor to synchronize with the +controlling process. Example of bash shell script to enable and disable events during +measurements: + + #!/bin/bash + + ctl_dir=/tmp/ + + ctl_fifo=${ctl_dir}perf_ctl.fifo + test -p ${ctl_fifo} && unlink ${ctl_fifo} + mkfifo ${ctl_fifo} + exec {ctl_fd}<>${ctl_fifo} + + ctl_ack_fifo=${ctl_dir}perf_ctl_ack.fifo + test -p ${ctl_ack_fifo} && unlink ${ctl_ack_fifo} + mkfifo ${ctl_ack_fifo} + exec {ctl_fd_ack}<>${ctl_ack_fifo} + + perf record -D -1 -e cpu-cycles -a \ + --control fd:${ctl_fd},${ctl_fd_ack} \ + -- sleep 30 & + perf_pid=$! + + sleep 5 && echo 'enable' >&${ctl_fd} && read -u ${ctl_fd_ack} e1 && echo "enabled(${e1})" + sleep 10 && echo 'disable' >&${ctl_fd} && read -u ${ctl_fd_ack} d1 && echo "disabled(${d1})" + + exec {ctl_fd_ack}>&- + unlink ${ctl_ack_fifo} + + exec {ctl_fd}>&- + unlink ${ctl_fifo} + + wait -n ${perf_pid} + exit $? + +--threads=<spec>:: +Write collected trace data into several data files using parallel threads. +<spec> value can be user defined list of masks. Masks separated by colon +define CPUs to be monitored by a thread and affinity mask of that thread +is separated by slash: + + <cpus mask 1>/<affinity mask 1>:<cpus mask 2>/<affinity mask 2>:... + +CPUs or affinity masks must not overlap with other corresponding masks. +Invalid CPUs are ignored, but masks containing only invalid CPUs are not +allowed. + +For example user specification like the following: + + 0,2-4/2-4:1,5-7/5-7 + +specifies parallel threads layout that consists of two threads, +the first thread monitors CPUs 0 and 2-4 with the affinity mask 2-4, +the second monitors CPUs 1 and 5-7 with the affinity mask 5-7. 
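For illustration, a recording that uses the two-thread layout described above could be invoked roughly as follows (the workload and its duration are placeholders, and a kernel and perf build with --threads support are assumed):

    $ perf record --threads=0,2-4/2-4:1,5-7/5-7 -a -- sleep 10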
+ +<spec> value can also be a string meaning predefined parallel threads +layout: + + - cpu - create new data streaming thread for every monitored cpu + - core - create new thread to monitor CPUs grouped by a core + - package - create new thread to monitor CPUs grouped by a package + - numa - create new thread to monitor CPUs grouped by a NUMA domain + +Predefined layouts can be used on systems with a large number of CPUs in +order not to spawn multiple per-cpu streaming threads but still avoid LOST +events in data directory files. Option specified with no or empty value +defaults to CPU layout. Masks defined or provided by the option value are +filtered through the mask provided by -C option. + +--debuginfod[=URLs]:: + Specify debuginfod URL to be used when caching perf.data binaries, + it follows the same syntax as the DEBUGINFOD_URLS variable, like: + + http://192.168.122.174:8002 + + If no URLs are specified, the value of the DEBUGINFOD_URLS + system environment variable is used. + +--off-cpu:: + Enable off-cpu profiling with BPF. The BPF program will collect + task scheduling information with (user) stacktrace and save them + as sample data of a software event named "offcpu-time". The + sample period will have the time the task slept in nanoseconds. + + Note that BPF can collect stack traces using frame pointer ("fp") + only, as of now. So the applications built without the frame + pointer might see bogus addresses. + + off-cpu profiling consists of two types of samples: direct samples, which + share the same behavior as regular samples, and the accumulated + samples, stored in BPF stack trace map, presented after all the regular + samples. + +--off-cpu-thresh:: + Once a task's off-cpu time reaches this threshold (in milliseconds), it + generates a direct off-cpu sample. The default is 500ms. + +--setup-filter=<action>:: + Prepare BPF filter to be used by regular users. The action should be + either "pin" or "unpin". The filter can be used after it's pinned. + + +include::intel-hybrid.txt[] + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index d068103690cc..acef3ff4178e 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -27,7 +27,7 @@ OPTIONS -q:: --quiet:: - Do not show any message. (Suppress -v) + Do not show any warnings or messages. (Suppress -v) -n:: --show-nr-samples:: @@ -44,7 +44,7 @@ OPTIONS --comms=:: Only consider symbols in these comms. CSV that understands file://filename entries. This option will affect the percentage of - the overhead column. See --percentage for more info. + the overhead and latency columns. See --percentage for more info. --pid=:: Only show events for given process ID (comma separated list). @@ -54,12 +54,12 @@ OPTIONS --dsos=:: Only consider symbols in these dsos. CSV that understands file://filename entries. This option will affect the percentage of - the overhead column. See --percentage for more info. + the overhead and latency columns. See --percentage for more info. -S:: --symbols=:: Only consider these symbols. CSV that understands file://filename entries. This option will affect the percentage of - the overhead column. See --percentage for more info. + the overhead and latency columns. See --percentage for more info. --symbol-filter=:: Only show symbols that match (partially) with this filter.
@@ -68,17 +68,33 @@ OPTIONS --hide-unresolved:: Only display entries resolved to a symbol. +--parallelism:: + Only consider these parallelism levels. Parallelism level is the number + of threads that actively run on CPUs at the time of sample. The flag + accepts single number, comma-separated list, and ranges (for example: + "1", "7,8", "1,64-128"). This is useful in understanding what a program + is doing during sequential/low-parallelism phases as compared to + high-parallelism phases. This option will affect the percentage of + the overhead and latency columns. See --percentage for more info. + Also see the `CPU and latency overheads' section for more details. + +--latency:: + Show latency-centric profile rather than the default + CPU-consumption-centric profile + (requires perf record --latency flag). + -s:: --sort=:: Sort histogram entries by given key(s) - multiple keys can be specified in CSV format. Following sort keys are available: pid, comm, dso, symbol, parent, cpu, socket, srcline, weight, - local_weight, cgroup_id. + local_weight, cgroup_id, addr. Each key has following meaning: - comm: command (name) of the task which can be read via /proc/<pid>/comm - pid: command and tid of the task + - tgid: command and tgid of the task - dso: name of library or module executed at the time of sample - dso_size: size of library or module executed at the time of sample - symbol: name of function executed at the time of sample @@ -87,6 +103,7 @@ OPTIONS entries are displayed as "[other]". - cpu: cpu number the task ran at the time of sample - socket: processor socket number the task ran at the time of sample + - parallelism: number of running threads at the time of sample - srcline: filename and line number executed at the time of sample. The DWARF debugging info must be provided. - srcfile: file name of the source file of the samples. Requires dwarf @@ -97,20 +114,38 @@ OPTIONS - cgroup_id: ID derived from cgroup namespace device and inode numbers. - cgroup: cgroup pathname in the cgroupfs. - transaction: Transaction abort flags. - - overhead: Overhead percentage of sample - - overhead_sys: Overhead percentage of sample running in system mode - - overhead_us: Overhead percentage of sample running in user mode - - overhead_guest_sys: Overhead percentage of sample running in system mode + - overhead: CPU overhead percentage of sample. + - latency: latency (wall-clock) overhead percentage of sample. + See the `CPU and latency overheads' section for more details. + - overhead_sys: CPU overhead percentage of sample running in system mode + - overhead_us: CPU overhead percentage of sample running in user mode + - overhead_guest_sys: CPU overhead percentage of sample running in system mode on guest machine - - overhead_guest_us: Overhead percentage of sample running in user mode on + - overhead_guest_us: CPU overhead percentage of sample running in user mode on guest machine - sample: Number of sample - period: Raw number of event count of sample - time: Separate the samples by time stamp with the resolution specified by --time-quantum (default 100ms). Specify with overhead and before it. - - By default, comm, dso and symbol keys are used. - (i.e. --sort comm,dso,symbol) + - code_page_size: the code page size of sampled code address (ip) + - ins_lat: Instruction latency in core cycles. This is the global instruction + latency + - local_ins_lat: Local instruction latency version + - p_stage_cyc: On powerpc, this presents the number of cycles spent in a + pipeline stage. 
And currently supported only on powerpc. + - addr: (Full) virtual address of the sampled instruction + - retire_lat: On X86, this reports pipeline stall of this instruction compared + to the previous instruction in cycles. And currently supported only on X86 + - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate + - type: Data type of sample memory access. + - typeoff: Offset in the data type of sample memory access. + - symoff: Offset in the symbol. + - weight1: Average value of event specific weight (1st field of weight_struct). + - weight2: Average value of event specific weight (2nd field of weight_struct). + - weight3: Average value of event specific weight (3rd field of weight_struct). + + By default, overhead, comm, dso and symbol keys are used. + (i.e. --sort overhead,comm,dso,symbol). If --branch-stack option is used, following sort keys are also available: @@ -139,7 +174,7 @@ OPTIONS If the --mem-mode option is used, the following sort keys are also available (incompatible with --branch-stack): - symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline. + symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline, blocked. - symbol_daddr: name of data symbol being executed on at the time of sample - dso_daddr: name of library or module containing the data being executed @@ -150,9 +185,12 @@ OPTIONS - snoop: type of snoop (if any) for the data at the time of the sample - dcacheline: the cacheline the data address is on at the time of the sample - phys_daddr: physical address of data being executed on at the time of sample + - data_page_size: the data page size of data being executed on at the time of sample + - blocked: reason of blocked load access for the data at the time of the sample And the default sort keys are changed to local_weight, mem, sym, dso, - symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'. + symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat, + see '--mem-mode'. If the data file has tracepoint event(s), following (dynamic) sort keys are also available: @@ -182,7 +220,11 @@ OPTIONS --fields=:: Specify output field - multiple keys can be specified in CSV format. Following fields are available: - overhead, overhead_sys, overhead_us, overhead_children, sample and period. + overhead, latency, overhead_sys, overhead_us, overhead_children, sample, + period, weight1, weight2, weight3, ins_lat, p_stage_cyc and retire_lat. + The last 3 names are alias for the corresponding weights. When the weight + fields are used, they will show the average value of the weight. + Also it can contain any sort key(s). By default, every sort keys not specified in -F will be appended @@ -217,6 +259,9 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. +--disable-order:: + Disable raw trace ordering. + -g:: --call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>:: Display call chains using type, min percent threshold, print limit, @@ -263,7 +308,7 @@ OPTIONS Accumulate callchain of children to parent entry so that then can show up in the output. The output will have a new "Children" column and will be sorted on the data. It requires callchains are recorded. - See the `overhead calculation' section for more details. Enabled by + See the `Overhead calculation' section for more details. Enabled by default, disable with --no-children. --max-stack:: @@ -365,6 +410,17 @@ OPTIONS This allows to examine the path the program took to each sample. 
The data collection must have used -b (or -j) and -g. + Also show with some branch flags that can be: + - Predicted: display the average percentage of predicated branches. + (predicated number / total number) + - Abort: display the number of tsx aborted branches. + - Cycles: cycles in basic block. + + - iterations: display the average number of iterations in callchain list. + +--addr2line=<path>:: + Path to addr2line binary. + --objdump=<path>:: Path to objdump binary. @@ -405,9 +461,9 @@ OPTIONS --call-graph option for details. --percentage:: - Determine how to display the overhead percentage of filtered entries. - Filters can be applied by --comms, --dsos and/or --symbols options and - Zoom operations on the TUI (thread, dso, etc). + Determine how to display the CPU and latency overhead percentage + of filtered entries. Filters can be applied by --comms, --dsos, --symbols + and/or --parallelism options and Zoom operations on the TUI (thread, dso, etc). "relative" means it's relative to filtered entries only so that the sum of shown entries will be always 100%. "absolute" means it retains @@ -465,7 +521,7 @@ OPTIONS but probably we'll make the default not to show the switch-on/off events on the --group mode and if there is only one event besides the off/on ones, go straight to the histogram browser, just like 'perf report' with no events - explicitely specified does. + explicitly specified does. --itrace:: Options for decoding instruction tracing data. The options are: @@ -494,7 +550,7 @@ include::itrace.txt[] perf record --call-graph lbr. Disabled by default. In common cases with call stack overflows, it can recreate better call stacks than the default lbr call stack - output. But this approach is not full proof. There can be cases + output. But this approach is not foolproof. There can be cases where it creates incorrect call stacks from incorrect matches. The known limitations include exception handing such as setjmp/longjmp will have calls/returns not match. @@ -509,8 +565,35 @@ include::itrace.txt[] --raw-trace:: When displaying traceevent output, do not use print fmt or plugins. +-H:: --hierarchy:: - Enable hierarchical output. + Enable hierarchical output. In the hierarchy mode, each sort key groups + samples based on the criteria and then sub-divide it using the lower + level sort key. + + For example: + In normal output: + + perf report -s dso,sym + # Overhead Shared Object Symbol + 50.00% [kernel.kallsyms] [k] kfunc1 + 20.00% perf [.] foo + 15.00% [kernel.kallsyms] [k] kfunc2 + 10.00% perf [.] bar + 5.00% libc.so [.] libcall + + In hierarchy output: + + perf report -s dso,sym --hierarchy + # Overhead Shared Object / Symbol + 65.00% [kernel.kallsyms] + 50.00% [k] kfunc1 + 15.00% [k] kfunc2 + 30.00% perf + 20.00% [.] foo + 10.00% [.] bar + 5.00% libc.so + 5.00% [.] libcall --inline:: If a callgraph address belongs to an inlined function, the inline stack @@ -558,6 +641,12 @@ include::itrace.txt[] 'Avg Cycles%' - block average sampled cycles / sum of total block average sampled cycles 'Avg Cycles' - block average sampled cycles + 'Branch Counter' - block branch counter histogram (with -v showing the number) + +--skip-empty:: + Do not print 0 results in the --stat output. 
+ +include::cpu-and-latency-overheads.txt[] include::callchain-overhead-calculation.txt[] diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 5fbe42bd599b..6dbbddb6464d 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -20,6 +20,26 @@ There are several variants of 'perf sched': 'perf sched latency' to report the per task scheduling latencies and other scheduling properties of the workload. + Example usage: + perf sched record -- sleep 1 + perf sched latency + + ------------------------------------------------------------------------------------------------------------------------------------------- + Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end | + ------------------------------------------------------------------------------------------------------------------------------------------- + perf:(2) | 2.804 ms | 66 | avg: 0.524 ms | max: 1.069 ms | max start: 254752.314960 s | max end: 254752.316029 s + NetworkManager:1343 | 0.372 ms | 13 | avg: 0.008 ms | max: 0.013 ms | max start: 254751.551153 s | max end: 254751.551166 s + kworker/1:2-xfs:4649 | 0.012 ms | 1 | avg: 0.008 ms | max: 0.008 ms | max start: 254751.519807 s | max end: 254751.519815 s + kworker/3:1-xfs:388 | 0.011 ms | 1 | avg: 0.006 ms | max: 0.006 ms | max start: 254751.519809 s | max end: 254751.519815 s + sleep:147736 | 0.938 ms | 3 | avg: 0.006 ms | max: 0.007 ms | max start: 254751.313817 s | max end: 254751.313824 s + + It shows Runtime(time that a task spent actually running on the CPU), + Count(number of times a delay was calculated) and delay(time that a + task was ready to run but was kept waiting). + + Tasks with the same command name are merged and the merge count is + given within (), However if -p option is used, pid is mentioned. + 'perf sched script' to see a detailed trace of the workload that was recorded (aliased to 'perf script' for now). @@ -44,8 +64,8 @@ There are several variants of 'perf sched': By default it shows the individual schedule events, including the wait time (time between sched-out and next sched-in events for the task), the - task scheduling delay (time between wakeup and actually running) and run - time for the task: + task scheduling delay (time between runnable and actually running) and + run time for the task: time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) @@ -78,6 +98,22 @@ OPTIONS --force:: Don't complain, do it. +OPTIONS for 'perf sched latency' +------------------------------- + +-C:: +--CPU <n>:: + CPU to profile on. + +-p:: +--pids:: + latency stats per pid instead of per command name. + +-s:: +--sort <key[,key2...]>:: + sort by key(s): runtime, switch, avg, max + by default it's sorted by "avg ,max ,switch ,runtime". + OPTIONS for 'perf sched map' ---------------------------- @@ -94,6 +130,16 @@ OPTIONS for 'perf sched map' --color-pids:: Highlight the given pids. +--task-name <task>:: + Map output only for the given task name(s). Separate the + task names with a comma (without whitespace). The sched-out + time is printed and is represented by '*-' for the given + task name(s). + ('-' indicates other tasks while '.' is idle). + +--fuzzy-name:: + Given task name(s) can be partially matched (fuzzy matching). + OPTIONS for 'perf sched timehist' --------------------------------- -k:: @@ -166,6 +212,30 @@ OPTIONS for 'perf sched timehist' --state:: Show task state when it switched out. 
+--show-prio:: + Show task priority. + +--prio:: + Only show events for given task priority(ies). Multiple priorities can be + provided as a comma-separated list with no spaces: 0,120. Ranges of + priorities are specified with -: 120-129. A combination of both can also be + provided: 0,120-129. + +-P:: +--pre-migrations:: + Show pre-migration wait time. pre-migration wait time is the time spent + by a task waiting on a runqueue but not getting the chance to run there + and is migrated to a different runqueue where it is finally run. This + time between sched_wakeup and migrate_task is the pre-migration wait + time. + +OPTIONS for 'perf sched replay' +------------------------------ + +-r:: +--repeat <n>:: + repeat the workload n times (0: infinite). Default is 10. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5a1f68122f50..5b479f5e62ff 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -54,8 +54,8 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. -The format file for the sched_wakep event defines the following fields -(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): +The format file for the sched_wakeup event defines the following fields +(see /sys/kernel/tracing/events/sched/sched_wakeup/format): ---- format: diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 0fb9eda3cbca..27a1cac6fe76 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -167,7 +167,7 @@ below). Following those are the 'event handler' functions generated one for every event in the 'perf record' output. The handler functions take -the form subsystem__event_name, and contain named parameters, one for +the form subsystem\__event_name, and contain named parameters, one for each field in the event; in this case, there's only one event, raw_syscalls__sys_enter(). (see the EVENT HANDLERS section below for more info on event handlers). @@ -319,7 +319,7 @@ So those are the essential steps in writing and running a script. The process can be generalized to any tracepoint or set of tracepoints you're interested in - basically find the tracepoint(s) you're interested in by looking at the list of available events shown by -'perf list' and/or look in /sys/kernel/debug/tracing/events/ for +'perf list' and/or look in /sys/kernel/tracing/events/ for detailed event and field info, record the corresponding trace data using 'perf record', passing it the list of interesting events, generate a skeleton script using 'perf script -g python' and modify the @@ -448,8 +448,8 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. 
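For instance, such a system-wide trace could be captured with a command along these lines (a hypothetical invocation mirroring the record step referenced above):

    # perf record -a -e sched:sched_wakeup -- sleep 1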
-The format file for the sched_wakep event defines the following fields -(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): +The format file for the sched_wakeup event defines the following fields +(see /sys/kernel/tracing/events/sched/sched_wakeup/format): ---- format: @@ -550,6 +550,27 @@ def trace_unhandled(event_name, context, event_fields_dict): pass ---- +*process_event*, if defined, is called for any non-tracepoint event + +---- +def process_event(param_dict): + pass +---- + +*context_switch*, if defined, is called for any context switch + +---- +def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_preempt, *x): + pass +---- + +*auxtrace_error*, if defined, is called for any AUX area tracing error + +---- +def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x): + pass +---- + The remaining sections provide descriptions of each of the available built-in perf script Python modules and their associated functions. @@ -592,12 +613,18 @@ common, but need to be made accessible to user scripts nonetheless. perf_trace_context defines a set of functions that can be used to access this data in the context of the current event. Each of these functions expects a context variable, which is the same as the -context variable passed into every event handler as the second -argument. +context variable passed into every tracepoint event handler as the second +argument. For non-tracepoint events, the context variable is also present +as perf_trace_context.perf_script_context . common_pc(context) - returns common_preempt count for the current event common_flags(context) - returns common_flags for the current event common_lock_depth(context) - returns common_lock_depth for the current event + perf_sample_insn(context) - returns the machine code instruction + perf_set_itrace_options(context, itrace_options) - set --itrace options if they have not been set already + perf_sample_srcline(context) - returns source_file_name, line_number + perf_sample_srccode(context) - returns source_file_name, line_number, source_line + perf_config_get(config_name) - returns the value of the named config item, or None if unset Util.py Module ~~~~~~~~~~~~~~ @@ -615,10 +642,21 @@ SUPPORTED FIELDS Currently supported fields: -ev_name, comm, pid, tid, cpu, ip, time, period, phys_addr, addr, -symbol, dso, time_enabled, time_running, values, callchain, +ev_name, comm, id, stream_id, pid, tid, cpu, ip, time, period, phys_addr, +addr, symbol, symoff, dso, time_enabled, time_running, values, callchain, brstack, brstacksym, datasrc, datasrc_decode, iregs, uregs, -weight, transaction, raw_buf, attr. +weight, transaction, raw_buf, attr, cpumode. + +Fields that may also be present: + + flags - sample flags + flags_disp - sample flags display + insn_cnt - instruction count for determining instructions-per-cycle (IPC) + cyc_cnt - cycle count for determining IPC + addr_correlates_sym - addr can correlate to a symbol + addr_dso - addr dso + addr_symbol - addr symbol + addr_symoff - addr symbol offset Some fields have sub items: diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 372dfd110e6d..28bec7e78bc8 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -79,6 +79,9 @@ OPTIONS --dump-raw-trace=:: Display verbose dump of the trace data. +--dump-unsorted-raw-trace=:: + Same as --dump-raw-trace but not sorted in time order. 
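A quick way to compare the two dump modes (any existing perf.data will do):

    $ perf script --dump-raw-trace | head
    $ perf script --dump-unsorted-raw-trace | head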
+ -L:: --Latency=:: Show latency attributes (irqs/preemption disabled, etc). @@ -98,6 +101,18 @@ OPTIONS Generate perf-script.[ext] starter script for given language, using current perf.data. +--dlfilter=<file>:: + Filter sample events using the given shared object file. + Refer linkperf:perf-dlfilter[1] + +--dlarg=<arg>:: + Pass 'arg' as an argument to the dlfilter. --dlarg may be repeated + to add more arguments. + +--list-dlfilters:: + Display a list of available dlfilters. Use with option -v (must come + before option --list-dlfilters) to show long descriptions. + -a:: Force system-wide collection. Scripts run without a <command> normally use -a by default, while scripts run with a <command> @@ -115,9 +130,12 @@ OPTIONS -F:: --fields:: Comma separated list of fields to print. Options are: - comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, - srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, - brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc, srccode, ipc. + comm, tid, pid, time, cpu, event, trace, ip, sym, dso, dsoff, addr, symoff, + srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, + brstackinsn, brstackinsnlen, brstackdisasm, brstackoff, callindent, insn, disasm, + insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, + code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, brcntr, + Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace @@ -182,23 +200,27 @@ OPTIONS At this point usage is displayed, and perf-script exits. The flags field is synthesized and may have a value when Instruction - Trace decoding. The flags are "bcrosyiABEx" which stand for branch, + Trace decoding. The flags are "bcrosyiABExghDt" which stand for branch, call, return, conditional, system, asynchronous, interrupt, - transaction abort, trace begin, trace end, and in transaction, - respectively. Known combinations of flags are printed more nicely e.g. + transaction abort, trace begin, trace end, in transaction, VM-Entry, + VM-Exit, interrupt disabled and interrupt disable toggle respectively. + Known combinations of flags are printed more nicely e.g. "call" for "bc", "return" for "br", "jcc" for "bo", "jmp" for "b", "int" for "bci", "iret" for "bri", "syscall" for "bcs", "sysret" for "brs", "async" for "by", "hw int" for "bcyi", "tx abrt" for "bA", "tr strt" for "bB", - "tr end" for "bE". However the "x" flag will be display separately in those - cases e.g. "jcc (x)" for a condition branch within a transaction. + "tr end" for "bE", "vmentry" for "bcg", "vmexit" for "bch". + However the "x", "D" and "t" flags will be displayed separately in those + cases e.g. "jcc (xD)" for a condition branch within a transaction + with interrupts disabled. Note, interrupts becoming disabled is "t", + whereas interrupts becoming enabled is "Dt". The callindent field is synthesized and may have a value when Instruction Trace decoding. For calls and returns, it will display the name of the symbol indented with spaces to reflect the stack depth. - When doing instruction trace decoding insn and insnlen give the - instruction bytes and the instruction length of the current - instruction. 
+ When doing instruction trace decoding, insn, disasm and insnlen give the + instruction bytes, disassembled instructions (requires libcapstone support) + and the instruction length of the current instruction respectively. The synth field is used by synthesized events which may be created when Instruction Trace decoding. @@ -206,17 +228,33 @@ OPTIONS The ipc (instructions per cycle) field is synthesized and may have a value when Instruction Trace decoding. + The machine_pid and vcpu fields are derived from data resulting from using + perf inject to insert a perf.data file recorded inside a virtual machine into + a perf.data file recorded on the host at the same time. + + The cgroup fields requires sample having the cgroup id which is saved + when "--all-cgroups" option is passed to 'perf record'. + Finally, a user may not set fields to none for all event types. i.e., -F "" is not allowed. The brstack output includes branch related information with raw addresses using the - /v/v/v/v/cycles syntax in the following order: - FROM: branch source instruction - TO : branch target instruction - M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported - X/- : X=branch inside a transactional region, -=not in transaction region or not supported - A/- : A=TSX abort entry, -=not aborted region or not supported - cycles + FROM/TO/EVENT/INTX/ABORT/CYCLES/TYPE/SPEC syntax in the following order: + FROM : branch source instruction + TO : branch target instruction + EVENT : M=branch target or direction was mispredicted + P=branch target or direction was predicted + N=branch not-taken + -=no event or not supported + INTX : X=branch inside a transactional region + -=branch not in transaction region or not supported + ABORT : A=TSX abort entry + -=not aborted region or not supported + CYCLES: the number of cycles that have elapsed since the last branch was recorded + TYPE : branch type: COND/UNCOND/IND/CALL/IND_CALL/RET etc. + -=not supported + SPEC : branch speculation info: SPEC_WRONG_PATH/NON_SPEC_CORRECT_PATH/SPEC_CORRECT_PATH + -=not supported The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible. @@ -224,6 +262,13 @@ OPTIONS is printed. This is the full execution path leading to the sample. This is only supported when the sample was recorded with perf record -b or -j any. + Use brstackinsnlen to print the brstackinsn lenght. For example, you + can’t know the next sequential instruction after an unconditional branch unless + you calculate that based on its length. + + brstackdisasm acts like brstackinsn, but will print disassembled instructions if + perf is built with the capstone library. + The brstackoff field will print an offset into a specific dso/binary. With the metric option perf script can compute metrics for @@ -322,6 +367,10 @@ OPTIONS --show-cgroup-events Display cgroup events i.e. events of type PERF_RECORD_CGROUP. +--show-text-poke-events + Display text poke events i.e. events of type PERF_RECORD_TEXT_POKE and + PERF_RECORD_KSYMBOL. + --demangle:: Demangle symbol names to human readable form. It's enabled by default, disable with --no-demangle. @@ -329,6 +378,9 @@ OPTIONS --demangle-kernel:: Demangle kernel symbol names to human readable form (for C++ kernels). +--addr2line=<path>:: + Path to addr2line binary. + --header Show perf.data header. @@ -405,9 +457,10 @@ include::itrace.txt[] will be printed. Each entry has function name and file/line. 
Enabled by default, disable with --no-inline. ---insn-trace:: - Show instruction stream for intel_pt traces. Combine with --xed to - show disassembly. +--insn-trace[=<raw|disasm>]:: + Show instruction stream in bytes (raw) or disassembled (disasm) + for intel_pt traces. The default is 'raw'. To use xed, combine + 'raw' with --xed to show disassembly done by xed. --xed:: Run xed disassembler on output. Requires installing the xed disassembler. @@ -417,9 +470,32 @@ include::itrace.txt[] Only consider the listed symbols. Symbols are typically a name but they may also be hexadecimal address. + The hexadecimal address may be the start address of a symbol or + any other address to filter the trace records + For example, to select the symbol noploop or the address 0x4007a0: perf script --symbols=noploop,0x4007a0 + Support filtering trace records by symbol name, start address of + symbol, any hexadecimal address and address range. + + The comparison order is: + + 1. symbol name comparison + 2. symbol start address comparison. + 3. any hexadecimal address comparison. + 4. address range comparison (see --addr-range). + +--addr-range:: + Use with -S or --symbols to list traced records within address range. + + For example, to list the traced records within the address range + [0x4007a0, 0x0x4007a9]: + perf script -S 0x4007a0 --addr-range 10 + +--dsos=:: + Only consider symbols in these DSOs. + --call-trace:: Show call stream for intel_pt traces. The CPUs are interleaved, but can be filtered with -C. @@ -446,12 +522,17 @@ include::itrace.txt[] perf record --call-graph lbr. Disabled by default. In common cases with call stack overflows, it can recreate better call stacks than the default lbr call stack - output. But this approach is not full proof. There can be cases + output. But this approach is not foolproof. There can be cases where it creates incorrect call stacks from incorrect matches. The known limitations include exception handing such as setjmp/longjmp will have calls/returns not match. +:GMEXAMPLECMD: script +:GMEXAMPLESUBCMD: +include::guest-files.txt[] + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], -linkperf:perf-script-python[1], linkperf:perf-intel-pt[1] +linkperf:perf-script-python[1], linkperf:perf-intel-pt[1], +linkperf:perf-dlfilter[1] diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index b029ee728a0b..61d091670dee 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -9,8 +9,8 @@ SYNOPSIS -------- [verse] 'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command> -'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>] -'perf stat' [-e <EVENT> | --event=EVENT] [-a] record [-o file] -- <command> [<options>] +'perf stat' [-e <EVENT> | --event=EVENT] [-a] \-- <command> [<options>] +'perf stat' [-e <EVENT> | --event=EVENT] [-a] record [-o file] \-- <command> [<options>] 'perf stat' report [-i file] DESCRIPTION @@ -36,8 +36,14 @@ report:: - a symbolic event name (use 'perf list' to list all events) - - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. + - a raw PMU event in the form of rN where N is a hexadecimal value + that represents the raw register encoding with the layout of the + event control registers as described by entries in + /sys/bus/event_source/devices/cpu/format/*. + + - a symbolic or raw PMU event followed by an optional colon + and a list of event modifiers, e.g., cpu-cycles:p. 
See the + linkperf:perf-list[1] man page for details on event modifiers. - a symbolically formed event like 'pmu/param1=0x3,param2/' where param1 and param2 are defined as formats for the PMU in @@ -71,6 +77,37 @@ report:: --tid=<tid>:: stat events on existing thread id (comma separated list) +-b:: +--bpf-prog:: + stat events on existing bpf program id (comma separated list), + requiring root rights. bpftool-prog could be used to find program + id all bpf programs in the system. For example: + + # bpftool prog | head -n 1 + 17247: tracepoint name sys_enter tag 192d548b9d754067 gpl + + # perf stat -e cycles,instructions --bpf-prog 17247 --timeout 1000 + + Performance counter stats for 'BPF program(s) 17247': + + 85,967 cycles + 28,982 instructions # 0.34 insn per cycle + + 1.102235068 seconds time elapsed + +--bpf-counters:: + Use BPF programs to aggregate readings from perf_events. This + allows multiple perf-stat sessions that are counting the same metric (cycles, + instructions, etc.) to share hardware counters. + To use BPF programs on common events by default, use + "perf config stat.bpf-counter-events=<list_of_events>". + +--bpf-attr-map:: + With option "--bpf-counters", different perf-stat sessions share + information about shared BPF programs and maps via a pinned hashmap. + Use "--bpf-attr-map" to specify the path of this pinned hashmap. + The default path is /sys/fs/bpf/perf_attr_map. + ifdef::HAVE_LIBPFM[] --pfm-events events:: Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net) @@ -120,7 +157,10 @@ Do not aggregate counts across all monitored CPUs. -n:: --null:: - null run - don't start any counters +null run - Don't start any counters. + +This can be useful to measure just elapsed wall-clock time - or to assess the +raw overhead of perf stat itself, without running any counters. -v:: --verbose:: @@ -162,6 +202,12 @@ use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'. If wanting to monitor, say, 'cycles' for a cgroup and also for system wide, this command line can be used: 'perf stat -e cycles -G cgroup_name -a -e cycles'. +--for-each-cgroup name:: +Expand event list for each cgroup in "name" (allow multiple cgroups separated +by comma). It also support regex patterns to match multiple groups. This has same +effect that repeating -e option and -G option for each event x name. This option +cannot be used with -G/--cgroup option. + -o file:: --output file:: Print the output into the designated file. @@ -173,14 +219,55 @@ Append to the output file designated with the -o option. Ignored if -o is not sp Log output to fd, instead of stderr. Complementary to --output, and mutually exclusive with it. --append may be used here. Examples: - 3>results perf stat --log-fd 3 -- $cmd - 3>>results perf stat --log-fd 3 --append -- $cmd + 3>results perf stat --log-fd 3 \-- $cmd + 3>>results perf stat --log-fd 3 --append \-- $cmd + +--control=fifo:ctl-fifo[,ack-fifo]:: +--control=fd:ctl-fd[,ack-fd]:: +ctl-fifo / ack-fifo are opened and used as ctl-fd / ack-fd as follows. +Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, +'disable': disable events). Measurements can be started with events disabled using +--delay=-1 option. Optionally send control command completion ('ack\n') to ack-fd descriptor +to synchronize with the controlling process. 
Example of bash shell script to enable and +disable events during measurements: + + #!/bin/bash + + ctl_dir=/tmp/ + + ctl_fifo=${ctl_dir}perf_ctl.fifo + test -p ${ctl_fifo} && unlink ${ctl_fifo} + mkfifo ${ctl_fifo} + exec {ctl_fd}<>${ctl_fifo} + + ctl_ack_fifo=${ctl_dir}perf_ctl_ack.fifo + test -p ${ctl_ack_fifo} && unlink ${ctl_ack_fifo} + mkfifo ${ctl_ack_fifo} + exec {ctl_fd_ack}<>${ctl_ack_fifo} + + perf stat -D -1 -e cpu-cycles -a -I 1000 \ + --control fd:${ctl_fd},${ctl_fd_ack} \ + \-- sleep 30 & + perf_pid=$! + + sleep 5 && echo 'enable' >&${ctl_fd} && read -u ${ctl_fd_ack} e1 && echo "enabled(${e1})" + sleep 10 && echo 'disable' >&${ctl_fd} && read -u ${ctl_fd_ack} d1 && echo "disabled(${d1})" + + exec {ctl_fd_ack}>&- + unlink ${ctl_ack_fifo} + + exec {ctl_fd}>&- + unlink ${ctl_fifo} + + wait -n ${perf_pid} + exit $? + --pre:: --post:: Pre and post measurement hooks, e.g.: -perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- make -s -j64 O=defconfig-build/ bzImage +perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' \-- make -s -j64 O=defconfig-build/ bzImage -I msecs:: --interval-print msecs:: @@ -221,6 +308,22 @@ use --per-die in addition to -a. (system-wide). The output includes the die number and the number of online processors on that die. This is useful to gauge the amount of aggregation. +--per-cluster:: +Aggregate counts per processor cluster for system-wide mode measurement. This +is a useful mode to detect imbalance between clusters. To enable this mode, +use --per-cluster in addition to -a. (system-wide). The output includes the +cluster number and the number of online processors on that cluster. This is +useful to gauge the amount of aggregation. The information of cluster ID and +related CPUs can be gotten from /sys/devices/system/cpu/cpuX/topology/cluster_{id, cpus}. + +--per-cache:: +Aggregate counts per cache instance for system-wide mode measurements. By +default, the aggregation happens for the cache level at the highest index +in the system. To specify a particular level, mention the cache level +alongside the option in the format [Ll][1-9][0-9]*. For example: +Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the +information at the boundary of the level 3 cache in the system. + --per-core:: Aggregate counts per physical processor for system-wide mode measurements. This is a useful mode to detect imbalance between physical cores. To enable this mode, @@ -238,8 +341,9 @@ mode, use --per-node in addition to -a. (system-wide). -D msecs:: --delay msecs:: -After starting the program, wait msecs before measuring. This is useful to -filter out the startup phase of the program, which is often very different. +After starting the program, wait msecs before measuring (-1: start with events +disabled). This is useful to filter out the startup phase of the program, +which is often very different. -T:: --transaction:: @@ -265,6 +369,19 @@ small group that need not have multiplexing is lowered. This option forbids the event merging logic from sharing events between groups and may be used to increase accuracy in this case. +--metric-no-threshold:: +Metric thresholds may increase the number of events necessary to +compute whether a metric has exceeded its threshold expression. This +may not be desirable, for example, as the events can introduce +multiplexing. This option disables the adding of threshold expression +events for a metric. 
However, if there are sufficient events to +compute the threshold then the threshold is still computed and used to +color the metric's computed value. + +--quiet:: +Don't print output, warnings or messages. This is useful with perf stat +record below to only write data to the perf.data file. + STAT RECORD ----------- Stores stat data into perf data file. @@ -287,6 +404,17 @@ Aggregate counts per processor socket for system-wide mode measurements. --per-die:: Aggregate counts per processor die for system-wide mode measurements. +--per-cluster:: +Aggregate counts perf processor cluster for system-wide mode measurements. + +--per-cache:: +Aggregate counts per cache instance for system-wide mode measurements. By +default, the aggregation happens for the cache level at the highest index +in the system. To specify a particular level, mention the cache level +alongside the option in the format [Ll][1-9][0-9]*. For example: Using +option "--per-cache=l3" or "--per-cache=L3" will aggregate the +information at the boundary of the level 3 cache in the system. + --per-core:: Aggregate counts per physical processor for system-wide mode measurements. @@ -295,17 +423,50 @@ Aggregate counts per physical processor for system-wide mode measurements. Print metrics or metricgroups specified in a comma separated list. For a group all metrics from the group are added. The events from the metrics are automatically measured. -See perf list output for the possble metrics and metricgroups. +See perf list output for the possible metrics and metricgroups. + + When threshold information is available for a metric, the + color red is used to signify a metric has exceeded a threshold + while green shows it hasn't. The default color means that + no threshold information was available or the threshold + couldn't be computed. -A:: --no-aggr:: -Do not aggregate counts across all monitored CPUs. +--no-merge:: +Do not aggregate/merge counts across monitored CPUs or PMUs. + +When multiple events are created from a single event specification, +stat will, by default, aggregate the event counts and show the result +in a single row. This option disables that behavior and shows the +individual events and counts. + +Multiple events are created from a single event specification when: + +1. PID monitoring isn't requested and the system has more than one + CPU. For example, a system with 8 SMT threads will have one event + opened on each thread and aggregation is performed across them. + +2. Prefix or glob wildcard matching is used for the PMU name. For + example, multiple memory controller PMUs may exist typically with a + suffix of _0, _1, etc. By default the event counts will all be + combined if the PMU is specified without the suffix such as + uncore_imc rather than uncore_imc_0. + +3. Aliases, which are listed immediately after the Kernel PMU events + by perf list, are used. + +--hybrid-merge:: +Merge core event counts from all core PMUs. In hybrid or big.LITTLE +systems by default each core PMU will report its count +separately. This option forces core PMU counts to be combined to give +a behavior closer to having a single CPU type in the system. --topdown:: -Print top down level 1 metrics if supported by the CPU. This allows to -determine bottle necks in the CPU pipeline for CPU bound workloads, -by breaking the cycles consumed down into frontend bound, backend bound, -bad speculation and retiring. +Print top-down metrics supported by the CPU. 
This allows one to determine
+bottlenecks in the CPU pipeline for CPU bound workloads, by breaking
+the cycles consumed down into frontend bound, backend bound, bad
+speculation and retiring.
 
 Frontend bound means that the CPU cannot fetch and decode instructions fast
 enough. Backend bound means that computation or memory access is the bottle
@@ -317,6 +478,11 @@ if the workload is actually bound by the CPU and not by something else.
 For best results it is usually a good idea to use it with interval
 mode like -I 1000, as the bottleneck of workloads can change often.
 
+This enables --metric-only, unless overridden with --no-metric-only.
+
+The following restrictions only apply to older Intel CPUs and Atom;
+on newer CPUs (IceLake and later) TopDown can be collected for any thread:
+
 The top down metrics are collected per core instead of per CPU thread.
 Per core mode is automatically enabled
 and -a (global monitoring) is needed, requiring root rights or
@@ -328,24 +494,39 @@ echo 0 > /proc/sys/kernel/nmi_watchdog
 for best results. Otherwise the bottlenecks may be inconsistent
 on workload with changing phases.
 
-This enables --metric-only, unless overridden with --no-metric-only.
-
 To interpret the results it is usually needed to know on which
 CPUs the workload runs on. If needed the CPUs can be forced using
 taskset.
 
---no-merge::
-Do not merge results from same PMUs.
-
-When multiple events are created from a single event specification,
-stat will, by default, aggregate the event counts and show the result
-in a single row. This option disables that behavior and shows
-the individual events and counts.
-
-Multiple events are created from a single event specification when:
-1. Prefix or glob matching is used for the PMU name.
-2. Aliases, which are listed immediately after the Kernel PMU events
-   by perf list, are used.
+--record-tpebs::
+Enable automatic sampling on Intel TPEBS retire_latency events (events with the :R
+modifier). Without this option, perf would not capture dynamic retire_latency
+at runtime. Currently, a zero value is assigned to the retire_latency event when
+this option is not set. The TPEBS hardware feature starts with the Intel Granite
+Rapids microarchitecture. This option only exists on X86_64 and is meaningful on
+Intel platforms with the TPEBS feature.
+
+--tpebs-mode=[mean|min|max|last]::
+Set how retirement latency events have their sample times
+combined. The default "mean" gives the average of retirement
+latency. "min" or "max" give the smallest or largest retirement latency
+times respectively. "last" uses the last retirement latency sample's
+time.
+
+--td-level::
+Print the top-down statistics that equal the input level. It allows
+users to print the top-down metrics level of interest instead of the
+level 1 top-down metrics.
+
+As the higher levels gather more metrics and use more counters they
+will be less accurate. By convention a metric can be examined by
+appending '_group' to it and this will increase accuracy compared to
+gathering all metrics for a level. For example, level 1 analysis may
+highlight 'tma_frontend_bound'. This metric may be drilled into with
+'tma_frontend_bound_group' with
+'perf stat -M tma_frontend_bound_group...'.
+
+Error out if the input is higher than the supported max level.
 
 --smi-cost::
 Measure SMI cost if msr/aperf/ and msr/smi/ events are supported.
@@ -376,10 +557,26 @@ counts for all hardware threads in a core but show the sum counts per
 hardware thread. This is essentially a replacement for the any bit and
 convenient for post processing.
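A hedged usage sketch of the per-core style aggregation discussed above (the events and the workload are arbitrary):

    $ perf stat -a --per-core -e cycles -- sleep 1
    $ perf stat -a --per-cache=L3 -e instructions -- sleep 1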
+--summary:: +Print summary for interval mode (-I). + +--no-csv-summary:: +Don't print 'summary' at the first column for CVS summary output. +This option must be used with -x and --summary. + +This option can be enabled in perf config by setting the variable +'stat.no-csv-summary'. + +$ perf config stat.no-csv-summary=true + +--cputype:: +Only enable events on applying cpu with this type for hybrid platform +(e.g. core or atom)" + EXAMPLES -------- -$ perf stat -- make +$ perf stat \-- make Performance counter stats for 'make': @@ -435,6 +632,29 @@ The fields are in this order: Additional metrics may be printed with all earlier fields being empty. +include::intel-hybrid.txt[] + +JSON FORMAT +----------- + +With -j, perf stat is able to print out a JSON format output +that can be used for parsing. + +- timestamp : optional usec time stamp in fractions of second (with -I) +- optional aggregate options: + - core : core identifier (with --per-core) + - die : die identifier (with --per-die) + - socket : socket identifier (with --per-socket) + - node : node identifier (with --per-node) + - thread : thread identifier (with --per-thread) +- counter-value : counter value +- unit : unit of the counter value or empty +- event : event name +- variance : optional variance if multiple values are collected (with -r) +- runtime : run time of counter +- metric-value : optional metric value +- metric-unit : optional unit of metric + SEE ALSO -------- linkperf:perf-top[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index b329c65d7f40..32da0d1fa86a 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -28,9 +28,44 @@ OPTIONS Tests to skip (comma separated numeric list). -v:: +-vv:: +-vvv:: --verbose:: - Be more verbose. + With a single '-v', verbose level 1, only failing test output + is displayed. With '-vv' and higher all test output is shown. + +-S:: +--sequential:: + Run all tests one after the other. By default "exclusive" + tests are run sequentially, but other tests are run in + parallel to speed execution. + +-r:: +--runs-per-test:: + Run each test the given number of times, by default once. This + option can be useful to determine if a test is flaky. -F:: --dont-fork:: - Do not fork child for each test, run all tests within single process. + Do not fork child for each test, run all tests within single process, this + sets sequential mode. + +--dso:: + Specify a DSO for the "Symbols" test. + +-w:: +--workload=:: + Run a built-in workload, to list them use '--list-workloads', current ones include: + noploop, thloop, leafloop, sqrtloop, brstack, datasym and landlock. + + Used with the shell script regression tests. + + Some accept an extra parameter: + + seconds: leafloop, noploop, sqrtloop, thloop + nrloops: brstack + + The datasym and landlock workloads don't accept any. + +--list-workloads:: + List the available workloads to use with -w/--workload. diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index ee2024691d46..af3e4230c72f 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -38,9 +38,14 @@ Default is to monitor all CPUS. -e <event>:: --event=<event>:: Select the PMU event. Selection can be a symbolic event name - (use 'perf list' to list all events) or a raw PMU - event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. 
+ (use 'perf list' to list all events) or a raw PMU event in the form + of rN where N is a hexadecimal value that represents the raw register + encoding with the layout of the event control registers as described + by entries in /sys/bus/event_source/devices/cpu/format/*. + +--filter=<filter>:: + Event filter. This option should follow an event selector (-e). For + syntax see linkperf:perf-record[1]. -E <entries>:: --entries=<entries>:: @@ -50,9 +55,6 @@ Default is to monitor all CPUS. --count-filter=<count>:: Only display functions with more events than this. ---group:: - Put the counters into a counter group. - --group-sort-idx:: Sort the output by the event at the index n in group. If n is invalid, sort by the first event. It can support multiple groups with different @@ -81,8 +83,8 @@ Default is to monitor all CPUS. -m <pages>:: --mmap-pages=<pages>:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -p <pid>:: --pid=<pid>:: @@ -163,6 +165,12 @@ Default is to monitor all CPUS. -M:: --disassembler-style=:: Set disassembler style for objdump. +--addr2line=<path>:: + Path to addr2line binary. + +--objdump=<path>:: + Path to objdump binary. + --prefix=PREFIX:: --prefix-strip=N:: Remove first N entries from source file path names in executables @@ -250,11 +258,45 @@ Default is to monitor all CPUS. The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k Note that this feature may not be available on all processors. +--branch-history:: + Add the addresses of sampled taken branches to the callstack. + This allows to examine the path the program took to each sample. + --raw-trace:: When displaying traceevent output, do not use print fmt or plugins. +-H:: --hierarchy:: - Enable hierarchy output. + Enable hierarchical output. In the hierarchy mode, each sort key groups + samples based on the criteria and then sub-divide it using the lower + level sort key. + + For example, in normal output: + + perf report -s dso,sym + # + # Overhead Shared Object Symbol + # ........ ................. ........... + 50.00% [kernel.kallsyms] [k] kfunc1 + 20.00% perf [.] foo + 15.00% [kernel.kallsyms] [k] kfunc2 + 10.00% perf [.] bar + 5.00% libc.so [.] libcall + + In hierarchy output: + + perf report -s dso,sym --hierarchy + # + # Overhead Shared Object / Symbol + # .......... ...................... + 65.00% [kernel.kallsyms] + 50.00% [k] kfunc1 + 15.00% [k] kfunc2 + 30.00% perf + 20.00% [.] foo + 10.00% [.] bar + 5.00% libc.so + 5.00% [.] libcall --overwrite:: Enable this to use just the most recent records, which helps in high core count @@ -277,6 +319,18 @@ Default is to monitor all CPUS. Record events of type PERF_RECORD_NAMESPACES and display it with the 'cgroup_id' sort key. +-G name:: +--cgroup name:: +monitor only in the container (cgroup) called "name". This option is available only +in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to +container "name" are monitored when they run on the monitored CPUs. Multiple cgroups +can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup +to first event, second cgroup to second event and so on. It is possible to provide +an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. 
Cgroups must have +corresponding events, i.e., they always refer to events defined earlier on the command +line. If the user wants to track multiple events for a specific cgroup, the user can +use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'. + --all-cgroups:: Record events of type PERF_RECORD_CGROUP and display it with the 'cgroup' sort key. @@ -300,10 +354,10 @@ Default is to monitor all CPUS. perf top -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv - Alternatively one can ask for --group and then two overhead columns + Alternatively one can ask for a group and then two overhead columns will appear, the first for cycles and the second for the switch-on event. - perf top --group -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv + perf top -e '{cycles,probe:icmp_rcv}' --switch-on=probe:icmp_rcv This may be interesting to measure a workload only after some initialization phase is over, i.e. insert a perf probe at that point and use the above @@ -317,14 +371,14 @@ Default is to monitor all CPUS. but probably we'll make the default not to show the switch-on/off events on the --group mode and if there is only one event besides the off/on ones, go straight to the histogram browser, just like 'perf top' with no events - explicitely specified does. + explicitly specified does. --stitch-lbr:: Show callgraph with stitched LBRs, which may have more complete callgraph. The option must be used with --call-graph lbr recording. Disabled by default. In common cases with call stack overflows, it can recreate better call stacks than the default lbr call stack - output. But this approach is not full proof. There can be cases + output. But this approach is not foolproof. There can be cases where it creates incorrect call stacks from incorrect matches. The known limitations include exception handing such as setjmp/longjmp will have calls/returns not match. diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index abc9b5d83312..c1fb6056a0d3 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -97,8 +97,8 @@ filter out the startup phase of the program, which is often very different. Filter out events for these pids and for 'trace' itself (comma separated list). -v:: ---verbose=:: - Verbosity level. +--verbose:: + Increase the verbosity level. --no-inherit:: Child tasks do not inherit counters. @@ -106,8 +106,8 @@ filter out the startup phase of the program, which is often very different. -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -C:: --cpu:: @@ -150,6 +150,11 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. To be used with -s or -S, to show stats for the errnos experienced by syscalls, using only this option will trigger --summary. +--summary-mode=mode:: + To be used with -s or -S, to select how to show summary. By default it'll + show the syscall summary by thread. Possible values are: thread, total, + cgroup. + --tool_stats:: Show tool stats such as number of times fd->pathname was discovered thru hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc. @@ -241,6 +246,17 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. 
printing using the existing 'perf trace' syscall arg beautifiers to map integer arguments to strings (pid to comm, syscall id to syscall name, etc). +--force-btf:: + Use btf_dump to pretty print syscall argument data, instead of using hand-crafted pretty + printers. This option is intended for testing BTF integration in perf trace. btf_dump-based + pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can + better pretty-print integer flags and struct pointers. + +--bpf-summary:: + Collect system call statistics in BPF. This is only for live mode and + works well with -s/--summary option where no argument information is + required. + PAGEFAULTS ---------- diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index b6472e463284..cd95ba09f727 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -43,7 +43,7 @@ struct perf_file_section { Flags section: -For each of the optional features a perf_file_section it placed after the data +For each of the optional features a perf_file_section is placed after the data section if the feature bit is set in the perf_header flags bitset. The respective perf_file_section points to the data of the additional header and defines its size. @@ -346,7 +346,7 @@ to special needs. HEADER_BPF_PROG_INFO = 25, -struct bpf_prog_info_linear, which contains detailed information about +struct perf_bpil, which contains detailed information about a BPF program, including type, id, tag, jited/xlated instructions, etc. HEADER_BPF_BTF = 26, @@ -370,7 +370,7 @@ struct { u32 mmap_len; }; -Indicates that trace contains records of PERF_RECORD_COMPRESSED type +Indicates that trace contains records of PERF_RECORD_COMPRESSED2 type that have perf_events records in compressed form. HEADER_CPU_PMU_CAPS = 28, @@ -389,6 +389,54 @@ struct { Example: cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake + HEADER_CLOCK_DATA = 29, + + Contains clock id and its reference time together with wall clock + time taken at the 'same time', both values are in nanoseconds. + The format of data is as below. + +struct { + u32 version; /* version = 1 */ + u32 clockid; + u64 wall_clock_ns; + u64 clockid_time_ns; +}; + + HEADER_HYBRID_TOPOLOGY = 30, + +Indicate the hybrid CPUs. The format of data is as below. + +struct { + u32 nr; + struct { + char pmu_name[]; + char cpus[]; + } [nr]; /* Variable length records */ +}; + +Example: + hybrid cpu system: + cpu_core cpu list : 0-15 + cpu_atom cpu list : 16-23 + + HEADER_PMU_CAPS = 31, + + List of pmu capabilities (except cpu pmu which is already + covered by HEADER_CPU_PMU_CAPS). Note that hybrid cpu pmu + capabilities are also stored here. + +struct { + u32 nr_pmu; + struct { + u32 nr_caps; + { + char name[]; + char value[]; + } [nr_caps]; + char pmu_name[]; + } [nr_pmu]; +}; + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, @@ -554,16 +602,40 @@ struct auxtrace_error_event { Describes a header feature. These are records used in pipe-mode that contain information that otherwise would be in perf.data file's header. - PERF_RECORD_COMPRESSED = 81, + PERF_RECORD_COMPRESSED = 81, /* deprecated */ + +The header is followed by compressed data frame that can be decompressed +into array of perf trace records. The size of the entire compressed event +record including the header is limited by the max value of header.size. 
+ +It is deprecated and new files should use PERF_RECORD_COMPRESSED2 to guarantee +8-byte alignment. struct compressed_event { struct perf_event_header header; char data[]; }; -The header is followed by compressed data frame that can be decompressed -into array of perf trace records. The size of the entire compressed event -record including the header is limited by the max value of header.size. + PERF_RECORD_FINISHED_INIT = 82, + +Marks the end of records for the system, pre-existing threads in system wide +sessions, etc. Those are the ones prefixed PERF_RECORD_USER_*. + +This is used, for instance, to 'perf inject' events after init and before +regular events, those emitted by the kernel, to support combining guest and +host records. + + PERF_RECORD_COMPRESSED2 = 83, + +8-byte aligned version of `PERF_RECORD_COMPRESSED`. `header.size` indicates the +total record size, including padding for 8-byte alignment, and `data_size` +specifies the actual size of the compressed data. + +struct perf_record_compressed2 { + struct perf_event_header header; + __u64 data_size; + char data[]; +}; Event types diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt index 3f37ded13f8c..cbcc2e4d557e 100644 --- a/tools/perf/Documentation/perf.txt +++ b/tools/perf/Documentation/perf.txt @@ -12,32 +12,62 @@ SYNOPSIS OPTIONS ------- ---debug:: - Setup debug variable (see list below) in value - range (0, 10). Use like: - --debug verbose # sets verbose = 1 - --debug verbose=2 # sets verbose = 2 - - List of debug variables allowed to set: - verbose - general debug messages - ordered-events - ordered events object debug messages - data-convert - data convert command debug messages - stderr - write debug output (option -v) to stderr - in browser mode - perf-event-open - Print perf_event_open() arguments and - return value - ---buildid-dir:: - Setup buildid cache directory. It has higher priority than - buildid.dir config file option. +-h:: +--help:: + Run perf help command. -v:: --version:: - Display perf version. + Display perf version. --h:: ---help:: - Run perf help command. +-vv:: + Print the compiled-in status of libraries. + +--exec-path:: + Display or set exec path. + +--html-path:: + Display html documentation path. + +-p:: +--paginate:: + Set up pager. + +--no-pager:: + Do not set pager. + +--buildid-dir:: + Setup buildid cache directory. It has higher priority + than buildid.dir config file option. + +--list-cmds:: + List the most commonly used perf commands. + +--list-opts:: + List available perf options. + +--debugfs-dir:: + Set debugfs directory or set environment variable PERF_DEBUGFS_DIR. + +--debug:: + Setup debug variable (see list below) in value + range (0, 10). Use like: + --debug verbose # sets verbose = 1 + --debug verbose=2 # sets verbose = 2 + + List of debug variables allowed to set: + verbose - general debug messages + ordered-events - ordered events object debug messages + data-convert - data convert command debug messages + stderr - write debug output (option -v) to stderr + in browser mode + perf-event-open - Print perf_event_open() arguments and + return value + kmaps - Print kernel and module maps (perf script + and perf report without browser) + +--debug-file:: + Write debug output to a specified file.
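+ +For example, the global options above are given before any subcommand (the +'record' run and the 'sleep 1' workload below are only illustrations): + + perf --debug verbose=2 record -- sleep 1 + perf --debug perf-event-open record -- sleep 1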
DESCRIPTION ----------- @@ -51,3 +81,16 @@ SEE ALSO linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-list[1] + +linkperf:perf-amd-ibs[1], linkperf:perf-annotate[1], +linkperf:perf-archive[1], linkperf:perf-arm-spe[1], +linkperf:perf-bench[1], linkperf:perf-buildid-cache[1], +linkperf:perf-buildid-list[1], linkperf:perf-c2c[1], +linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1], +linkperf:perf-evlist[1], linkperf:perf-ftrace[1], +linkperf:perf-help[1], linkperf:perf-inject[1], +linkperf:perf-intel-pt[1], linkperf:perf-iostat[1], linkperf:perf-kallsyms[1], +linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1], +linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1], +linkperf:perf-script[1], linkperf:perf-test[1], +linkperf:perf-trace[1], linkperf:perf-version[1] diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt index 825745a645c1..3fee9b2a88ea 100644 --- a/tools/perf/Documentation/tips.txt +++ b/tools/perf/Documentation/tips.txt @@ -2,6 +2,7 @@ For a higher level overview, try: perf report --sort comm,dso Sample related events with: perf record -e '{cycles,instructions}:S' Compare performance results with: perf diff [<old file> <new file>] Boolean options have negative forms, e.g.: perf report --no-children +To not accumulate CPU time of children symbols add --no-children Customize output of perf script with: perf script -F event,ip,sym Generate a script for your data: perf script -g <lang> Save output of perf stat using: perf stat record <target workload> @@ -12,32 +13,56 @@ List events using substring match: perf list <keyword> To see list of saved events and attributes: perf evlist -v Use --symfs <dir> if your symbol files are in non-standard locations To see callchains in a more compact form: perf report -g folded +To see call chains by final symbol taking CPU time (bottom up) use perf report -G Show individual samples with: perf script Limit to show entries above 5% only: perf report --percent-limit 5 Profiling branch (mis)predictions with: perf record -b / perf report -To show assembler sample contexts use perf record -b / perf script -F +brstackinsn --xed -Treat branches as callchains: perf report --branch-history -To count events in every 1000 msec: perf stat -I 1000 -Print event counts in CSV format with: perf stat -x, +To show assembler sample context control flow use perf record -b / perf report --samples 10 and then browse context +To adjust path to source files to local file system use perf report --prefix=... --prefix-strip=... +Treat branches as callchains: perf record -b ... ; perf report --branch-history +Show estimate cycles per function and IPC in annotate use perf record -b ... 
; perf report --total-cycles +To count events every 1000 msec: perf stat -I 1000 +Print event counts in machine-readable CSV format with: perf stat -x\; If you have debuginfo enabled, try: perf report -s sym,srcline For memory address profiling, try: perf mem record / perf mem report For tracepoint events, try: perf report -s trace_fields To record callchains for each sample: perf record -g +If call chains don't work try perf record --call-graph dwarf or --call-graph lbr To record every process run by a user: perf record -u <user> +To show inline functions in call traces add --inline to perf report +To not record events from perf itself add --exclude-perf Skip collecting build-id when recording: perf record -B To change sampling frequency to 100 Hz: perf record -F 100 +To show information about the system the samples were collected on use perf report --header +To only collect call graph on one event use perf record -e cpu/cpu-cycles,callgraph=1/,branches ; perf report --show-ref-call-graph +To set sampling period of individual events use perf record -e cpu/cpu-cycles,period=100001/,cpu/branches,period=10001/ ... +To group events which need to be collected together for accuracy use {}: perf record -e '{cycles,branches}' ... +To compute metrics for samples use perf record -e '{cycles,instructions}' ... ; perf script -F +metric See assembly instructions with percentage: perf annotate <symbol> If you prefer Intel style assembly, try: perf annotate -M intel +When collecting LBR backtraces use --stitch-lbr to handle more than 32 deep entries: perf record --call-graph lbr ; perf report --stitch-lbr For hierarchical output, try: perf report --hierarchy Order by the overhead of source file name and line number: perf report -s srcline System-wide collection from all CPUs: perf record -a Show current config key-value pairs: perf config --list +To collect Processor Trace with samples use perf record -e '{intel_pt//,cycles}' ; perf script --call-trace or --insn-trace --xed -F +ipc (remove --xed if no xed) +To trace calls using Processor Trace use perf record -e intel_pt// ... ; perf script --call-trace. Then use perf script --time A-B --insn-trace to look at region of interest. +To measure approximate function latency with Processor Trace use perf record -e intel_pt// ... ; perf script --call-ret-trace +To trace only a single function with Processor Trace use perf record --filter 'filter func @ program' -e intel_pt//u ./program ; perf script --insn-trace Show user configuration overrides: perf config --user --list To add Node.js USDT(User-Level Statically Defined Tracing): perf buildid-cache --add `which node` -To report cacheline events from previous recording: perf c2c report +To analyze cache line scalability issues use perf c2c record ... ; perf c2c report To browse sample contexts use perf report --sample 10 and select in context menu To separate samples by time use perf report --sort time,overhead,sym +To filter a subset of samples with report or script add --time X-Y or --cpu A,B,C or --socket-filter ... To set sample time separation other than 100ms with --sort time use --time-quantum Add -I to perf record to sample register values, which will be visible in perf report sample context. To show IPC for sampling periods use perf record -e '{cycles,instructions}:S' and then browse context To show context switches in perf report sample context add --switch-events to perf record. +To show time in nanoseconds in record/report add --ns +To compare hot regions in two workloads use perf record -b -o file ... 
; perf diff --stream file1 file2 +To compare scalability of two workload samples use perf diff -c ratio file1 file2 +For latency profiling, try: perf record/report --latency +For parallelism histogram, try: perf report --hierarchy --sort latency,parallelism,comm,symbol +To analyze particular parallelism levels, try: perf report --latency --parallelism=32-64 +To see how parallelism changes over time, try: perf report -F time,latency,parallelism --time-quantum=1s diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt new file mode 100644 index 000000000000..5c17fff694ee --- /dev/null +++ b/tools/perf/Documentation/topdown.txt @@ -0,0 +1,362 @@ +Using TopDown metrics +--------------------- + +TopDown metrics break apart performance bottlenecks. Starting at level +1 it is typical to get metrics on retiring, bad speculation, frontend +bound, and backend bound. Higher levels provide more detail into the +level 1 bottlenecks, such as at level 2: core bound, memory bound, +heavy operations, light operations, branch mispredicts, machine +clears, fetch latency and fetch bandwidth. For more details see [1][2][3]. + +perf stat --topdown implements this using available metrics that vary +per architecture. + +% perf stat -a --topdown -I1000 +# time % tma_retiring % tma_backend_bound % tma_frontend_bound % tma_bad_speculation + 1.001141351 11.5 34.9 46.9 6.7 + 2.006141972 13.4 28.1 50.4 8.1 + 3.010162040 12.9 28.1 51.1 8.0 + 4.014009311 12.5 28.6 51.8 7.2 + 5.017838554 11.8 33.0 48.0 7.2 + 5.704818971 14.0 27.5 51.3 7.3 +... + +New Topdown features in Intel Ice Lake +====================================== + +With Ice Lake CPUs the TopDown metrics are directly available as +fixed counters and do not require generic counters. This makes it possible +to always collect TopDown in addition to other events. + +Using TopDown through RDPMC in applications on Intel Ice Lake +============================================================= + +For more fine-grained measurements it can be useful to +access the new counters directly from user space. This is more complicated, +but drastically lowers overhead. + +On Ice Lake, there is a new fixed counter 3: SLOTS, which reports +"pipeline SLOTS" (cycles multiplied by core issue width) and a +metric register that reports slots ratios for the different bottleneck +categories. + +The metrics counter is CPU model specific and is not available on older +CPUs. + +Example code +============ + +Library functions providing the functionality described below +are also available in libjevents [4]. + +The application opens a group with fixed counter 3 (SLOTS) and any +metric event, and allows user programs to read the performance counters. + +Fixed counter 3 is mapped to a pseudo event event=0x00, umask=04, +so the perf_event_attr structure should be initialized with +{ .config = 0x0400, .type = PERF_TYPE_RAW } +The metric events are mapped to the pseudo event event=0x00, umask=0x8X. +For example, the perf_event_attr structure can be initialized with +{ .config = 0x8000, .type = PERF_TYPE_RAW } for the Retiring metric event. +Fixed counter 3 must be the leader of the group. 
+ +#include <linux/perf_event.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <unistd.h> + +/* Provide own perf_event_open stub because glibc doesn't */ +__attribute__((weak)) +int perf_event_open(struct perf_event_attr *attr, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); +} + +/* Open slots counter file descriptor for current task. */ +struct perf_event_attr slots = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .config = 0x400, + .exclude_kernel = 1, +}; + +int slots_fd = perf_event_open(&slots, 0, -1, -1, 0); +if (slots_fd < 0) + ... error ... + +/* Memory mapping the fd permits _rdpmc calls from userspace */ +void *slots_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, slots_fd, 0); +if (!slots_p) + ... error ... + +/* + * Open metrics event file descriptor for current task. + * Set slots event as the leader of the group. + */ +struct perf_event_attr metrics = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .config = 0x8000, + .exclude_kernel = 1, +}; + +int metrics_fd = perf_event_open(&metrics, 0, -1, slots_fd, 0); +if (metrics_fd < 0) + ... error ... + +/* Memory mapping the fd permits _rdpmc calls from userspace */ +void *metrics_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, metrics_fd, 0); +if (!metrics_p) + ... error ... + +Note: the file descriptors returned by the perf_event_open calls must be memory +mapped to permit calls to the _rdpmc instruction. Permission may also be granted +by writing to the /sys/devices/cpu/rdpmc sysfs node. + +The RDPMC instruction (or _rdpmc compiler intrinsic) can now be used +to read slots and the topdown metrics at different points of the program: + +#include <stdint.h> +#include <x86intrin.h> + +#define RDPMC_FIXED (1 << 30) /* return fixed counters */ +#define RDPMC_METRIC (1 << 29) /* return metric counters */ + +#define FIXED_COUNTER_SLOTS 3 +#define METRIC_COUNTER_TOPDOWN_L1_L2 0 + +static inline uint64_t read_slots(void) +{ + return _rdpmc(RDPMC_FIXED | FIXED_COUNTER_SLOTS); +} + +static inline uint64_t read_metrics(void) +{ + return _rdpmc(RDPMC_METRIC | METRIC_COUNTER_TOPDOWN_L1_L2); +} + +Then the program can be instrumented to read these metrics at different +points. + +It's not a good idea to do this with too short code regions, +as the parallelism and overlap in the CPU program execution will +cause too much measurement inaccuracy. For example, instrumenting +individual basic blocks is definitely too fine-grained. + +_rdpmc calls should not be mixed with reading the metrics and slots counters +through system calls, as the kernel will reset these counters after each system +call. + +Decoding metrics values +======================= + +The value reported by read_metrics() contains four 8-bit fields, +each representing a scaled ratio for one Level 1 bottleneck category. +All four fields add up to 0xff (= 100%). + +The binary ratios in the metric value can be converted to float ratios: + +#define GET_METRIC(m, i) (((m) >> (i*8)) & 0xff) + +/* L1 Topdown metric events */ +#define TOPDOWN_RETIRING(val) ((float)GET_METRIC(val, 0) / 0xff) +#define TOPDOWN_BAD_SPEC(val) ((float)GET_METRIC(val, 1) / 0xff) +#define TOPDOWN_FE_BOUND(val) ((float)GET_METRIC(val, 2) / 0xff) +#define TOPDOWN_BE_BOUND(val) ((float)GET_METRIC(val, 3) / 0xff) + +/* + * L2 Topdown metric events. + * Available on Sapphire Rapids and later platforms. 
+ */ +#define TOPDOWN_HEAVY_OPS(val) ((float)GET_METRIC(val, 4) / 0xff) +#define TOPDOWN_BR_MISPREDICT(val) ((float)GET_METRIC(val, 5) / 0xff) +#define TOPDOWN_FETCH_LAT(val) ((float)GET_METRIC(val, 6) / 0xff) +#define TOPDOWN_MEM_BOUND(val) ((float)GET_METRIC(val, 7) / 0xff) + +and then converted to percent for printing. + +The ratios in the metric accumulate for the time when the counter +is enabled. For measuring programs it is often useful to measure +specific sections. For this it is necessary to compute deltas of the metrics. + +This can be done by scaling the metrics with the slots counter +read at the same time. + +Then it's possible to take deltas of these slots counts +measured at different points, and determine the metrics +for that time period. + + slots_a = read_slots(); + metric_a = read_metrics(); + + ... larger code region ... + + slots_b = read_slots() + metric_b = read_metrics() + + # compute scaled metrics for measurement a + retiring_slots_a = GET_METRIC(metric_a, 0) * slots_a + bad_spec_slots_a = GET_METRIC(metric_a, 1) * slots_a + fe_bound_slots_a = GET_METRIC(metric_a, 2) * slots_a + be_bound_slots_a = GET_METRIC(metric_a, 3) * slots_a + + # compute delta scaled metrics between b and a + retiring_slots = GET_METRIC(metric_b, 0) * slots_b - retiring_slots_a + bad_spec_slots = GET_METRIC(metric_b, 1) * slots_b - bad_spec_slots_a + fe_bound_slots = GET_METRIC(metric_b, 2) * slots_b - fe_bound_slots_a + be_bound_slots = GET_METRIC(metric_b, 3) * slots_b - be_bound_slots_a + +Later the individual ratios of L1 metric events for the measurement period can +be recreated from these counts (a consolidated sketch of these steps appears +at the end of this document). + + slots_delta = slots_b - slots_a + retiring_ratio = (float)retiring_slots / slots_delta + bad_spec_ratio = (float)bad_spec_slots / slots_delta + fe_bound_ratio = (float)fe_bound_slots / slots_delta + be_bound_ratio = (float)be_bound_slots / slots_delta + + printf("Retiring %.2f%% Bad Speculation %.2f%% FE Bound %.2f%% BE Bound %.2f%%\n", + retiring_ratio * 100., + bad_spec_ratio * 100., + fe_bound_ratio * 100., + be_bound_ratio * 100.); + +The individual ratios of L2 metric events for the measurement period can be +recreated from L1 and L2 metric counters. 
(Available on Sapphire Rapids and +later platforms) + + # compute scaled metrics for measurement a + heavy_ops_slots_a = GET_METRIC(metric_a, 4) * slots_a + br_mispredict_slots_a = GET_METRIC(metric_a, 5) * slots_a + fetch_lat_slots_a = GET_METRIC(metric_a, 6) * slots_a + mem_bound_slots_a = GET_METRIC(metric_a, 7) * slots_a + + # compute delta scaled metrics between b and a + heavy_ops_slots = GET_METRIC(metric_b, 4) * slots_b - heavy_ops_slots_a + br_mispredict_slots = GET_METRIC(metric_b, 5) * slots_b - br_mispredict_slots_a + fetch_lat_slots = GET_METRIC(metric_b, 6) * slots_b - fetch_lat_slots_a + mem_bound_slots = GET_METRIC(metric_b, 7) * slots_b - mem_bound_slots_a + + slots_delta = slots_b - slots_a + heavy_ops_ratio = (float)heavy_ops_slots / slots_delta + light_ops_ratio = retiring_ratio - heavy_ops_ratio; + + br_mispredict_ratio = (float)br_mispredict_slots / slots_delta + machine_clears_ratio = bad_spec_ratio - br_mispredict_ratio; + + fetch_lat_ratio = (float)fetch_lat_slots / slots_delta + fetch_bw_ratio = fe_bound_ratio - fetch_lat_ratio; + + mem_bound_ratio = (float)mem_bound_slots / slots_delta + core_bound_ratio = be_bound_ratio - mem_bound_ratio; + + printf("Heavy Operations %.2f%% Light Operations %.2f%% " + "Branch Mispredict %.2f%% Machine Clears %.2f%% " + "Fetch Latency %.2f%% Fetch Bandwidth %.2f%% " + "Mem Bound %.2f%% Core Bound %.2f%%\n", + heavy_ops_ratio * 100., + light_ops_ratio * 100., + br_mispredict_ratio * 100., + machine_clears_ratio * 100., + fetch_lat_ratio * 100., + fetch_bw_ratio * 100., + mem_bound_ratio * 100., + core_bound_ratio * 100.); + +Resetting metrics counters +========================== + +Since the individual metrics are only 8 bits wide, they lose precision for +short regions over time because the number of cycles covered by each +fraction bit shrinks. So the counters need to be reset regularly. + +When using the kernel perf API the kernel resets on every read. +So as long as the reading is at reasonable intervals (every few +seconds) the precision is good. + +When using perf stat it is recommended to always use the -I option, +with an interval no longer than a few seconds: + + perf stat -I 1000 --topdown ... + +For user programs using RDPMC directly the counter can +be reset explicitly using ioctl: + + ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); + +This "opens" a new measurement period. + +A program using RDPMC for TopDown should schedule such a reset +regularly, as in every few seconds. + +Limits on Intel Ice Lake +======================== + +Four pseudo TopDown metric events are exposed for the end-users, +topdown-retiring, topdown-bad-spec, topdown-fe-bound and topdown-be-bound. +They can be used to collect the TopDown value under the following +rules: +- All the TopDown metric events must be in a group with the SLOTS event. +- The SLOTS event must be the leader of the group. +- The PERF_FORMAT_GROUP flag must be applied for each TopDown metric + event. + +The SLOTS event and the TopDown metric events can be counting members of +a sampling read group. Since the SLOTS event must be the leader of a TopDown +group, the second event of the group is the sampling event. +For example, perf record -e '{slots, $sampling_event, topdown-retiring}:S' + +Extension on Intel Sapphire Rapids Server +========================================= +The metrics counter is extended to support TMA method level 2 metrics. +The lower half of the register is the TMA level 1 metrics (legacy). +The upper half is also divided into four 8-bit fields for the new level 2 +metrics. 
Four more TopDown metric events are exposed for the end-users, +topdown-heavy-ops, topdown-br-mispredict, topdown-fetch-lat and +topdown-mem-bound. + +Each of the new level 2 metrics in the upper half is a subset of the +corresponding level 1 metric in the lower half. Software can deduce the +other four level 2 metrics by subtracting corresponding metrics as below. + + Light_Operations = Retiring - Heavy_Operations + Machine_Clears = Bad_Speculation - Branch_Mispredicts + Fetch_Bandwidth = Frontend_Bound - Fetch_Latency + Core_Bound = Backend_Bound - Memory_Bound + +TPEBS in TopDown +================ + +TPEBS (Timed PEBS) is one of the new Intel PMU features provided since the Granite +Rapids microarchitecture. The TPEBS feature adds a 16-bit retire_latency field +in the Basic Info group of the PEBS record. It records the Core cycles from the +retirement of the previous instruction to the retirement of the current instruction. +Please refer to Section 8.4.1 of "Intel® Architecture Instruction Set Extensions +Programming Reference" for more details about this feature. Because this feature +extends the PEBS record, sampling with the weight option is required to get the +retire_latency value. + + perf record -e event_name -W ... + +In the most recent release of TMA, the metrics begin to use event retire_latency +values in some of the metrics’ formulas on processors that support the TPEBS feature. +For previous generations that do not support TPEBS, the values are static and +predefined per processor family by the hardware architects. Due to the diversity +of workloads in execution environments, retire_latency values measured in real +time are more accurate. Therefore, new TMA metrics that use TPEBS will provide +more accurate performance analysis results. + +To support TPEBS in TMA metrics, a new event modifier, :R, is added. Perf +captures the retire_latency values of the required events (events with :R in the +metric formula) with perf record. The retire_latency values are then used in the +metric calculation. Currently, this feature is supported through perf stat: + + perf stat -M metric_name --record-tpebs ... + + + +[1] https://software.intel.com/en-us/top-down-microarchitecture-analysis-method-win +[2] https://sites.google.com/site/analysismethods/yasin-pubs +[3] https://perf.wiki.kernel.org/index.php/Top-Down_Analysis +[4] https://github.com/andikleen/pmu-tools/tree/master/jevents
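+ +Putting the RDPMC pieces together, the sketch below combines the reset, read +and delta steps from the sections above into one function. It is only a +sketch: it assumes the slots_fd/metrics_fd setup, the read_slots()/read_metrics() +helpers and the GET_METRIC() macro defined earlier, and the measured region is +an arbitrary placeholder. + +#include <stdio.h> +#include <stdint.h> +#include <sys/ioctl.h> +#include <linux/perf_event.h> + +/* Print the level 1 TopDown ratios for the region between the two reads. */ +static void measure_region(int slots_fd, int metrics_fd) +{ +	static const char * const name[4] = { +		"Retiring", "Bad Speculation", "FE Bound", "BE Bound" +	}; +	uint64_t slots_a, slots_b, metric_a, metric_b; +	double slots_delta, ratio; +	int i; + +	/* Open a fresh measurement period so the 8-bit fields keep precision. */ +	ioctl(slots_fd, PERF_EVENT_IOC_RESET, 0); +	ioctl(metrics_fd, PERF_EVENT_IOC_RESET, 0); + +	slots_a = read_slots(); +	metric_a = read_metrics(); + +	/* ... code region of interest ... */ + +	slots_b = read_slots(); +	metric_b = read_metrics(); + +	slots_delta = (double)(slots_b - slots_a); + +	for (i = 0; i < 4; i++) { +		/* +		 * Delta of the scaled slots per category; the 8-bit fields +		 * are fractions of 0xff, so divide by 0xff for a 0..1 ratio. +		 */ +		ratio = (GET_METRIC(metric_b, i) * (double)slots_b - +			 GET_METRIC(metric_a, i) * (double)slots_a) / +			slots_delta / 0xff; +		printf("%s %.2f%% ", name[i], ratio * 100.); +	} +	printf("\n"); +}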