diff options
Diffstat (limited to 'tools/perf/scripts/python')
-rw-r--r-- | tools/perf/scripts/python/Perf-Trace-Util/Build | 3 | ||||
-rw-r--r-- | tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py | 7 | ||||
-rwxr-xr-x | tools/perf/scripts/python/arm-cs-trace-disasm.py | 30 | ||||
-rw-r--r-- | tools/perf/scripts/python/bin/gecko-record | 2 | ||||
-rwxr-xr-x | tools/perf/scripts/python/bin/gecko-report | 7 | ||||
-rw-r--r-- | tools/perf/scripts/python/compaction-times.py | 2 | ||||
-rwxr-xr-x | tools/perf/scripts/python/exported-sql-viewer.py | 4 | ||||
-rw-r--r-- | tools/perf/scripts/python/gecko.py | 395 | ||||
-rwxr-xr-x | tools/perf/scripts/python/parallel-perf.py | 988 |
9 files changed, 1417 insertions, 21 deletions
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Build b/tools/perf/scripts/python/Perf-Trace-Util/Build index 7d0e33ce6aba..5b0b5ff7e14a 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Build +++ b/tools/perf/scripts/python/Perf-Trace-Util/Build @@ -1,3 +1,4 @@ perf-y += Context.o -CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs +# -Wno-declaration-after-statement: The python headers have mixed code with declarations (decls after asserts, for instance) +CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-declaration-after-statement diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py index 7384dcb628c4..b75d31858e54 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py @@ -54,6 +54,7 @@ try: import audit machine_to_id = { 'x86_64': audit.MACH_86_64, + 'aarch64': audit.MACH_AARCH64, 'alpha' : audit.MACH_ALPHA, 'ia64' : audit.MACH_IA64, 'ppc' : audit.MACH_PPC, @@ -73,9 +74,9 @@ try: except: if not audit_package_warned: audit_package_warned = True - print("Install the audit-libs-python package to get syscall names.\n" - "For example:\n # apt-get install python-audit (Ubuntu)" - "\n # yum install audit-libs-python (Fedora)" + print("Install the python-audit package to get syscall names.\n" + "For example:\n # apt-get install python3-audit (Ubuntu)" + "\n # yum install python3-audit (Fedora)" "\n etc.\n") def syscall_name(id): diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index d59ff53f1d94..d973c2baed1c 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -45,8 +45,8 @@ parser = OptionParser(option_list=option_list) # Initialize global dicts and regular expression disasm_cache = dict() cpu_data = dict() -disasm_re = re.compile("^\s*([0-9a-fA-F]+):") -disasm_func_re = re.compile("^\s*([0-9a-fA-F]+)\s.*:") +disasm_re = re.compile(r"^\s*([0-9a-fA-F]+):") +disasm_func_re = re.compile(r"^\s*([0-9a-fA-F]+)\s.*:") cache_size = 64*1024 glb_source_file_name = None @@ -188,6 +188,17 @@ def process_event(param_dict): dso_end = get_optional(param_dict, "dso_map_end") symbol = get_optional(param_dict, "symbol") + cpu = sample["cpu"] + ip = sample["ip"] + addr = sample["addr"] + + # Initialize CPU data if it's empty, and directly return back + # if this is the first tracing event for this CPU. + if (cpu_data.get(str(cpu) + 'addr') == None): + cpu_data[str(cpu) + 'addr'] = addr + return + + if (options.verbose == True): print("Event type: %s" % name) print_sample(sample) @@ -209,16 +220,6 @@ def process_event(param_dict): if (name[0:8] != "branches"): return - cpu = sample["cpu"] - ip = sample["ip"] - addr = sample["addr"] - - # Initialize CPU data if it's empty, and directly return back - # if this is the first tracing event for this CPU. - if (cpu_data.get(str(cpu) + 'addr') == None): - cpu_data[str(cpu) + 'addr'] = addr - return - # The format for packet is: # # +------------+------------+------------+ @@ -258,8 +259,9 @@ def process_event(param_dict): if (options.objdump_name != None): # It doesn't need to decrease virtual memory offset for disassembly - # for kernel dso, so in this case we set vm_start to zero. - if (dso == "[kernel.kallsyms]"): + # for kernel dso and executable file dso, so in this case we set + # vm_start to zero. + if (dso == "[kernel.kallsyms]" or dso_start == 0x400000): dso_vm_start = 0 else: dso_vm_start = int(dso_start) diff --git a/tools/perf/scripts/python/bin/gecko-record b/tools/perf/scripts/python/bin/gecko-record new file mode 100644 index 000000000000..f0d1aa55f171 --- /dev/null +++ b/tools/perf/scripts/python/bin/gecko-record @@ -0,0 +1,2 @@ +#!/bin/bash +perf record -F 99 -g "$@" diff --git a/tools/perf/scripts/python/bin/gecko-report b/tools/perf/scripts/python/bin/gecko-report new file mode 100755 index 000000000000..1867ec8d9757 --- /dev/null +++ b/tools/perf/scripts/python/bin/gecko-report @@ -0,0 +1,7 @@ +#!/bin/bash +# description: create firefox gecko profile json format from perf.data +if [ "$*" = "-i -" ]; then +perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py +else +perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py -- "$@" +fi diff --git a/tools/perf/scripts/python/compaction-times.py b/tools/perf/scripts/python/compaction-times.py index 2560a042dc6f..9401f7c14747 100644 --- a/tools/perf/scripts/python/compaction-times.py +++ b/tools/perf/scripts/python/compaction-times.py @@ -260,7 +260,7 @@ def pr_help(): comm_re = None pid_re = None -pid_regex = "^(\d*)-(\d*)$|^(\d*)$" +pid_regex = r"^(\d*)-(\d*)$|^(\d*)$" opt_proc = popt.DISP_DFL opt_disp = topt.DISP_ALL diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 13f2d8a81610..121cf61ba1b3 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -677,8 +677,8 @@ class CallGraphModelBase(TreeModel): # sqlite supports GLOB (text only) which uses * and ? and is case sensitive if not self.glb.dbref.is_sqlite3: # Escape % and _ - s = value.replace("%", "\%") - s = s.replace("_", "\_") + s = value.replace("%", "\\%") + s = s.replace("_", "\\_") # Translate * and ? into SQL LIKE pattern characters % and _ trans = string.maketrans("*?", "%_") match = " LIKE '" + str(s).translate(trans) + "'" diff --git a/tools/perf/scripts/python/gecko.py b/tools/perf/scripts/python/gecko.py new file mode 100644 index 000000000000..bc5a72f94bfa --- /dev/null +++ b/tools/perf/scripts/python/gecko.py @@ -0,0 +1,395 @@ +# gecko.py - Convert perf record output to Firefox's gecko profile format +# SPDX-License-Identifier: GPL-2.0 +# +# The script converts perf.data to Gecko Profile Format, +# which can be read by https://profiler.firefox.com/. +# +# Usage: +# +# perf record -a -g -F 99 sleep 60 +# perf script report gecko +# +# Combined: +# +# perf script gecko -F 99 -a sleep 60 + +import os +import sys +import time +import json +import string +import random +import argparse +import threading +import webbrowser +import urllib.parse +from os import system +from functools import reduce +from dataclasses import dataclass, field +from http.server import HTTPServer, SimpleHTTPRequestHandler, test +from typing import List, Dict, Optional, NamedTuple, Set, Tuple, Any + +# Add the Perf-Trace-Util library to the Python path +sys.path.append(os.environ['PERF_EXEC_PATH'] + \ + '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + +from perf_trace_context import * +from Core import * + +StringID = int +StackID = int +FrameID = int +CategoryID = int +Milliseconds = float + +# start_time is intialiazed only once for the all event traces. +start_time = None + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/profile.js#L425 +# Follow Brendan Gregg's Flamegraph convention: orange for kernel and yellow for user space by default. +CATEGORIES = None + +# The product name is used by the profiler UI to show the Operating system and Processor. +PRODUCT = os.popen('uname -op').read().strip() + +# store the output file +output_file = None + +# Here key = tid, value = Thread +tid_to_thread = dict() + +# The HTTP server is used to serve the profile to the profiler UI. +http_server_thread = None + +# The category index is used by the profiler UI to show the color of the flame graph. +USER_CATEGORY_INDEX = 0 +KERNEL_CATEGORY_INDEX = 1 + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156 +class Frame(NamedTuple): + string_id: StringID + relevantForJS: bool + innerWindowID: int + implementation: None + optimizations: None + line: None + column: None + category: CategoryID + subcategory: int + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216 +class Stack(NamedTuple): + prefix_id: Optional[StackID] + frame_id: FrameID + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90 +class Sample(NamedTuple): + stack_id: Optional[StackID] + time_ms: Milliseconds + responsiveness: int + +@dataclass +class Thread: + """A builder for a profile of the thread. + + Attributes: + comm: Thread command-line (name). + pid: process ID of containing process. + tid: thread ID. + samples: Timeline of profile samples. + frameTable: interned stack frame ID -> stack frame. + stringTable: interned string ID -> string. + stringMap: interned string -> string ID. + stackTable: interned stack ID -> stack. + stackMap: (stack prefix ID, leaf stack frame ID) -> interned Stack ID. + frameMap: Stack Frame string -> interned Frame ID. + comm: str + pid: int + tid: int + samples: List[Sample] = field(default_factory=list) + frameTable: List[Frame] = field(default_factory=list) + stringTable: List[str] = field(default_factory=list) + stringMap: Dict[str, int] = field(default_factory=dict) + stackTable: List[Stack] = field(default_factory=list) + stackMap: Dict[Tuple[Optional[int], int], int] = field(default_factory=dict) + frameMap: Dict[str, int] = field(default_factory=dict) + """ + comm: str + pid: int + tid: int + samples: List[Sample] = field(default_factory=list) + frameTable: List[Frame] = field(default_factory=list) + stringTable: List[str] = field(default_factory=list) + stringMap: Dict[str, int] = field(default_factory=dict) + stackTable: List[Stack] = field(default_factory=list) + stackMap: Dict[Tuple[Optional[int], int], int] = field(default_factory=dict) + frameMap: Dict[str, int] = field(default_factory=dict) + + def _intern_stack(self, frame_id: int, prefix_id: Optional[int]) -> int: + """Gets a matching stack, or saves the new stack. Returns a Stack ID.""" + key = f"{frame_id}" if prefix_id is None else f"{frame_id},{prefix_id}" + # key = (prefix_id, frame_id) + stack_id = self.stackMap.get(key) + if stack_id is None: + # return stack_id + stack_id = len(self.stackTable) + self.stackTable.append(Stack(prefix_id=prefix_id, frame_id=frame_id)) + self.stackMap[key] = stack_id + return stack_id + + def _intern_string(self, string: str) -> int: + """Gets a matching string, or saves the new string. Returns a String ID.""" + string_id = self.stringMap.get(string) + if string_id is not None: + return string_id + string_id = len(self.stringTable) + self.stringTable.append(string) + self.stringMap[string] = string_id + return string_id + + def _intern_frame(self, frame_str: str) -> int: + """Gets a matching stack frame, or saves the new frame. Returns a Frame ID.""" + frame_id = self.frameMap.get(frame_str) + if frame_id is not None: + return frame_id + frame_id = len(self.frameTable) + self.frameMap[frame_str] = frame_id + string_id = self._intern_string(frame_str) + + symbol_name_to_category = KERNEL_CATEGORY_INDEX if frame_str.find('kallsyms') != -1 \ + or frame_str.find('/vmlinux') != -1 \ + or frame_str.endswith('.ko)') \ + else USER_CATEGORY_INDEX + + self.frameTable.append(Frame( + string_id=string_id, + relevantForJS=False, + innerWindowID=0, + implementation=None, + optimizations=None, + line=None, + column=None, + category=symbol_name_to_category, + subcategory=None, + )) + return frame_id + + def _add_sample(self, comm: str, stack: List[str], time_ms: Milliseconds) -> None: + """Add a timestamped stack trace sample to the thread builder. + Args: + comm: command-line (name) of the thread at this sample + stack: sampled stack frames. Root first, leaf last. + time_ms: timestamp of sample in milliseconds. + """ + # Ihreads may not set their names right after they are created. + # Instead, they might do it later. In such situations, to use the latest name they have set. + if self.comm != comm: + self.comm = comm + + prefix_stack_id = reduce(lambda prefix_id, frame: self._intern_stack + (self._intern_frame(frame), prefix_id), stack, None) + if prefix_stack_id is not None: + self.samples.append(Sample(stack_id=prefix_stack_id, + time_ms=time_ms, + responsiveness=0)) + + def _to_json_dict(self) -> Dict: + """Converts current Thread to GeckoThread JSON format.""" + # Gecko profile format is row-oriented data as List[List], + # And a schema for interpreting each index. + # Schema: + # https://github.com/firefox-devtools/profiler/blob/main/docs-developer/gecko-profile-format.md + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L230 + return { + "tid": self.tid, + "pid": self.pid, + "name": self.comm, + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L51 + "markers": { + "schema": { + "name": 0, + "startTime": 1, + "endTime": 2, + "phase": 3, + "category": 4, + "data": 5, + }, + "data": [], + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90 + "samples": { + "schema": { + "stack": 0, + "time": 1, + "responsiveness": 2, + }, + "data": self.samples + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156 + "frameTable": { + "schema": { + "location": 0, + "relevantForJS": 1, + "innerWindowID": 2, + "implementation": 3, + "optimizations": 4, + "line": 5, + "column": 6, + "category": 7, + "subcategory": 8, + }, + "data": self.frameTable, + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216 + "stackTable": { + "schema": { + "prefix": 0, + "frame": 1, + }, + "data": self.stackTable, + }, + "stringTable": self.stringTable, + "registerTime": 0, + "unregisterTime": None, + "processType": "default", + } + +# Uses perf script python interface to parse each +# event and store the data in the thread builder. +def process_event(param_dict: Dict) -> None: + global start_time + global tid_to_thread + time_stamp = (param_dict['sample']['time'] // 1000) / 1000 + pid = param_dict['sample']['pid'] + tid = param_dict['sample']['tid'] + comm = param_dict['comm'] + + # Start time is the time of the first sample + if not start_time: + start_time = time_stamp + + # Parse and append the callchain of the current sample into a stack. + stack = [] + if param_dict['callchain']: + for call in param_dict['callchain']: + if 'sym' not in call: + continue + stack.append(f'{call["sym"]["name"]} (in {call["dso"]})') + if len(stack) != 0: + # Reverse the stack, as root come first and the leaf at the end. + stack = stack[::-1] + + # During perf record if -g is not used, the callchain is not available. + # In that case, the symbol and dso are available in the event parameters. + else: + func = param_dict['symbol'] if 'symbol' in param_dict else '[unknown]' + dso = param_dict['dso'] if 'dso' in param_dict else '[unknown]' + stack.append(f'{func} (in {dso})') + + # Add sample to the specific thread. + thread = tid_to_thread.get(tid) + if thread is None: + thread = Thread(comm=comm, pid=pid, tid=tid) + tid_to_thread[tid] = thread + thread._add_sample(comm=comm, stack=stack, time_ms=time_stamp) + +def trace_begin() -> None: + global output_file + if (output_file is None): + print("Staring Firefox Profiler on your default browser...") + global http_server_thread + http_server_thread = threading.Thread(target=test, args=(CORSRequestHandler, HTTPServer,)) + http_server_thread.daemon = True + http_server_thread.start() + +# Trace_end runs at the end and will be used to aggregate +# the data into the final json object and print it out to stdout. +def trace_end() -> None: + global output_file + threads = [thread._to_json_dict() for thread in tid_to_thread.values()] + + # Schema: https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L305 + gecko_profile_with_meta = { + "meta": { + "interval": 1, + "processType": 0, + "product": PRODUCT, + "stackwalk": 1, + "debug": 0, + "gcpoison": 0, + "asyncstack": 1, + "startTime": start_time, + "shutdownTime": None, + "version": 24, + "presymbolicated": True, + "categories": CATEGORIES, + "markerSchema": [], + }, + "libs": [], + "threads": threads, + "processes": [], + "pausedRanges": [], + } + # launch the profiler on local host if not specified --save-only args, otherwise print to file + if (output_file is None): + output_file = 'gecko_profile.json' + with open(output_file, 'w') as f: + json.dump(gecko_profile_with_meta, f, indent=2) + launchFirefox(output_file) + time.sleep(1) + print(f'[ perf gecko: Captured and wrote into {output_file} ]') + else: + print(f'[ perf gecko: Captured and wrote into {output_file} ]') + with open(output_file, 'w') as f: + json.dump(gecko_profile_with_meta, f, indent=2) + +# Used to enable Cross-Origin Resource Sharing (CORS) for requests coming from 'https://profiler.firefox.com', allowing it to access resources from this server. +class CORSRequestHandler(SimpleHTTPRequestHandler): + def end_headers (self): + self.send_header('Access-Control-Allow-Origin', 'https://profiler.firefox.com') + SimpleHTTPRequestHandler.end_headers(self) + +# start a local server to serve the gecko_profile.json file to the profiler.firefox.com +def launchFirefox(file): + safe_string = urllib.parse.quote_plus(f'http://localhost:8000/{file}') + url = 'https://profiler.firefox.com/from-url/' + safe_string + webbrowser.open(f'{url}') + +def main() -> None: + global output_file + global CATEGORIES + parser = argparse.ArgumentParser(description="Convert perf.data to Firefox\'s Gecko Profile format which can be uploaded to profiler.firefox.com for visualization") + + # Add the command-line options + # Colors must be defined according to this: + # https://github.com/firefox-devtools/profiler/blob/50124adbfa488adba6e2674a8f2618cf34b59cd2/res/css/categories.css + parser.add_argument('--user-color', default='yellow', help='Color for the User category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta']) + parser.add_argument('--kernel-color', default='orange', help='Color for the Kernel category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta']) + # If --save-only is specified, the output will be saved to a file instead of opening Firefox's profiler directly. + parser.add_argument('--save-only', help='Save the output to a file instead of opening Firefox\'s profiler') + + # Parse the command-line arguments + args = parser.parse_args() + # Access the values provided by the user + user_color = args.user_color + kernel_color = args.kernel_color + output_file = args.save_only + + CATEGORIES = [ + { + "name": 'User', + "color": user_color, + "subcategories": ['Other'] + }, + { + "name": 'Kernel', + "color": kernel_color, + "subcategories": ['Other'] + }, + ] + +if __name__ == '__main__': + main() diff --git a/tools/perf/scripts/python/parallel-perf.py b/tools/perf/scripts/python/parallel-perf.py new file mode 100755 index 000000000000..21f32ec5ed46 --- /dev/null +++ b/tools/perf/scripts/python/parallel-perf.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Run a perf script command multiple times in parallel, using perf script +# options --cpu and --time so that each job processes a different chunk +# of the data. +# +# Copyright (c) 2024, Intel Corporation. + +import subprocess +import argparse +import pathlib +import shlex +import time +import copy +import sys +import os +import re + +glb_prog_name = "parallel-perf.py" +glb_min_interval = 10.0 +glb_min_samples = 64 + +class Verbosity(): + + def __init__(self, quiet=False, verbose=False, debug=False): + self.normal = True + self.verbose = verbose + self.debug = debug + self.self_test = True + if self.debug: + self.verbose = True + if self.verbose: + quiet = False + if quiet: + self.normal = False + +# Manage work (Start/Wait/Kill), as represented by a subprocess.Popen command +class Work(): + + def __init__(self, cmd, pipe_to, output_dir="."): + self.popen = None + self.consumer = None + self.cmd = cmd + self.pipe_to = pipe_to + self.output_dir = output_dir + self.cmdout_name = f"{output_dir}/cmd.txt" + self.stdout_name = f"{output_dir}/out.txt" + self.stderr_name = f"{output_dir}/err.txt" + + def Command(self): + sh_cmd = [ shlex.quote(x) for x in self.cmd ] + return " ".join(self.cmd) + + def Stdout(self): + return open(self.stdout_name, "w") + + def Stderr(self): + return open(self.stderr_name, "w") + + def CreateOutputDir(self): + pathlib.Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + def Start(self): + if self.popen: + return + self.CreateOutputDir() + with open(self.cmdout_name, "w") as f: + f.write(self.Command()) + f.write("\n") + stdout = self.Stdout() + stderr = self.Stderr() + if self.pipe_to: + self.popen = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=stderr) + args = shlex.split(self.pipe_to) + self.consumer = subprocess.Popen(args, stdin=self.popen.stdout, stdout=stdout, stderr=stderr) + else: + self.popen = subprocess.Popen(self.cmd, stdout=stdout, stderr=stderr) + + def RemoveEmptyErrFile(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) == 0: + os.unlink(self.stderr_name) + + def Errors(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) != 0: + return [ f"Non-empty error file {self.stderr_name}" ] + return [] + + def TidyUp(self): + self.RemoveEmptyErrFile() + + def RawPollWait(self, p, wait): + if wait: + return p.wait() + return p.poll() + + def Poll(self, wait=False): + if not self.popen: + return None + result = self.RawPollWait(self.popen, wait) + if self.consumer: + res = result + result = self.RawPollWait(self.consumer, wait) + if result != None and res == None: + self.popen.kill() + result = None + elif result == 0 and res != None and res != 0: + result = res + if result != None: + self.TidyUp() + return result + + def Wait(self): + return self.Poll(wait=True) + + def Kill(self): + if not self.popen: + return + self.popen.kill() + if self.consumer: + self.consumer.kill() + +def KillWork(worklist, verbosity): + for w in worklist: + w.Kill() + for w in worklist: + w.Wait() + +def NumberOfCPUs(): + return os.sysconf("SC_NPROCESSORS_ONLN") + +def NanoSecsToSecsStr(x): + if x == None: + return "" + x = str(x) + if len(x) < 10: + x = "0" * (10 - len(x)) + x + return x[:len(x) - 9] + "." + x[-9:] + +def InsertOptionAfter(cmd, option, after): + try: + pos = cmd.index(after) + cmd.insert(pos + 1, option) + except: + cmd.append(option) + +def CreateWorkList(cmd, pipe_to, output_dir, cpus, time_ranges_by_cpu): + max_len = len(str(cpus[-1])) + cpu_dir_fmt = f"cpu-%.{max_len}u" + worklist = [] + pos = 0 + for cpu in cpus: + if cpu >= 0: + cpu_dir = os.path.join(output_dir, cpu_dir_fmt % cpu) + cpu_option = f"--cpu={cpu}" + else: + cpu_dir = output_dir + cpu_option = None + + tr_dir_fmt = "time-range" + + if len(time_ranges_by_cpu) > 1: + time_ranges = time_ranges_by_cpu[pos] + tr_dir_fmt += f"-{pos}" + pos += 1 + else: + time_ranges = time_ranges_by_cpu[0] + + max_len = len(str(len(time_ranges))) + tr_dir_fmt += f"-%.{max_len}u" + + i = 0 + for r in time_ranges: + if r == [None, None]: + time_option = None + work_output_dir = cpu_dir + else: + time_option = "--time=" + NanoSecsToSecsStr(r[0]) + "," + NanoSecsToSecsStr(r[1]) + work_output_dir = os.path.join(cpu_dir, tr_dir_fmt % i) + i += 1 + work_cmd = list(cmd) + if time_option != None: + InsertOptionAfter(work_cmd, time_option, "script") + if cpu_option != None: + InsertOptionAfter(work_cmd, cpu_option, "script") + w = Work(work_cmd, pipe_to, work_output_dir) + worklist.append(w) + return worklist + +def DoRunWork(worklist, nr_jobs, verbosity): + nr_to_do = len(worklist) + not_started = list(worklist) + running = [] + done = [] + chg = False + while True: + nr_done = len(done) + if chg and verbosity.normal: + nr_run = len(running) + print(f"\rThere are {nr_to_do} jobs: {nr_done} completed, {nr_run} running", flush=True, end=" ") + if verbosity.verbose: + print() + chg = False + if nr_done == nr_to_do: + break + while len(running) < nr_jobs and len(not_started): + w = not_started.pop(0) + running.append(w) + if verbosity.verbose: + print("Starting:", w.Command()) + w.Start() + chg = True + if len(running): + time.sleep(0.1) + finished = [] + not_finished = [] + while len(running): + w = running.pop(0) + r = w.Poll() + if r == None: + not_finished.append(w) + continue + if r == 0: + if verbosity.verbose: + print("Finished:", w.Command()) + finished.append(w) + chg = True + continue + if verbosity.normal and not verbosity.verbose: + print() + print("Job failed!\n return code:", r, "\n command: ", w.Command()) + if w.pipe_to: + print(" piped to: ", w.pipe_to) + print("Killing outstanding jobs") + KillWork(not_finished, verbosity) + KillWork(running, verbosity) + return False + running = not_finished + done += finished + errorlist = [] + for w in worklist: + errorlist += w.Errors() + if len(errorlist): + print("Errors:") + for e in errorlist: + print(e) + elif verbosity.normal: + print("\r"," "*50, "\rAll jobs finished successfully", flush=True) + return True + +def RunWork(worklist, nr_jobs=NumberOfCPUs(), verbosity=Verbosity()): + try: + return DoRunWork(worklist, nr_jobs, verbosity) + except: + for w in worklist: + w.Kill() + raise + return True + +def ReadHeader(perf, file_name): + return subprocess.Popen([perf, "script", "--header-only", "--input", file_name], stdout=subprocess.PIPE).stdout.read().decode("utf-8") + +def ParseHeader(hdr): + result = {} + lines = hdr.split("\n") + for line in lines: + if ":" in line and line[0] == "#": + pos = line.index(":") + name = line[1:pos-1].strip() + value = line[pos+1:].strip() + if name in result: + orig_name = name + nr = 2 + while True: + name = f"{orig_name} {nr}" + if name not in result: + break + nr += 1 + result[name] = value + return result + +def HeaderField(hdr_dict, hdr_fld): + if hdr_fld not in hdr_dict: + raise Exception(f"'{hdr_fld}' missing from header information") + return hdr_dict[hdr_fld] + +# Represent the position of an option within a command string +# and provide the option value and/or remove the option +class OptPos(): + + def Init(self, opt_element=-1, value_element=-1, opt_pos=-1, value_pos=-1, error=None): + self.opt_element = opt_element # list element that contains option + self.value_element = value_element # list element that contains option value + self.opt_pos = opt_pos # string position of option + self.value_pos = value_pos # string position of value + self.error = error # error message string + + def __init__(self, args, short_name, long_name, default=None): + self.args = list(args) + self.default = default + n = 2 + len(long_name) + m = len(short_name) + pos = -1 + for opt in args: + pos += 1 + if m and opt[:2] == f"-{short_name}": + if len(opt) == 2: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, 0, 2) + return + if opt[:n] == f"--{long_name}": + if len(opt) == n: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"--{long_name} option missing value") + elif opt[n] == "=": + self.Init(pos, pos, 0, n + 1) + else: + self.Init(error = f"--{long_name} option expected '='") + return + if m and opt[:1] == "-" and opt[:2] != "--" and short_name in opt: + ipos = opt.index(short_name) + if "-" in opt[1:]: + hpos = opt[1:].index("-") + if hpos < ipos: + continue + if ipos + 1 == len(opt): + if pos + 1 < len(args): + self.Init(pos, pos + 1, ipos, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, ipos, ipos + 1) + return + self.Init() + + def Value(self): + if self.opt_element >= 0: + if self.opt_element != self.value_element: + return self.args[self.value_element] + else: + return self.args[self.value_element][self.value_pos:] + return self.default + + def Remove(self, args): + if self.opt_element == -1: + return + if self.opt_element != self.value_element: + del args[self.value_element] + if self.opt_pos: + args[self.opt_element] = args[self.opt_element][:self.opt_pos] + else: + del args[self.opt_element] + +def DetermineInputFileName(cmd): + p = OptPos(cmd, "i", "input", "perf.data") + if p.error: + raise Exception(f"perf command {p.error}") + file_name = p.Value() + if not os.path.exists(file_name): + raise Exception(f"perf command input file '{file_name}' not found") + return file_name + +def ReadOption(args, short_name, long_name, err_prefix, remove=False): + p = OptPos(args, short_name, long_name) + if p.error: + raise Exception(f"{err_prefix}{p.error}") + value = p.Value() + if remove: + p.Remove(args) + return value + +def ExtractOption(args, short_name, long_name, err_prefix): + return ReadOption(args, short_name, long_name, err_prefix, True) + +def ReadPerfOption(args, short_name, long_name): + return ReadOption(args, short_name, long_name, "perf command ") + +def ExtractPerfOption(args, short_name, long_name): + return ExtractOption(args, short_name, long_name, "perf command ") + +def PerfDoubleQuickCommands(cmd, file_name): + cpu_str = ReadPerfOption(cmd, "C", "cpu") + time_str = ReadPerfOption(cmd, "", "time") + # Use double-quick sampling to determine trace data density + times_cmd = ["perf", "script", "--ns", "--input", file_name, "--itrace=qqi"] + if cpu_str != None and cpu_str != "": + times_cmd.append(f"--cpu={cpu_str}") + if time_str != None and time_str != "": + times_cmd.append(f"--time={time_str}") + cnts_cmd = list(times_cmd) + cnts_cmd.append("-Fcpu") + times_cmd.append("-Fcpu,time") + return cnts_cmd, times_cmd + +class CPUTimeRange(): + def __init__(self, cpu): + self.cpu = cpu + self.sample_cnt = 0 + self.time_ranges = None + self.interval = 0 + self.interval_remaining = 0 + self.remaining = 0 + self.tr_pos = 0 + +def CalcTimeRangesByCPU(line, cpu, cpu_time_ranges, max_time): + cpu_time_range = cpu_time_ranges[cpu] + cpu_time_range.remaining -= 1 + cpu_time_range.interval_remaining -= 1 + if cpu_time_range.remaining == 0: + cpu_time_range.time_ranges[cpu_time_range.tr_pos][1] = max_time + return + if cpu_time_range.interval_remaining == 0: + time = TimeVal(line[1][:-1], 0) + time_ranges = cpu_time_range.time_ranges + time_ranges[cpu_time_range.tr_pos][1] = time - 1 + time_ranges.append([time, max_time]) + cpu_time_range.tr_pos += 1 + cpu_time_range.interval_remaining = cpu_time_range.interval + +def CountSamplesByCPU(line, cpu, cpu_time_ranges): + try: + cpu_time_ranges[cpu].sample_cnt += 1 + except: + print("exception") + print("cpu", cpu) + print("len(cpu_time_ranges)", len(cpu_time_ranges)) + raise + +def ProcessCommandOutputLines(cmd, per_cpu, fn, *x): + # Assume CPU number is at beginning of line and enclosed by [] + pat = re.compile(r"\s*\[[0-9]+\]") + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + while True: + if line := p.stdout.readline(): + line = line.decode("utf-8") + if pat.match(line): + line = line.split() + if per_cpu: + # Assumes CPU number is enclosed by [] + cpu = int(line[0][1:-1]) + else: + cpu = 0 + fn(line, cpu, *x) + else: + break + p.wait() + +def IntersectTimeRanges(new_time_ranges, time_ranges): + pos = 0 + new_pos = 0 + # Can assume len(time_ranges) != 0 and len(new_time_ranges) != 0 + # Note also, there *must* be at least one intersection. + while pos < len(time_ranges) and new_pos < len(new_time_ranges): + # new end < old start => no intersection, remove new + if new_time_ranges[new_pos][1] < time_ranges[pos][0]: + del new_time_ranges[new_pos] + continue + # new start > old end => no intersection, check next + if new_time_ranges[new_pos][0] > time_ranges[pos][1]: + pos += 1 + if pos < len(time_ranges): + continue + # no next, so remove remaining + while new_pos < len(new_time_ranges): + del new_time_ranges[new_pos] + return + # Found an intersection + # new start < old start => adjust new start = old start + if new_time_ranges[new_pos][0] < time_ranges[pos][0]: + new_time_ranges[new_pos][0] = time_ranges[pos][0] + # new end > old end => keep the overlap, insert the remainder + if new_time_ranges[new_pos][1] > time_ranges[pos][1]: + r = [ time_ranges[pos][1] + 1, new_time_ranges[new_pos][1] ] + new_time_ranges[new_pos][1] = time_ranges[pos][1] + new_pos += 1 + new_time_ranges.insert(new_pos, r) + continue + # new [start, end] is within old [start, end] + new_pos += 1 + +def SplitTimeRangesByTraceDataDensity(time_ranges, cpus, nr, cmd, file_name, per_cpu, min_size, min_interval, verbosity): + if verbosity.normal: + print("\rAnalyzing...", flush=True, end=" ") + if verbosity.verbose: + print() + cnts_cmd, times_cmd = PerfDoubleQuickCommands(cmd, file_name) + + nr_cpus = cpus[-1] + 1 if per_cpu else 1 + if per_cpu: + nr_cpus = cpus[-1] + 1 + cpu_time_ranges = [ CPUTimeRange(cpu) for cpu in range(nr_cpus) ] + else: + nr_cpus = 1 + cpu_time_ranges = [ CPUTimeRange(-1) ] + + if verbosity.debug: + print("nr_cpus", nr_cpus) + print("cnts_cmd", cnts_cmd) + print("times_cmd", times_cmd) + + # Count the number of "double quick" samples per CPU + ProcessCommandOutputLines(cnts_cmd, per_cpu, CountSamplesByCPU, cpu_time_ranges) + + tot = 0 + mx = 0 + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + tot += cnt + if cnt > mx: + mx = cnt + if verbosity.debug: + print("cpu:", cpu_time_range.cpu, "sample_cnt", cnt) + + if min_size < 1: + min_size = 1 + + if mx < min_size: + # Too little data to be worth splitting + if verbosity.debug: + print("Too little data to split by time") + if nr == 0: + nr = 1 + return [ SplitTimeRangesIntoN(time_ranges, nr, min_interval) ] + + if nr: + divisor = nr + min_size = 1 + else: + divisor = NumberOfCPUs() + + interval = int(round(tot / divisor, 0)) + if interval < min_size: + interval = min_size + + if verbosity.debug: + print("divisor", divisor) + print("min_size", min_size) + print("interval", interval) + + min_time = time_ranges[0][0] + max_time = time_ranges[-1][1] + + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + if cnt == 0: + cpu_time_range.time_ranges = copy.deepcopy(time_ranges) + continue + # Adjust target interval for CPU to give approximately equal interval sizes + # Determine number of intervals, rounding to nearest integer + n = int(round(cnt / interval, 0)) + if n < 1: + n = 1 + # Determine interval size, rounding up + d, m = divmod(cnt, n) + if m: + d += 1 + cpu_time_range.interval = d + cpu_time_range.interval_remaining = d + cpu_time_range.remaining = cnt + # Init. time ranges for each CPU with the start time + cpu_time_range.time_ranges = [ [min_time, max_time] ] + + # Set time ranges so that the same number of "double quick" samples + # will fall into each time range. + ProcessCommandOutputLines(times_cmd, per_cpu, CalcTimeRangesByCPU, cpu_time_ranges, max_time) + + for cpu_time_range in cpu_time_ranges: + if cpu_time_range.sample_cnt: + IntersectTimeRanges(cpu_time_range.time_ranges, time_ranges) + + return [cpu_time_ranges[cpu].time_ranges for cpu in cpus] + +def SplitSingleTimeRangeIntoN(time_range, n): + if n <= 1: + return [time_range] + start = time_range[0] + end = time_range[1] + duration = int((end - start + 1) / n) + if duration < 1: + return [time_range] + time_ranges = [] + for i in range(n): + time_ranges.append([start, start + duration - 1]) + start += duration + time_ranges[-1][1] = end + return time_ranges + +def TimeRangeDuration(r): + return r[1] - r[0] + 1 + +def TotalDuration(time_ranges): + duration = 0 + for r in time_ranges: + duration += TimeRangeDuration(r) + return duration + +def SplitTimeRangesByInterval(time_ranges, interval): + new_ranges = [] + for r in time_ranges: + duration = TimeRangeDuration(r) + n = duration / interval + n = int(round(n, 0)) + new_ranges += SplitSingleTimeRangeIntoN(r, n) + return new_ranges + +def SplitTimeRangesIntoN(time_ranges, n, min_interval): + if n <= len(time_ranges): + return time_ranges + duration = TotalDuration(time_ranges) + interval = duration / n + if interval < min_interval: + interval = min_interval + return SplitTimeRangesByInterval(time_ranges, interval) + +def RecombineTimeRanges(tr): + new_tr = copy.deepcopy(tr) + n = len(new_tr) + i = 1 + while i < len(new_tr): + # if prev end + 1 == cur start, combine them + if new_tr[i - 1][1] + 1 == new_tr[i][0]: + new_tr[i][0] = new_tr[i - 1][0] + del new_tr[i - 1] + else: + i += 1 + return new_tr + +def OpenTimeRangeEnds(time_ranges, min_time, max_time): + if time_ranges[0][0] <= min_time: + time_ranges[0][0] = None + if time_ranges[-1][1] >= max_time: + time_ranges[-1][1] = None + +def BadTimeStr(time_str): + raise Exception(f"perf command bad time option: '{time_str}'\nCheck also 'time of first sample' and 'time of last sample' in perf script --header-only") + +def ValidateTimeRanges(time_ranges, time_str): + n = len(time_ranges) + for i in range(n): + start = time_ranges[i][0] + end = time_ranges[i][1] + if i != 0 and start <= time_ranges[i - 1][1]: + BadTimeStr(time_str) + if start > end: + BadTimeStr(time_str) + +def TimeVal(s, dflt): + s = s.strip() + if s == "": + return dflt + a = s.split(".") + if len(a) > 2: + raise Exception(f"Bad time value'{s}'") + x = int(a[0]) + if x < 0: + raise Exception("Negative time not allowed") + x *= 1000000000 + if len(a) > 1: + x += int((a[1] + "000000000")[:9]) + return x + +def BadCPUStr(cpu_str): + raise Exception(f"perf command bad cpu option: '{cpu_str}'\nCheck also 'nrcpus avail' in perf script --header-only") + +def ParseTimeStr(time_str, min_time, max_time): + if time_str == None or time_str == "": + return [[min_time, max_time]] + time_ranges = [] + for r in time_str.split(): + a = r.split(",") + if len(a) != 2: + BadTimeStr(time_str) + try: + start = TimeVal(a[0], min_time) + end = TimeVal(a[1], max_time) + except: + BadTimeStr(time_str) + time_ranges.append([start, end]) + ValidateTimeRanges(time_ranges, time_str) + return time_ranges + +def ParseCPUStr(cpu_str, nr_cpus): + if cpu_str == None or cpu_str == "": + return [-1] + cpus = [] + for r in cpu_str.split(","): + a = r.split("-") + if len(a) < 1 or len(a) > 2: + BadCPUStr(cpu_str) + try: + start = int(a[0].strip()) + if len(a) > 1: + end = int(a[1].strip()) + else: + end = start + except: + BadCPUStr(cpu_str) + if start < 0 or end < 0 or end < start or end >= nr_cpus: + BadCPUStr(cpu_str) + cpus.extend(range(start, end + 1)) + cpus = list(set(cpus)) # Remove duplicates + cpus.sort() + return cpus + +class ParallelPerf(): + + def __init__(self, a): + for arg_name in vars(a): + setattr(self, arg_name, getattr(a, arg_name)) + self.orig_nr = self.nr + self.orig_cmd = list(self.cmd) + self.perf = self.cmd[0] + if os.path.exists(self.output_dir): + raise Exception(f"Output '{self.output_dir}' already exists") + if self.jobs < 0 or self.nr < 0 or self.interval < 0: + raise Exception("Bad options (negative values): try -h option for help") + if self.nr != 0 and self.interval != 0: + raise Exception("Cannot specify number of time subdivisions and time interval") + if self.jobs == 0: + self.jobs = NumberOfCPUs() + if self.nr == 0 and self.interval == 0: + if self.per_cpu: + self.nr = 1 + else: + self.nr = self.jobs + + def Init(self): + if self.verbosity.debug: + print("cmd", self.cmd) + self.file_name = DetermineInputFileName(self.cmd) + self.hdr = ReadHeader(self.perf, self.file_name) + self.hdr_dict = ParseHeader(self.hdr) + self.cmd_line = HeaderField(self.hdr_dict, "cmdline") + + def ExtractTimeInfo(self): + self.min_time = TimeVal(HeaderField(self.hdr_dict, "time of first sample"), 0) + self.max_time = TimeVal(HeaderField(self.hdr_dict, "time of last sample"), 0) + self.time_str = ExtractPerfOption(self.cmd, "", "time") + self.time_ranges = ParseTimeStr(self.time_str, self.min_time, self.max_time) + if self.verbosity.debug: + print("time_ranges", self.time_ranges) + + def ExtractCPUInfo(self): + if self.per_cpu: + nr_cpus = int(HeaderField(self.hdr_dict, "nrcpus avail")) + self.cpu_str = ExtractPerfOption(self.cmd, "C", "cpu") + if self.cpu_str == None or self.cpu_str == "": + self.cpus = [ x for x in range(nr_cpus) ] + else: + self.cpus = ParseCPUStr(self.cpu_str, nr_cpus) + else: + self.cpu_str = None + self.cpus = [-1] + if self.verbosity.debug: + print("cpus", self.cpus) + + def IsIntelPT(self): + return self.cmd_line.find("intel_pt") >= 0 + + def SplitTimeRanges(self): + if self.IsIntelPT() and self.interval == 0: + self.split_time_ranges_for_each_cpu = \ + SplitTimeRangesByTraceDataDensity(self.time_ranges, self.cpus, self.orig_nr, + self.orig_cmd, self.file_name, self.per_cpu, + self.min_size, self.min_interval, self.verbosity) + elif self.nr: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesIntoN(self.time_ranges, self.nr, self.min_interval) ] + else: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesByInterval(self.time_ranges, self.interval) ] + + def CheckTimeRanges(self): + for tr in self.split_time_ranges_for_each_cpu: + # Re-combined time ranges should be the same + new_tr = RecombineTimeRanges(tr) + if new_tr != self.time_ranges: + if self.verbosity.debug: + print("tr", tr) + print("new_tr", new_tr) + raise Exception("Self test failed!") + + def OpenTimeRangeEnds(self): + for time_ranges in self.split_time_ranges_for_each_cpu: + OpenTimeRangeEnds(time_ranges, self.min_time, self.max_time) + + def CreateWorkList(self): + self.worklist = CreateWorkList(self.cmd, self.pipe_to, self.output_dir, self.cpus, self.split_time_ranges_for_each_cpu) + + def PerfDataRecordedPerCPU(self): + if "--per-thread" in self.cmd_line.split(): + return False + return True + + def DefaultToPerCPU(self): + # --no-per-cpu option takes precedence + if self.no_per_cpu: + return False + if not self.PerfDataRecordedPerCPU(): + return False + # Default to per-cpu for Intel PT data that was recorded per-cpu, + # because decoding can be done for each CPU separately. + if self.IsIntelPT(): + return True + return False + + def Config(self): + self.Init() + self.ExtractTimeInfo() + if not self.per_cpu: + self.per_cpu = self.DefaultToPerCPU() + if self.verbosity.debug: + print("per_cpu", self.per_cpu) + self.ExtractCPUInfo() + self.SplitTimeRanges() + if self.verbosity.self_test: + self.CheckTimeRanges() + # Prefer open-ended time range to starting / ending with min_time / max_time resp. + self.OpenTimeRangeEnds() + self.CreateWorkList() + + def Run(self): + if self.dry_run: + print(len(self.worklist),"jobs:") + for w in self.worklist: + print(w.Command()) + return True + result = RunWork(self.worklist, self.jobs, verbosity=self.verbosity) + if self.verbosity.verbose: + print(glb_prog_name, "done") + return result + +def RunParallelPerf(a): + pp = ParallelPerf(a) + pp.Config() + return pp.Run() + +def Main(args): + ap = argparse.ArgumentParser( + prog=glb_prog_name, formatter_class = argparse.RawDescriptionHelpFormatter, + description = +""" +Run a perf script command multiple times in parallel, using perf script options +--cpu and --time so that each job processes a different chunk of the data. +""", + epilog = +""" +Follow the options by '--' and then the perf script command e.g. + + $ perf record -a -- sleep 10 + $ parallel-perf.py --nr=4 -- perf script --ns + All jobs finished successfully + $ tree parallel-perf-output/ + parallel-perf-output/ + ├── time-range-0 + │ ├── cmd.txt + │ └── out.txt + ├── time-range-1 + │ ├── cmd.txt + │ └── out.txt + ├── time-range-2 + │ ├── cmd.txt + │ └── out.txt + └── time-range-3 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/time-range-0/cmd.txt:perf script --time=,9466.504461499 --ns + parallel-perf-output/time-range-1/cmd.txt:perf script --time=9466.504461500,9469.005396999 --ns + parallel-perf-output/time-range-2/cmd.txt:perf script --time=9469.005397000,9471.506332499 --ns + parallel-perf-output/time-range-3/cmd.txt:perf script --time=9471.506332500, --ns + +Any perf script command can be used, including the use of perf script options +--dlfilter and --script, so that the benefit of running parallel jobs +naturally extends to them also. + +If option --pipe-to is used, standard output is first piped through that +command. Beware, if the command fails (e.g. grep with no matches), it will be +considered a fatal error. + +Final standard output is redirected to files named out.txt in separate +subdirectories under the output directory. Similarly, standard error is +written to files named err.txt. In addition, files named cmd.txt contain the +corresponding perf script command. After processing, err.txt files are removed +if they are empty. + +If any job exits with a non-zero exit code, then all jobs are killed and no +more are started. A message is printed if any job results in a non-empty +err.txt file. + +There is a separate output subdirectory for each time range. If the --per-cpu +option is used, these are further grouped under cpu-n subdirectories, e.g. + + $ parallel-perf.py --per-cpu --nr=2 -- perf script --ns --cpu=0,1 + All jobs finished successfully + $ tree parallel-perf-output + parallel-perf-output/ + ├── cpu-0 + │ ├── time-range-0 + │ │ ├── cmd.txt + │ │ └── out.txt + │ └── time-range-1 + │ ├── cmd.txt + │ └── out.txt + └── cpu-1 + ├── time-range-0 + │ ├── cmd.txt + │ └── out.txt + └── time-range-1 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/cpu-0/time-range-0/cmd.txt:perf script --cpu=0 --time=,9469.005396999 --ns + parallel-perf-output/cpu-0/time-range-1/cmd.txt:perf script --cpu=0 --time=9469.005397000, --ns + parallel-perf-output/cpu-1/time-range-0/cmd.txt:perf script --cpu=1 --time=,9469.005396999 --ns + parallel-perf-output/cpu-1/time-range-1/cmd.txt:perf script --cpu=1 --time=9469.005397000, --ns + +Subdivisions of time range, and cpus if the --per-cpu option is used, are +expressed by the --time and --cpu perf script options respectively. If the +supplied perf script command has a --time option, then that time range is +subdivided, otherwise the time range given by 'time of first sample' to +'time of last sample' is used (refer perf script --header-only). Similarly, the +supplied perf script command may provide a --cpu option, and only those CPUs +will be processed. + +To prevent time intervals becoming too small, the --min-interval option can +be used. + +Note there is special handling for processing Intel PT traces. If an interval is +not specified and the perf record command contained the intel_pt event, then the +time range will be subdivided in order to produce subdivisions that contain +approximately the same amount of trace data. That is accomplished by counting +double-quick (--itrace=qqi) samples, and choosing time ranges that encompass +approximately the same number of samples. In that case, time ranges may not be +the same for each CPU processed. For Intel PT, --per-cpu is the default, but +that can be overridden by --no-per-cpu. Note, for Intel PT, double-quick +decoding produces 1 sample for each PSB synchronization packet, which in turn +come after a certain number of bytes output, determined by psb_period (refer +perf Intel PT documentation). The minimum number of double-quick samples that +will define a time range can be set by the --min_size option, which defaults to +64. +""") + ap.add_argument("-o", "--output-dir", default="parallel-perf-output", help="output directory (default 'parallel-perf-output')") + ap.add_argument("-j", "--jobs", type=int, default=0, help="maximum number of jobs to run in parallel at one time (default is the number of CPUs)") + ap.add_argument("-n", "--nr", type=int, default=0, help="number of time subdivisions (default is the number of jobs)") + ap.add_argument("-i", "--interval", type=float, default=0, help="subdivide the time range using this time interval (in seconds e.g. 0.1 for a tenth of a second)") + ap.add_argument("-c", "--per-cpu", action="store_true", help="process data for each CPU in parallel") + ap.add_argument("-m", "--min-interval", type=float, default=glb_min_interval, help=f"minimum interval (default {glb_min_interval} seconds)") + ap.add_argument("-p", "--pipe-to", help="command to pipe output to (optional)") + ap.add_argument("-N", "--no-per-cpu", action="store_true", help="do not process data for each CPU in parallel") + ap.add_argument("-b", "--min_size", type=int, default=glb_min_samples, help="minimum data size (for Intel PT in PSBs)") + ap.add_argument("-D", "--dry-run", action="store_true", help="do not run any jobs, just show the perf script commands") + ap.add_argument("-q", "--quiet", action="store_true", help="do not print any messages except errors") + ap.add_argument("-v", "--verbose", action="store_true", help="print more messages") + ap.add_argument("-d", "--debug", action="store_true", help="print debugging messages") + cmd_line = list(args) + try: + split_pos = cmd_line.index("--") + cmd = cmd_line[split_pos + 1:] + args = cmd_line[:split_pos] + except: + cmd = None + args = cmd_line + a = ap.parse_args(args=args[1:]) + a.cmd = cmd + a.verbosity = Verbosity(a.quiet, a.verbose, a.debug) + try: + if a.cmd == None: + if len(args) <= 1: + ap.print_help() + return True + raise Exception("Command line must contain '--' before perf command") + return RunParallelPerf(a) + except Exception as e: + print("Fatal error: ", str(e)) + if a.debug: + raise + return False + +if __name__ == "__main__": + if not Main(sys.argv): + sys.exit(1) |