Diffstat (limited to 'tools/perf/scripts/python')
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/Build                   |   5
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/Context.c               | 196
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py  |   7
-rwxr-xr-x  tools/perf/scripts/python/arm-cs-trace-disasm.py                  | 355
-rwxr-xr-x  tools/perf/scripts/python/bin/flamegraph-report                   |   2
-rw-r--r--  tools/perf/scripts/python/bin/gecko-record                        |   2
-rwxr-xr-x  tools/perf/scripts/python/bin/gecko-report                        |   7
-rw-r--r--  tools/perf/scripts/python/bin/intel-pt-events-record              |   4
-rw-r--r--  tools/perf/scripts/python/bin/intel-pt-events-report              |   4
-rwxr-xr-x  tools/perf/scripts/python/bin/stackcollapse-report                |   2
-rwxr-xr-x  tools/perf/scripts/python/bin/task-analyzer-record                |   2
-rwxr-xr-x  tools/perf/scripts/python/bin/task-analyzer-report                |   3
-rw-r--r--  tools/perf/scripts/python/compaction-times.py                     |   2
-rw-r--r--  tools/perf/scripts/python/export-to-postgresql.py                 |  19
-rw-r--r--  tools/perf/scripts/python/export-to-sqlite.py                     |  19
-rwxr-xr-x  tools/perf/scripts/python/exported-sql-viewer.py                  | 121
-rwxr-xr-x  tools/perf/scripts/python/flamegraph.py                           | 207
-rw-r--r--  tools/perf/scripts/python/futex-contention.py                     |  51
-rw-r--r--  tools/perf/scripts/python/gecko.py                                | 395
-rw-r--r--  tools/perf/scripts/python/intel-pt-events.py                      | 426
-rw-r--r--  tools/perf/scripts/python/libxed.py                               | 107
-rw-r--r--  tools/perf/scripts/python/mem-phys-addr.py                        | 177
-rwxr-xr-x  tools/perf/scripts/python/net_dropmonitor.py                      |   4
-rw-r--r--  tools/perf/scripts/python/netdev-times.py                         |  11
-rwxr-xr-x  tools/perf/scripts/python/parallel-perf.py                        | 989
-rwxr-xr-x  tools/perf/scripts/python/task-analyzer.py                        | 934
26 files changed, 3709 insertions(+), 342 deletions(-)
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Build b/tools/perf/scripts/python/Perf-Trace-Util/Build
index 7d0e33ce6aba..be3710c61320 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Build
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Build
@@ -1,3 +1,4 @@
-perf-y += Context.o
+perf-util-y += Context.o
-CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs
+# -Wno-declaration-after-statement: The python headers have mixed code with declarations (decls after asserts, for instance)
+CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-declaration-after-statement
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
index 0b7096847991..60dcfe56d4d9 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -5,84 +5,203 @@
* Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
*/
+/*
+ * Use Py_ssize_t for '#' formats to avoid DeprecationWarning: PY_SSIZE_T_CLEAN
+ * will be required for '#' formats.
+ */
+#define PY_SSIZE_T_CLEAN
+
#include <Python.h>
+#include "../../../util/config.h"
#include "../../../util/trace-event.h"
+#include "../../../util/event.h"
+#include "../../../util/symbol.h"
+#include "../../../util/thread.h"
+#include "../../../util/map.h"
+#include "../../../util/maps.h"
+#include "../../../util/auxtrace.h"
+#include "../../../util/session.h"
+#include "../../../util/srcline.h"
+#include "../../../util/srccode.h"
-#if PY_MAJOR_VERSION < 3
-#define _PyCapsule_GetPointer(arg1, arg2) \
- PyCObject_AsVoidPtr(arg1)
-
-PyMODINIT_FUNC initperf_trace_context(void);
-#else
#define _PyCapsule_GetPointer(arg1, arg2) \
PyCapsule_GetPointer((arg1), (arg2))
+#define _PyBytes_FromStringAndSize(arg1, arg2) \
+ PyBytes_FromStringAndSize((arg1), (arg2))
+#define _PyUnicode_AsUTF8(arg) \
+ PyUnicode_AsUTF8(arg)
PyMODINIT_FUNC PyInit_perf_trace_context(void);
-#endif
-static PyObject *perf_trace_context_common_pc(PyObject *obj, PyObject *args)
+static struct scripting_context *get_args(PyObject *args, const char *name, PyObject **arg2)
{
- static struct scripting_context *scripting_context;
+ int cnt = 1 + !!arg2;
PyObject *context;
- int retval;
- if (!PyArg_ParseTuple(args, "O", &context))
+ if (!PyArg_UnpackTuple(args, name, 1, cnt, &context, arg2))
return NULL;
- scripting_context = _PyCapsule_GetPointer(context, NULL);
- retval = common_pc(scripting_context);
+ return _PyCapsule_GetPointer(context, NULL);
+}
+
+static struct scripting_context *get_scripting_context(PyObject *args)
+{
+ return get_args(args, "context", NULL);
+}
- return Py_BuildValue("i", retval);
+#ifdef HAVE_LIBTRACEEVENT
+static PyObject *perf_trace_context_common_pc(PyObject *obj, PyObject *args)
+{
+ struct scripting_context *c = get_scripting_context(args);
+
+ if (!c)
+ return NULL;
+
+ return Py_BuildValue("i", common_pc(c));
}
static PyObject *perf_trace_context_common_flags(PyObject *obj,
PyObject *args)
{
- static struct scripting_context *scripting_context;
- PyObject *context;
- int retval;
+ struct scripting_context *c = get_scripting_context(args);
- if (!PyArg_ParseTuple(args, "O", &context))
+ if (!c)
return NULL;
- scripting_context = _PyCapsule_GetPointer(context, NULL);
- retval = common_flags(scripting_context);
-
- return Py_BuildValue("i", retval);
+ return Py_BuildValue("i", common_flags(c));
}
static PyObject *perf_trace_context_common_lock_depth(PyObject *obj,
PyObject *args)
{
- static struct scripting_context *scripting_context;
- PyObject *context;
- int retval;
+ struct scripting_context *c = get_scripting_context(args);
+
+ if (!c)
+ return NULL;
+
+ return Py_BuildValue("i", common_lock_depth(c));
+}
+#endif
+
+static PyObject *perf_sample_insn(PyObject *obj, PyObject *args)
+{
+ struct scripting_context *c = get_scripting_context(args);
+
+ if (!c)
+ return NULL;
+
+ if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) {
+ struct machine *machine = maps__machine(thread__maps(c->al->thread));
+
+ script_fetch_insn(c->sample, c->al->thread, machine, /*native_arch=*/true);
+ }
+ if (!c->sample->insn_len)
+ Py_RETURN_NONE; /* N.B. This is a return statement */
- if (!PyArg_ParseTuple(args, "O", &context))
+ return _PyBytes_FromStringAndSize(c->sample->insn, c->sample->insn_len);
+}
+
+static PyObject *perf_set_itrace_options(PyObject *obj, PyObject *args)
+{
+ struct scripting_context *c;
+ const char *itrace_options;
+ int retval = -1;
+ PyObject *str;
+
+ c = get_args(args, "itrace_options", &str);
+ if (!c)
return NULL;
- scripting_context = _PyCapsule_GetPointer(context, NULL);
- retval = common_lock_depth(scripting_context);
+ if (!c->session || !c->session->itrace_synth_opts)
+ goto out;
+ if (c->session->itrace_synth_opts->set) {
+ retval = 1;
+ goto out;
+ }
+
+ itrace_options = _PyUnicode_AsUTF8(str);
+
+ retval = itrace_do_parse_synth_opts(c->session->itrace_synth_opts, itrace_options, 0);
+out:
return Py_BuildValue("i", retval);
}
+static PyObject *perf_sample_src(PyObject *obj, PyObject *args, bool get_srccode)
+{
+ struct scripting_context *c = get_scripting_context(args);
+ unsigned int line = 0;
+ char *srcfile = NULL;
+ char *srccode = NULL;
+ PyObject *result;
+ struct map *map;
+ struct dso *dso;
+ int len = 0;
+ u64 addr;
+
+ if (!c)
+ return NULL;
+
+ map = c->al->map;
+ addr = c->al->addr;
+ dso = map ? map__dso(map) : NULL;
+
+ if (dso)
+ srcfile = get_srcline_split(dso, map__rip_2objdump(map, addr), &line);
+
+ if (get_srccode) {
+ if (srcfile)
+ srccode = find_sourceline(srcfile, line, &len);
+ result = Py_BuildValue("(sIs#)", srcfile, line, srccode, (Py_ssize_t)len);
+ } else {
+ result = Py_BuildValue("(sI)", srcfile, line);
+ }
+
+ free(srcfile);
+
+ return result;
+}
+
+static PyObject *perf_sample_srcline(PyObject *obj, PyObject *args)
+{
+ return perf_sample_src(obj, args, false);
+}
+
+static PyObject *perf_sample_srccode(PyObject *obj, PyObject *args)
+{
+ return perf_sample_src(obj, args, true);
+}
+
+static PyObject *__perf_config_get(PyObject *obj, PyObject *args)
+{
+ const char *config_name;
+
+ if (!PyArg_ParseTuple(args, "s", &config_name))
+ return NULL;
+ return Py_BuildValue("s", perf_config_get(config_name));
+}
+
static PyMethodDef ContextMethods[] = {
+#ifdef HAVE_LIBTRACEEVENT
{ "common_pc", perf_trace_context_common_pc, METH_VARARGS,
"Get the common preempt count event field value."},
{ "common_flags", perf_trace_context_common_flags, METH_VARARGS,
"Get the common flags event field value."},
{ "common_lock_depth", perf_trace_context_common_lock_depth,
METH_VARARGS, "Get the common lock depth event field value."},
+#endif
+ { "perf_sample_insn", perf_sample_insn,
+ METH_VARARGS, "Get the machine code instruction."},
+ { "perf_set_itrace_options", perf_set_itrace_options,
+ METH_VARARGS, "Set --itrace options."},
+ { "perf_sample_srcline", perf_sample_srcline,
+ METH_VARARGS, "Get source file name and line number."},
+ { "perf_sample_srccode", perf_sample_srccode,
+ METH_VARARGS, "Get source file name, line number and line."},
+ { "perf_config_get", __perf_config_get, METH_VARARGS, "Get perf config entry"},
{ NULL, NULL, 0, NULL}
};
-#if PY_MAJOR_VERSION < 3
-PyMODINIT_FUNC initperf_trace_context(void)
-{
- (void) Py_InitModule("perf_trace_context", ContextMethods);
-}
-#else
PyMODINIT_FUNC PyInit_perf_trace_context(void)
{
static struct PyModuleDef moduledef = {
@@ -96,6 +215,11 @@ PyMODINIT_FUNC PyInit_perf_trace_context(void)
NULL, /* m_clear */
NULL, /* m_free */
};
- return PyModule_Create(&moduledef);
+ PyObject *mod;
+
+ mod = PyModule_Create(&moduledef);
+ /* Add perf_script_context to the module so it can be imported */
+ PyObject_SetAttrString(mod, "perf_script_context", Py_None);
+
+ return mod;
}
-#endif
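
From the script side, these helpers are reached through the perf_trace_context
module, passing the perf_script_context object that PyInit_perf_trace_context()
now publishes as a module attribute. A minimal sketch of a report handler using
the new calls (the handler body is illustrative; only the imported names come
from this patch):

    from perf_trace_context import (perf_sample_insn, perf_sample_srcline,
                                    perf_script_context)

    def process_event(param_dict):
        insn = perf_sample_insn(perf_script_context)        # bytes, or None
        src_file, line = perf_sample_srcline(perf_script_context)
        if insn:
            print("%s:%u: %d-byte instruction" % (src_file, line, len(insn)))
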
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
index 7384dcb628c4..b75d31858e54 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -54,6 +54,7 @@ try:
import audit
machine_to_id = {
'x86_64': audit.MACH_86_64,
+ 'aarch64': audit.MACH_AARCH64,
'alpha' : audit.MACH_ALPHA,
'ia64' : audit.MACH_IA64,
'ppc' : audit.MACH_PPC,
@@ -73,9 +74,9 @@ try:
except:
if not audit_package_warned:
audit_package_warned = True
- print("Install the audit-libs-python package to get syscall names.\n"
- "For example:\n # apt-get install python-audit (Ubuntu)"
- "\n # yum install audit-libs-python (Fedora)"
+ print("Install the python-audit package to get syscall names.\n"
+ "For example:\n # apt-get install python3-audit (Ubuntu)"
+ "\n # yum install python3-audit (Fedora)"
"\n etc.\n")
def syscall_name(id):
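
For reference, the machine_to_id table above feeds the audit module's name
lookup. A sketch of the call it enables, assuming the python3-audit bindings
are installed (syscall 64 is write on arm64):

    import audit
    machine_id = audit.MACH_AARCH64          # resolved via machine_to_id above
    print(audit.audit_syscall_to_name(64, machine_id))   # e.g. "write"
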
diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py
new file mode 100755
index 000000000000..ba208c90d631
--- /dev/null
+++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py
@@ -0,0 +1,355 @@
+# SPDX-License-Identifier: GPL-2.0
+# arm-cs-trace-disasm.py: ARM CoreSight Trace Dump With Disassembler
+#
+# Author: Tor Jeremiassen <tor@ti.com>
+# Mathieu Poirier <mathieu.poirier@linaro.org>
+# Leo Yan <leo.yan@linaro.org>
+# Al Grant <Al.Grant@arm.com>
+
+from __future__ import print_function
+import os
+from os import path
+import re
+from subprocess import *
+import argparse
+import platform
+
+from perf_trace_context import perf_sample_srccode, perf_config_get
+
+# Below are some example commands for using this script.
+# Note that a --kcore recording is required for accurate decode
+# due to the alternatives patching mechanism. However, this
+# script only supports reading vmlinux for disassembly dump,
+# meaning that any patched instructions will appear
+# as unpatched, but the instruction ranges themselves will
+# be correct. In addition to this, source line info comes
+# from Perf, and when using kcore there is no debug info. The
+# following lists the supported features in each mode:
+#
+# +-----------+-----------------+------------------+------------------+
+# | Recording | Accurate decode | Source line dump | Disassembly dump |
+# +-----------+-----------------+------------------+------------------+
+# | --kcore | yes | no | yes |
+# | normal | no | yes | yes |
+# +-----------+-----------------+------------------+------------------+
+#
+# Output disassembly with objdump and auto-detect vmlinux
+# (when running on the same machine):
+# perf script -s scripts/python/arm-cs-trace-disasm.py -d
+#
+# Output disassembly with llvm-objdump:
+# perf script -s scripts/python/arm-cs-trace-disasm.py \
+# -- -d llvm-objdump-11 -k path/to/vmlinux
+#
+# Output only source line and symbols:
+# perf script -s scripts/python/arm-cs-trace-disasm.py
+
+def default_objdump():
+ config = perf_config_get("annotate.objdump")
+ return config if config else "objdump"
+
+# Command line parsing.
+def int_arg(v):
+ v = int(v)
+ if v < 0:
+ raise argparse.ArgumentTypeError("Argument must be a positive integer")
+ return v
+
+args = argparse.ArgumentParser()
+args.add_argument("-k", "--vmlinux",
+ help="Set path to vmlinux file. Omit to autodetect if running on same machine")
+args.add_argument("-d", "--objdump", nargs="?", const=default_objdump(),
+ help="Show disassembly. Can also be used to change the objdump path"),
+args.add_argument("-v", "--verbose", action="store_true", help="Enable debugging log")
+args.add_argument("--start-time", type=int_arg, help="Monotonic clock time of sample to start from. "
+ "See 'time' field on samples in -v mode.")
+args.add_argument("--stop-time", type=int_arg, help="Monotonic clock time of sample to stop at. "
+ "See 'time' field on samples in -v mode.")
+args.add_argument("--start-sample", type=int_arg, help="Index of sample to start from. "
+ "See 'index' field on samples in -v mode.")
+args.add_argument("--stop-sample", type=int_arg, help="Index of sample to stop at. "
+ "See 'index' field on samples in -v mode.")
+
+options = args.parse_args()
+if (options.start_time and options.stop_time and
+ options.start_time >= options.stop_time):
+ print("--start-time must less than --stop-time")
+ exit(2)
+if (options.start_sample and options.stop_sample and
+ options.start_sample >= options.stop_sample):
+ print("--start-sample must less than --stop-sample")
+ exit(2)
+
+# Initialize global dicts and regular expression
+disasm_cache = dict()
+cpu_data = dict()
+disasm_re = re.compile(r"^\s*([0-9a-fA-F]+):")
+disasm_func_re = re.compile(r"^\s*([0-9a-fA-F]+)\s.*:")
+cache_size = 64*1024
+sample_idx = -1
+
+glb_source_file_name = None
+glb_line_number = None
+glb_dso = None
+
+kver = platform.release()
+vmlinux_paths = [
+ f"/usr/lib/debug/boot/vmlinux-{kver}.debug",
+ f"/usr/lib/debug/lib/modules/{kver}/vmlinux",
+ f"/lib/modules/{kver}/build/vmlinux",
+ f"/usr/lib/debug/boot/vmlinux-{kver}",
+ f"/boot/vmlinux-{kver}",
+ f"/boot/vmlinux",
+ f"vmlinux"
+]
+
+def get_optional(perf_dict, field):
+ if field in perf_dict:
+ return perf_dict[field]
+ return "[unknown]"
+
+def get_offset(perf_dict, field):
+ if field in perf_dict:
+ return "+%#x" % perf_dict[field]
+ return ""
+
+def find_vmlinux():
+ if hasattr(find_vmlinux, "path"):
+ return find_vmlinux.path
+
+ for v in vmlinux_paths:
+ if os.access(v, os.R_OK):
+ find_vmlinux.path = v
+ break
+ else:
+ find_vmlinux.path = None
+
+ return find_vmlinux.path
+
+def get_dso_file_path(dso_name, dso_build_id):
+ if (dso_name == "[kernel.kallsyms]" or dso_name == "vmlinux"):
+ if (options.vmlinux):
+ return options.vmlinux;
+ else:
+ return find_vmlinux() if find_vmlinux() else dso_name
+
+ if (dso_name == "[vdso]") :
+ append = "/vdso"
+ else:
+ append = "/elf"
+
+ dso_path = os.environ['PERF_BUILDID_DIR'] + "/" + dso_name + "/" + dso_build_id + append;
+ # Replace duplicate slash chars to single slash char
+ dso_path = dso_path.replace('//', '/', 1)
+ return dso_path
+
+def read_disam(dso_fname, dso_start, start_addr, stop_addr):
+ addr_range = str(start_addr) + ":" + str(stop_addr) + ":" + dso_fname
+
+ # Don't let the cache get too big, clear it when it hits max size
+ if (len(disasm_cache) > cache_size):
+ disasm_cache.clear();
+
+ if addr_range in disasm_cache:
+ disasm_output = disasm_cache[addr_range];
+ else:
+ start_addr = start_addr - dso_start;
+ stop_addr = stop_addr - dso_start;
+ disasm = [ options.objdump, "-d", "-z",
+ "--start-address="+format(start_addr,"#x"),
+ "--stop-address="+format(stop_addr,"#x") ]
+ disasm += [ dso_fname ]
+ disasm_output = check_output(disasm).decode('utf-8').split('\n')
+ disasm_cache[addr_range] = disasm_output
+
+ return disasm_output
+
+def print_disam(dso_fname, dso_start, start_addr, stop_addr):
+ for line in read_disam(dso_fname, dso_start, start_addr, stop_addr):
+ m = disasm_func_re.search(line)
+ if m is None:
+ m = disasm_re.search(line)
+ if m is None:
+ continue
+ print("\t" + line)
+
+def print_sample(sample):
+ print("Sample = { cpu: %04d addr: 0x%016x phys_addr: 0x%016x ip: 0x%016x " \
+ "pid: %d tid: %d period: %d time: %d index: %d}" % \
+ (sample['cpu'], sample['addr'], sample['phys_addr'], \
+ sample['ip'], sample['pid'], sample['tid'], \
+ sample['period'], sample['time'], sample_idx))
+
+def trace_begin():
+ print('ARM CoreSight Trace Data Assembler Dump')
+
+def trace_end():
+ print('End')
+
+def trace_unhandled(event_name, context, event_fields_dict):
+ print(' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())]))
+
+def common_start_str(comm, sample):
+ sec = int(sample["time"] / 1000000000)
+ ns = sample["time"] % 1000000000
+ cpu = sample["cpu"]
+ pid = sample["pid"]
+ tid = sample["tid"]
+ return "%16s %5u/%-5u [%04u] %9u.%09u " % (comm, pid, tid, cpu, sec, ns)
+
+# This code is copied from intel-pt-events.py for printing source code
+# line and symbols.
+def print_srccode(comm, param_dict, sample, symbol, dso):
+ ip = sample["ip"]
+ if symbol == "[unknown]":
+ start_str = common_start_str(comm, sample) + ("%x" % ip).rjust(16).ljust(40)
+ else:
+ offs = get_offset(param_dict, "symoff")
+ start_str = common_start_str(comm, sample) + (symbol + offs).ljust(40)
+
+ global glb_source_file_name
+ global glb_line_number
+ global glb_dso
+
+ source_file_name, line_number, source_line = perf_sample_srccode(perf_script_context)
+ if source_file_name:
+ if glb_line_number == line_number and glb_source_file_name == source_file_name:
+ src_str = ""
+ else:
+ if len(source_file_name) > 40:
+ src_file = ("..." + source_file_name[-37:]) + " "
+ else:
+ src_file = source_file_name.ljust(41)
+
+ if source_line is None:
+ src_str = src_file + str(line_number).rjust(4) + " <source not found>"
+ else:
+ src_str = src_file + str(line_number).rjust(4) + " " + source_line
+ glb_dso = None
+ elif dso == glb_dso:
+ src_str = ""
+ else:
+ src_str = dso
+ glb_dso = dso
+
+ glb_line_number = line_number
+ glb_source_file_name = source_file_name
+
+ print(start_str, src_str)
+
+def process_event(param_dict):
+ global cache_size
+ global options
+ global sample_idx
+
+ sample = param_dict["sample"]
+ comm = param_dict["comm"]
+
+ name = param_dict["ev_name"]
+ dso = get_optional(param_dict, "dso")
+ dso_bid = get_optional(param_dict, "dso_bid")
+ dso_start = get_optional(param_dict, "dso_map_start")
+ dso_end = get_optional(param_dict, "dso_map_end")
+ symbol = get_optional(param_dict, "symbol")
+ map_pgoff = get_optional(param_dict, "map_pgoff")
+ # check for valid map offset
+ if (str(map_pgoff) == '[unknown]'):
+ map_pgoff = 0
+
+ cpu = sample["cpu"]
+ ip = sample["ip"]
+ addr = sample["addr"]
+
+ sample_idx += 1
+
+ if (options.start_time and sample["time"] < options.start_time):
+ return
+ if (options.stop_time and sample["time"] > options.stop_time):
+ exit(0)
+ if (options.start_sample and sample_idx < options.start_sample):
+ return
+ if (options.stop_sample and sample_idx > options.stop_sample):
+ exit(0)
+
+ if (options.verbose == True):
+ print("Event type: %s" % name)
+ print_sample(sample)
+
+ # Initialize the CPU data if it's empty, and return directly
+ # if this is the first tracing event for this CPU.
+ if (cpu_data.get(str(cpu) + 'addr') == None):
+ cpu_data[str(cpu) + 'addr'] = addr
+ return
+
+ # If the dso cannot be found we cannot dump the assembler, so bail out
+ if (dso == '[unknown]'):
+ return
+
+ # Validate dso start and end addresses
+ if ((dso_start == '[unknown]') or (dso_end == '[unknown]')):
+ print("Failed to find valid dso map for dso %s" % dso)
+ return
+
+ if (name[0:12] == "instructions"):
+ print_srccode(comm, param_dict, sample, symbol, dso)
+ return
+
+ # Don't proceed if this event is not a branch sample.
+ if (name[0:8] != "branches"):
+ return
+
+ # The format for packet is:
+ #
+ # +------------+------------+------------+
+ # sample_prev: | addr | ip | cpu |
+ # +------------+------------+------------+
+ # sample_next: | addr | ip | cpu |
+ # +------------+------------+------------+
+ #
+ # We need to combine the two continuous packets to get the instruction
+ # range for sample_prev::cpu:
+ #
+ # [ sample_prev::addr .. sample_next::ip ]
+ #
+ # For this purpose, sample_prev::addr is stored in the cpu_data
+ # structure and read back as 'start_addr' when the next packet comes;
+ # sample_next::ip is used to calculate 'stop_addr', with an extra 4
+ # bytes added so that the final objdump output includes the last
+ # instruction at sample_next::ip.
+ start_addr = cpu_data[str(cpu) + 'addr']
+ stop_addr = ip + 4
+
+ # Record for previous sample packet
+ cpu_data[str(cpu) + 'addr'] = addr
+
+ # Filter out zero start_address. Optionally identify CS_ETM_TRACE_ON packet
+ if (start_addr == 0):
+ if ((stop_addr == 4) and (options.verbose == True)):
+ print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu)
+ return
+
+ if (start_addr < int(dso_start) or start_addr > int(dso_end)):
+ print("Start address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (start_addr, int(dso_start), int(dso_end), dso))
+ return
+
+ if (stop_addr < int(dso_start) or stop_addr > int(dso_end)):
+ print("Stop address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (stop_addr, int(dso_start), int(dso_end), dso))
+ return
+
+ if (options.objdump != None):
+ # There is no need to subtract the virtual memory offset for
+ # disassembly of a kernel dso or an executable file dso, so in
+ # those cases set vm_start to zero.
+ if (dso == "[kernel.kallsyms]" or dso_start == 0x400000):
+ dso_vm_start = 0
+ map_pgoff = 0
+ else:
+ dso_vm_start = int(dso_start)
+
+ dso_fname = get_dso_file_path(dso, dso_bid)
+ if path.exists(dso_fname):
+ print_disam(dso_fname, dso_vm_start, start_addr + map_pgoff, stop_addr + map_pgoff)
+ else:
+ print("Failed to find dso %s for address range [ 0x%x .. 0x%x ]" % (dso, start_addr + map_pgoff, stop_addr + map_pgoff))
+
+ print_srccode(comm, param_dict, sample, symbol, dso)
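
The per-CPU pairing described in the packet-format comment above can be
summarized in isolation; a minimal sketch of the idea (hypothetical helper,
not part of the script):

    cpu_last_addr = {}

    def branch_range(cpu, ip, addr):
        """Pair sample_prev::addr with sample_next::ip for one CPU."""
        start = cpu_last_addr.get(cpu)   # sample_prev::addr, if seen
        cpu_last_addr[cpu] = addr        # becomes the next range's start
        if start is None:
            return None                  # first packet on this CPU
        return (start, ip + 4)           # +4 so objdump emits the last insn
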
diff --git a/tools/perf/scripts/python/bin/flamegraph-report b/tools/perf/scripts/python/bin/flamegraph-report
index 53c5dc90c87e..453a6918afbe 100755
--- a/tools/perf/scripts/python/bin/flamegraph-report
+++ b/tools/perf/scripts/python/bin/flamegraph-report
@@ -1,3 +1,3 @@
#!/bin/bash
# description: create flame graphs
-perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py -- "$@"
+perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py "$@"
diff --git a/tools/perf/scripts/python/bin/gecko-record b/tools/perf/scripts/python/bin/gecko-record
new file mode 100644
index 000000000000..f0d1aa55f171
--- /dev/null
+++ b/tools/perf/scripts/python/bin/gecko-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -F 99 -g "$@"
diff --git a/tools/perf/scripts/python/bin/gecko-report b/tools/perf/scripts/python/bin/gecko-report
new file mode 100755
index 000000000000..1867ec8d9757
--- /dev/null
+++ b/tools/perf/scripts/python/bin/gecko-report
@@ -0,0 +1,7 @@
+#!/bin/bash
+# description: create firefox gecko profile json format from perf.data
+if [ "$*" = "-i -" ]; then
+perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py
+else
+perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py -- "$@"
+fi
diff --git a/tools/perf/scripts/python/bin/intel-pt-events-record b/tools/perf/scripts/python/bin/intel-pt-events-record
index 10fe2b6977d4..6b9877cfe23e 100644
--- a/tools/perf/scripts/python/bin/intel-pt-events-record
+++ b/tools/perf/scripts/python/bin/intel-pt-events-record
@@ -1,8 +1,8 @@
#!/bin/bash
#
-# print Intel PT Power Events and PTWRITE. The intel_pt PMU event needs
-# to be specified with appropriate config terms.
+# print Intel PT Events including Power Events and PTWRITE. The intel_pt PMU
+# event needs to be specified with appropriate config terms.
#
if ! echo "$@" | grep -q intel_pt ; then
echo "Options must include the Intel PT event e.g. -e intel_pt/pwr_evt,ptw/"
diff --git a/tools/perf/scripts/python/bin/intel-pt-events-report b/tools/perf/scripts/python/bin/intel-pt-events-report
index 9a9c92fcd026..beeac3fde9db 100644
--- a/tools/perf/scripts/python/bin/intel-pt-events-report
+++ b/tools/perf/scripts/python/bin/intel-pt-events-report
@@ -1,3 +1,3 @@
#!/bin/bash
-# description: print Intel PT Power Events and PTWRITE
-perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/intel-pt-events.py \ No newline at end of file
+# description: print Intel PT Events including Power Events and PTWRITE
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/intel-pt-events.py
diff --git a/tools/perf/scripts/python/bin/stackcollapse-report b/tools/perf/scripts/python/bin/stackcollapse-report
index 356b9656393d..21a356bd27f6 100755
--- a/tools/perf/scripts/python/bin/stackcollapse-report
+++ b/tools/perf/scripts/python/bin/stackcollapse-report
@@ -1,3 +1,3 @@
#!/bin/sh
# description: produce callgraphs in short form for scripting use
-perf script -s "$PERF_EXEC_PATH"/scripts/python/stackcollapse.py -- "$@"
+perf script -s "$PERF_EXEC_PATH"/scripts/python/stackcollapse.py "$@"
diff --git a/tools/perf/scripts/python/bin/task-analyzer-record b/tools/perf/scripts/python/bin/task-analyzer-record
new file mode 100755
index 000000000000..0f6b51bb2767
--- /dev/null
+++ b/tools/perf/scripts/python/bin/task-analyzer-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e sched:sched_switch -e sched:sched_migrate_task "$@"
diff --git a/tools/perf/scripts/python/bin/task-analyzer-report b/tools/perf/scripts/python/bin/task-analyzer-report
new file mode 100755
index 000000000000..4b16a8cc40a0
--- /dev/null
+++ b/tools/perf/scripts/python/bin/task-analyzer-report
@@ -0,0 +1,3 @@
+#!/bin/bash
+# description: analyze timings of tasks
+perf script -s "$PERF_EXEC_PATH"/scripts/python/task-analyzer.py -- "$@"
diff --git a/tools/perf/scripts/python/compaction-times.py b/tools/perf/scripts/python/compaction-times.py
index 2560a042dc6f..9401f7c14747 100644
--- a/tools/perf/scripts/python/compaction-times.py
+++ b/tools/perf/scripts/python/compaction-times.py
@@ -260,7 +260,7 @@ def pr_help():
comm_re = None
pid_re = None
-pid_regex = "^(\d*)-(\d*)$|^(\d*)$"
+pid_regex = r"^(\d*)-(\d*)$|^(\d*)$"
opt_proc = popt.DISP_DFL
opt_disp = topt.DISP_ALL
diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py
index 7bd73a904b4e..3a6bdcd74e60 100644
--- a/tools/perf/scripts/python/export-to-postgresql.py
+++ b/tools/perf/scripts/python/export-to-postgresql.py
@@ -399,7 +399,8 @@ if branches:
'in_tx boolean,'
'call_path_id bigint,'
'insn_count bigint,'
- 'cyc_count bigint)')
+ 'cyc_count bigint,'
+ 'flags integer)')
else:
do_query(query, 'CREATE TABLE samples ('
'id bigint NOT NULL,'
@@ -425,7 +426,8 @@ else:
'in_tx boolean,'
'call_path_id bigint,'
'insn_count bigint,'
- 'cyc_count bigint)')
+ 'cyc_count bigint,'
+ 'flags integer)')
if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'CREATE TABLE call_paths ('
@@ -604,7 +606,8 @@ do_query(query, 'CREATE VIEW samples_view AS '
'in_tx,'
'insn_count,'
'cyc_count,'
- 'CASE WHEN cyc_count=0 THEN CAST(0 AS NUMERIC(20, 2)) ELSE CAST((CAST(insn_count AS FLOAT) / cyc_count) AS NUMERIC(20, 2)) END AS IPC'
+ 'CASE WHEN cyc_count=0 THEN CAST(0 AS NUMERIC(20, 2)) ELSE CAST((CAST(insn_count AS FLOAT) / cyc_count) AS NUMERIC(20, 2)) END AS IPC,'
+ 'flags'
' FROM samples')
do_query(query, 'CREATE VIEW ptwrite_view AS '
@@ -804,7 +807,7 @@ def trace_begin():
comm_table(0, "unknown", 0, 0, 0)
dso_table(0, 0, "unknown", "unknown", "")
symbol_table(0, 0, 0, 0, 0, "unknown")
- sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+ sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
if perf_db_export_calls or perf_db_export_callchains:
call_path_table(0, 0, 0, 0)
call_return_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@@ -1025,11 +1028,11 @@ def branch_type_table(branch_type, name, *x):
value = struct.pack(fmt, 2, 4, branch_type, n, name)
branch_type_file.write(value)
-def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, call_path_id, insn_cnt, cyc_cnt, *x):
+def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, call_path_id, insn_cnt, cyc_cnt, flags, *x):
if branches:
- value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiBiqiqiq", 20, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx, 8, call_path_id, 8, insn_cnt, 8, cyc_cnt)
+ value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiBiqiqiqii", 21, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx, 8, call_path_id, 8, insn_cnt, 8, cyc_cnt, 4, flags)
else:
- value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiBiqiqiq", 24, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx, 8, call_path_id, 8, insn_cnt, 8, cyc_cnt)
+ value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiBiqiqiqii", 25, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx, 8, call_path_id, 8, insn_cnt, 8, cyc_cnt, 4, flags)
sample_file.write(value)
def call_path_table(cp_id, parent_id, symbol_id, ip, *x):
@@ -1055,7 +1058,7 @@ def cbr(id, raw_buf):
cbr = data[0]
MHz = (data[4] + 500) / 1000
percent = ((cbr * 1000 / data[2]) + 5) / 10
- value = struct.pack("!hiqiiiiii", 4, 8, id, 4, cbr, 4, MHz, 4, percent)
+ value = struct.pack("!hiqiiiiii", 4, 8, id, 4, cbr, 4, int(MHz), 4, int(percent))
cbr_file.write(value)
def mwait(id, raw_buf):
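
The struct.pack calls above emit PostgreSQL binary COPY tuples: a big-endian
int16 field count followed by (int32 length, value) pairs, one per column.
A sketch of how the new flags column extends a record (values illustrative):

    import struct
    record = struct.pack("!h", 21)          # field count (branches table)
    record += struct.pack("!iq", 8, 42)     # e.g. one 8-byte bigint column
    record += struct.pack("!ii", 4, 0x100)  # the appended 4-byte flags int
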
diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py
index 8043a7272a56..73c992feb1b9 100644
--- a/tools/perf/scripts/python/export-to-sqlite.py
+++ b/tools/perf/scripts/python/export-to-sqlite.py
@@ -223,7 +223,8 @@ if branches:
'in_tx boolean,'
'call_path_id bigint,'
'insn_count bigint,'
- 'cyc_count bigint)')
+ 'cyc_count bigint,'
+ 'flags integer)')
else:
do_query(query, 'CREATE TABLE samples ('
'id integer NOT NULL PRIMARY KEY,'
@@ -249,7 +250,8 @@ else:
'in_tx boolean,'
'call_path_id bigint,'
'insn_count bigint,'
- 'cyc_count bigint)')
+ 'cyc_count bigint,'
+ 'flags integer)')
if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'CREATE TABLE call_paths ('
@@ -442,7 +444,8 @@ do_query(query, 'CREATE VIEW samples_view AS '
'in_tx,'
'insn_count,'
'cyc_count,'
- 'CASE WHEN cyc_count=0 THEN CAST(0 AS FLOAT) ELSE ROUND(CAST(insn_count AS FLOAT) / cyc_count, 2) END AS IPC'
+ 'CASE WHEN cyc_count=0 THEN CAST(0 AS FLOAT) ELSE ROUND(CAST(insn_count AS FLOAT) / cyc_count, 2) END AS IPC,'
+ 'flags'
' FROM samples')
do_query(query, 'CREATE VIEW ptwrite_view AS '
@@ -584,9 +587,9 @@ branch_type_query = QSqlQuery(db)
branch_type_query.prepare("INSERT INTO branch_types VALUES (?, ?)")
sample_query = QSqlQuery(db)
if branches:
- sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
+ sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
else:
- sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
+ sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
if perf_db_export_calls or perf_db_export_callchains:
call_path_query = QSqlQuery(db)
call_path_query.prepare("INSERT INTO call_paths VALUES (?, ?, ?, ?)")
@@ -618,7 +621,7 @@ def trace_begin():
comm_table(0, "unknown", 0, 0, 0)
dso_table(0, 0, "unknown", "unknown", "")
symbol_table(0, 0, 0, 0, 0, "unknown")
- sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+ sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
if perf_db_export_calls or perf_db_export_callchains:
call_path_table(0, 0, 0, 0)
call_return_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@@ -703,11 +706,11 @@ def sample_table(*x):
if branches:
for xx in x[0:15]:
sample_query.addBindValue(str(xx))
- for xx in x[19:24]:
+ for xx in x[19:25]:
sample_query.addBindValue(str(xx))
do_query_(sample_query)
else:
- bind_exec(sample_query, 24, x)
+ bind_exec(sample_query, 25, x)
def call_path_table(*x):
bind_exec(call_path_query, 4, x)
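
bind_exec is defined earlier in this script (outside the hunks shown); its
contract is presumably to bind the first n positional values and execute,
roughly:

    def bind_exec(q, n, x):            # sketch of the existing helper
        for xx in x[0:n]:
            q.addBindValue(str(xx))
        do_query_(q)
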
diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py
index 26d7be785288..e0b2e7268ef6 100755
--- a/tools/perf/scripts/python/exported-sql-viewer.py
+++ b/tools/perf/scripts/python/exported-sql-viewer.py
@@ -91,6 +91,11 @@
from __future__ import print_function
import sys
+# Only change warnings if the python -W option was not used
+if not sys.warnoptions:
+ import warnings
+ # PySide2 causes deprecation warnings, ignore them.
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
import argparse
import weakref
import threading
@@ -108,6 +113,7 @@ import os
import random
import copy
import math
+from libxed import LibXED
pyside_version_1 = True
if not "--pyside-version-1" in sys.argv:
@@ -125,8 +131,9 @@ if pyside_version_1:
from PySide.QtGui import *
from PySide.QtSql import *
-from decimal import *
-from ctypes import *
+from decimal import Decimal, ROUND_HALF_UP
+from ctypes import CDLL, Structure, create_string_buffer, addressof, sizeof, \
+ c_void_p, c_bool, c_byte, c_char, c_int, c_uint, c_longlong, c_ulonglong
from multiprocessing import Process, Array, Value, Event
# xrange is range in Python3
@@ -670,10 +677,13 @@ class CallGraphModelBase(TreeModel):
# sqlite supports GLOB (text only) which uses * and ? and is case sensitive
if not self.glb.dbref.is_sqlite3:
# Escape % and _
- s = value.replace("%", "\%")
- s = s.replace("_", "\_")
+ s = value.replace("%", "\\%")
+ s = s.replace("_", "\\_")
# Translate * and ? into SQL LIKE pattern characters % and _
- trans = string.maketrans("*?", "%_")
+ if sys.version_info[0] == 3:
+ trans = str.maketrans("*?", "%_")
+ else:
+ trans = string.maketrans("*?", "%_")
match = " LIKE '" + str(s).translate(trans) + "'"
else:
match = " GLOB '" + str(value) + "'"
@@ -768,7 +778,8 @@ class CallGraphModel(CallGraphModelBase):
" FROM calls"
" INNER JOIN call_paths ON calls.call_path_id = call_paths.id"
" INNER JOIN symbols ON call_paths.symbol_id = symbols.id"
- " WHERE symbols.name" + match +
+ " WHERE calls.id <> 0"
+ " AND symbols.name" + match +
" GROUP BY comm_id, thread_id, call_path_id"
" ORDER BY comm_id, thread_id, call_path_id")
@@ -963,7 +974,8 @@ class CallTreeModel(CallGraphModelBase):
" FROM calls"
" INNER JOIN call_paths ON calls.call_path_id = call_paths.id"
" INNER JOIN symbols ON call_paths.symbol_id = symbols.id"
- " WHERE symbols.name" + match +
+ " WHERE calls.id <> 0"
+ " AND symbols.name" + match +
" ORDER BY comm_id, thread_id, call_time, calls.id")
def FindPath(self, query):
@@ -1050,6 +1062,7 @@ class TreeWindowBase(QMdiSubWindow):
child = self.model.index(row, 0, parent)
if child.internalPointer().dbid == dbid:
found = True
+ self.view.setExpanded(parent, True)
self.view.setCurrentIndex(child)
parent = child
break
@@ -1127,6 +1140,7 @@ class CallTreeWindow(TreeWindowBase):
child = self.model.index(row, 0, parent)
if child.internalPointer().dbid == dbid:
found = True
+ self.view.setExpanded(parent, True)
self.view.setCurrentIndex(child)
parent = child
break
@@ -1139,6 +1153,7 @@ class CallTreeWindow(TreeWindowBase):
return
last_child = None
for row in xrange(n):
+ self.view.setExpanded(parent, True)
child = self.model.index(row, 0, parent)
child_call_time = child.internalPointer().call_time
if child_call_time < time:
@@ -1151,9 +1166,11 @@ class CallTreeWindow(TreeWindowBase):
if not last_child:
if not found:
child = self.model.index(0, 0, parent)
+ self.view.setExpanded(parent, True)
self.view.setCurrentIndex(child)
return
found = True
+ self.view.setExpanded(parent, True)
self.view.setCurrentIndex(last_child)
parent = last_child
@@ -3861,7 +3878,7 @@ def CopyTableCellsToClipboard(view, as_csv=False, with_hdr=False):
if with_hdr:
model = indexes[0].model()
for col in range(min_col, max_col + 1):
- val = model.headerData(col, Qt.Horizontal)
+ val = model.headerData(col, Qt.Horizontal, Qt.DisplayRole)
if as_csv:
text += sep + ToCSValue(val)
sep = ","
@@ -4734,94 +4751,6 @@ class MainWindow(QMainWindow):
dialog = AboutDialog(self.glb, self)
dialog.exec_()
-# XED Disassembler
-
-class xed_state_t(Structure):
-
- _fields_ = [
- ("mode", c_int),
- ("width", c_int)
- ]
-
-class XEDInstruction():
-
- def __init__(self, libxed):
- # Current xed_decoded_inst_t structure is 192 bytes. Use 512 to allow for future expansion
- xedd_t = c_byte * 512
- self.xedd = xedd_t()
- self.xedp = addressof(self.xedd)
- libxed.xed_decoded_inst_zero(self.xedp)
- self.state = xed_state_t()
- self.statep = addressof(self.state)
- # Buffer for disassembled instruction text
- self.buffer = create_string_buffer(256)
- self.bufferp = addressof(self.buffer)
-
-class LibXED():
-
- def __init__(self):
- try:
- self.libxed = CDLL("libxed.so")
- except:
- self.libxed = None
- if not self.libxed:
- self.libxed = CDLL("/usr/local/lib/libxed.so")
-
- self.xed_tables_init = self.libxed.xed_tables_init
- self.xed_tables_init.restype = None
- self.xed_tables_init.argtypes = []
-
- self.xed_decoded_inst_zero = self.libxed.xed_decoded_inst_zero
- self.xed_decoded_inst_zero.restype = None
- self.xed_decoded_inst_zero.argtypes = [ c_void_p ]
-
- self.xed_operand_values_set_mode = self.libxed.xed_operand_values_set_mode
- self.xed_operand_values_set_mode.restype = None
- self.xed_operand_values_set_mode.argtypes = [ c_void_p, c_void_p ]
-
- self.xed_decoded_inst_zero_keep_mode = self.libxed.xed_decoded_inst_zero_keep_mode
- self.xed_decoded_inst_zero_keep_mode.restype = None
- self.xed_decoded_inst_zero_keep_mode.argtypes = [ c_void_p ]
-
- self.xed_decode = self.libxed.xed_decode
- self.xed_decode.restype = c_int
- self.xed_decode.argtypes = [ c_void_p, c_void_p, c_uint ]
-
- self.xed_format_context = self.libxed.xed_format_context
- self.xed_format_context.restype = c_uint
- self.xed_format_context.argtypes = [ c_int, c_void_p, c_void_p, c_int, c_ulonglong, c_void_p, c_void_p ]
-
- self.xed_tables_init()
-
- def Instruction(self):
- return XEDInstruction(self)
-
- def SetMode(self, inst, mode):
- if mode:
- inst.state.mode = 4 # 32-bit
- inst.state.width = 4 # 4 bytes
- else:
- inst.state.mode = 1 # 64-bit
- inst.state.width = 8 # 8 bytes
- self.xed_operand_values_set_mode(inst.xedp, inst.statep)
-
- def DisassembleOne(self, inst, bytes_ptr, bytes_cnt, ip):
- self.xed_decoded_inst_zero_keep_mode(inst.xedp)
- err = self.xed_decode(inst.xedp, bytes_ptr, bytes_cnt)
- if err:
- return 0, ""
- # Use AT&T mode (2), alternative is Intel (3)
- ok = self.xed_format_context(2, inst.xedp, inst.bufferp, sizeof(inst.buffer), ip, 0, 0)
- if not ok:
- return 0, ""
- if sys.version_info[0] == 2:
- result = inst.buffer.value
- else:
- result = inst.buffer.value.decode()
- # Return instruction length and the disassembled instruction text
- # For now, assume the length is in byte 166
- return inst.xedd[166], result
-
def TryOpen(file_name):
try:
return open(file_name, "rb")
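
The removed class now lives in the new libxed.py (see the diffstat), with an
unchanged interface, so callers look roughly like this sketch (assuming
libxed.so is installed; buf stands in for raw instruction bytes):

    from ctypes import addressof, create_string_buffer
    from libxed import LibXED

    libxed = LibXED()
    inst = libxed.Instruction()
    libxed.SetMode(inst, 0)                   # 0 selects 64-bit mode
    buf = create_string_buffer(b"\x90", 64)   # a NOP, zero padded
    n, text = libxed.DisassembleOne(inst, addressof(buf), 1, 0x1000)
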
diff --git a/tools/perf/scripts/python/flamegraph.py b/tools/perf/scripts/python/flamegraph.py
index 61f3be9add6b..cf7ce8229a6c 100755
--- a/tools/perf/scripts/python/flamegraph.py
+++ b/tools/perf/scripts/python/flamegraph.py
@@ -13,22 +13,52 @@
# Written by Andreas Gerstmayr <agerstmayr@redhat.com>
# Flame Graphs invented by Brendan Gregg <bgregg@netflix.com>
# Works in tandem with d3-flame-graph by Martin Spier <mspier@netflix.com>
+#
+# pylint: disable=missing-module-docstring
+# pylint: disable=missing-class-docstring
+# pylint: disable=missing-function-docstring
from __future__ import print_function
-import sys
-import os
import argparse
+import hashlib
+import io
import json
-
-
+import os
+import subprocess
+import sys
+import urllib.request
+
+minimal_html = """<head>
+ <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/d3-flamegraph.css">
+</head>
+<body>
+ <div id="chart"></div>
+ <script type="text/javascript" src="https://d3js.org/d3.v7.js"></script>
+ <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/d3-flamegraph.min.js"></script>
+ <script type="text/javascript">
+ const stacks = [/** @flamegraph_json **/];
+ // Note, options is unused.
+ const options = [/** @options_json **/];
+
+ var chart = flamegraph();
+ d3.select("#chart")
+ .datum(stacks[0])
+ .call(chart);
+ </script>
+</body>
+"""
+
+# pylint: disable=too-few-public-methods
class Node:
- def __init__(self, name, libtype=""):
+ def __init__(self, name, libtype):
self.name = name
+ # "root" | "kernel" | ""
+ # "" indicates user space
self.libtype = libtype
self.value = 0
self.children = []
- def toJSON(self):
+ def to_json(self):
return {
"n": self.name,
"l": self.libtype,
@@ -40,25 +70,23 @@ class Node:
class FlameGraphCLI:
def __init__(self, args):
self.args = args
- self.stack = Node("root")
-
- if self.args.format == "html" and \
- not os.path.isfile(self.args.template):
- print("Flame Graph template {} does not exist. Please install "
- "the js-d3-flame-graph (RPM) or libjs-d3-flame-graph (deb) "
- "package, specify an existing flame graph template "
- "(--template PATH) or another output format "
- "(--format FORMAT).".format(self.args.template),
- file=sys.stderr)
- sys.exit(1)
-
- def find_or_create_node(self, node, name, dso):
- libtype = "kernel" if dso == "[kernel.kallsyms]" else ""
- if name is None:
- name = "[unknown]"
+ self.stack = Node("all", "root")
+
+ @staticmethod
+ def get_libtype_from_dso(dso):
+ """
+ when kernel-debuginfo is installed,
+ dso points to /usr/lib/debug/lib/modules/*/vmlinux
+ """
+ if dso and (dso == "[kernel.kallsyms]" or dso.endswith("/vmlinux")):
+ return "kernel"
+ return ""
+
+ @staticmethod
+ def find_or_create_node(node, name, libtype):
for child in node.children:
- if child.name == name and child.libtype == libtype:
+ if child.name == name:
return child
child = Node(name, libtype)
@@ -66,41 +94,124 @@ class FlameGraphCLI:
return child
def process_event(self, event):
- node = self.find_or_create_node(self.stack, event["comm"], None)
+ pid = event.get("sample", {}).get("pid", 0)
+ # event["dso"] sometimes contains /usr/lib/debug/lib/modules/*/vmlinux
+ # for user-space processes; let's use pid for kernel or user-space distinction
+ if pid == 0:
+ comm = event["comm"]
+ libtype = "kernel"
+ else:
+ comm = "{} ({})".format(event["comm"], pid)
+ libtype = ""
+ node = self.find_or_create_node(self.stack, comm, libtype)
+
if "callchain" in event:
- for entry in reversed(event['callchain']):
- node = self.find_or_create_node(
- node, entry.get("sym", {}).get("name"), event.get("dso"))
+ for entry in reversed(event["callchain"]):
+ name = entry.get("sym", {}).get("name", "[unknown]")
+ libtype = self.get_libtype_from_dso(entry.get("dso"))
+ node = self.find_or_create_node(node, name, libtype)
else:
- node = self.find_or_create_node(
- node, entry.get("symbol"), event.get("dso"))
+ name = event.get("symbol", "[unknown]")
+ libtype = self.get_libtype_from_dso(event.get("dso"))
+ node = self.find_or_create_node(node, name, libtype)
node.value += 1
+ def get_report_header(self):
+ if self.args.input == "-":
+ # when this script is invoked with "perf script flamegraph",
+ # no perf.data is created and we cannot read the header of it
+ return ""
+
+ try:
+ output = subprocess.check_output(["perf", "report", "--header-only"])
+ return output.decode("utf-8")
+ except Exception as err: # pylint: disable=broad-except
+ print("Error reading report header: {}".format(err), file=sys.stderr)
+ return ""
+
def trace_end(self):
- json_str = json.dumps(self.stack, default=lambda x: x.toJSON())
+ stacks_json = json.dumps(self.stack, default=lambda x: x.to_json())
if self.args.format == "html":
+ report_header = self.get_report_header()
+ options = {
+ "colorscheme": self.args.colorscheme,
+ "context": report_header
+ }
+ options_json = json.dumps(options)
+
+ template_md5sum = None
+ if self.args.format == "html":
+ if os.path.isfile(self.args.template):
+ template = f"file://{self.args.template}"
+ else:
+ if not self.args.allow_download:
+ print(f"""Warning: Flame Graph template '{self.args.template}'
+does not exist. To avoid this please install a package such as the
+js-d3-flame-graph or libjs-d3-flame-graph, specify an existing flame
+graph template (--template PATH) or use another output format (--format
+FORMAT).""",
+ file=sys.stderr)
+ if self.args.input == "-":
+ print("""Not attempting to download Flame Graph template as script command line
+input is disabled due to using live mode. If you want to download the
+template retry without live mode. For example, use 'perf record -a -g
+-F 99 sleep 60' and 'perf script report flamegraph'. Alternatively,
+download the template from:
+https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/templates/d3-flamegraph-base.html
+and place it at:
+/usr/share/d3-flame-graph/d3-flamegraph-base.html""",
+ file=sys.stderr)
+ quit()
+ s = None
+ while s != "y" and s != "n":
+ s = input("Do you wish to download a template from cdn.jsdelivr.net? (this warning can be suppressed with --allow-download) [yn] ").lower()
+ if s == "n":
+ quit()
+ template = "https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/templates/d3-flamegraph-base.html"
+ template_md5sum = "143e0d06ba69b8370b9848dcd6ae3f36"
+
try:
- with open(self.args.template) as f:
- output_str = f.read().replace("/** @flamegraph_json **/",
- json_str)
- except IOError as e:
- print("Error reading template file: {}".format(e), file=sys.stderr)
- sys.exit(1)
+ with urllib.request.urlopen(template) as template:
+ output_str = "".join([
+ l.decode("utf-8") for l in template.readlines()
+ ])
+ except Exception as err:
+ print(f"Error reading template {template}: {err}\n"
+ "a minimal flame graph will be generated", file=sys.stderr)
+ output_str = minimal_html
+ template_md5sum = None
+
+ if template_md5sum:
+ download_md5sum = hashlib.md5(output_str.encode("utf-8")).hexdigest()
+ if download_md5sum != template_md5sum:
+ s = None
+ while s != "y" and s != "n":
+ s = input(f"""Unexpected template md5sum.
+{download_md5sum} != {template_md5sum}, for:
+{output_str}
+continue?[yn] """).lower()
+ if s == "n":
+ quit()
+
+ output_str = output_str.replace("/** @options_json **/", options_json)
+ output_str = output_str.replace("/** @flamegraph_json **/", stacks_json)
+
output_fn = self.args.output or "flamegraph.html"
else:
- output_str = json_str
+ output_str = stacks_json
output_fn = self.args.output or "stacks.json"
if output_fn == "-":
- sys.stdout.write(output_str)
+ with io.open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False) as out:
+ out.write(output_str)
else:
print("dumping data to {}".format(output_fn))
try:
- with open(output_fn, "w") as out:
+ with io.open(output_fn, "w", encoding="utf-8") as out:
out.write(output_str)
- except IOError as e:
- print("Error writing output file: {}".format(e), file=sys.stderr)
+ except IOError as err:
+ print("Error writing output file: {}".format(err), file=sys.stderr)
sys.exit(1)
@@ -113,12 +224,20 @@ if __name__ == "__main__":
help="output file name")
parser.add_argument("--template",
default="/usr/share/d3-flame-graph/d3-flamegraph-base.html",
- help="path to flamegraph HTML template")
+ help="path to flame graph HTML template")
+ parser.add_argument("--colorscheme",
+ default="blue-green",
+ help="flame graph color scheme",
+ choices=["blue-green", "orange"])
parser.add_argument("-i", "--input",
help=argparse.SUPPRESS)
+ parser.add_argument("--allow-download",
+ default=False,
+ action="store_true",
+ help="allow unprompted downloading of HTML template")
- args = parser.parse_args()
- cli = FlameGraphCLI(args)
+ cli_args = parser.parse_args()
+ cli = FlameGraphCLI(cli_args)
process_event = cli.process_event
trace_end = cli.trace_end
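
For orientation, to_json() produces the node shape d3-flame-graph consumes, so
the stacks JSON spliced into the template looks like this sketch (values
illustrative; the keys beyond "n" and "l" are assumed to map value and
children to "v" and "c" per the d3-flame-graph convention):

    stacks = {"n": "all", "l": "root", "v": 0,
              "c": [{"n": "bash (1234)", "l": "", "v": 3, "c": []}]}
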
diff --git a/tools/perf/scripts/python/futex-contention.py b/tools/perf/scripts/python/futex-contention.py
index 0c4841acf75d..7e884d46f920 100644
--- a/tools/perf/scripts/python/futex-contention.py
+++ b/tools/perf/scripts/python/futex-contention.py
@@ -12,41 +12,46 @@
from __future__ import print_function
-import os, sys
-sys.path.append(os.environ['PERF_EXEC_PATH'] + '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+import os
+import sys
+sys.path.append(os.environ['PERF_EXEC_PATH'] +
+ '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
from Util import *
process_names = {}
thread_thislock = {}
thread_blocktime = {}
-lock_waits = {} # long-lived stats on (tid,lock) blockage elapsed time
-process_names = {} # long-lived pid-to-execname mapping
+lock_waits = {} # long-lived stats on (tid,lock) blockage elapsed time
+process_names = {} # long-lived pid-to-execname mapping
+
def syscalls__sys_enter_futex(event, ctxt, cpu, s, ns, tid, comm, callchain,
- nr, uaddr, op, val, utime, uaddr2, val3):
- cmd = op & FUTEX_CMD_MASK
- if cmd != FUTEX_WAIT:
- return # we don't care about originators of WAKE events
+ nr, uaddr, op, val, utime, uaddr2, val3):
+ cmd = op & FUTEX_CMD_MASK
+ if cmd != FUTEX_WAIT:
+ return # we don't care about originators of WAKE events
+
+ process_names[tid] = comm
+ thread_thislock[tid] = uaddr
+ thread_blocktime[tid] = nsecs(s, ns)
- process_names[tid] = comm
- thread_thislock[tid] = uaddr
- thread_blocktime[tid] = nsecs(s, ns)
def syscalls__sys_exit_futex(event, ctxt, cpu, s, ns, tid, comm, callchain,
- nr, ret):
- if tid in thread_blocktime:
- elapsed = nsecs(s, ns) - thread_blocktime[tid]
- add_stats(lock_waits, (tid, thread_thislock[tid]), elapsed)
- del thread_blocktime[tid]
- del thread_thislock[tid]
+ nr, ret):
+ if tid in thread_blocktime:
+ elapsed = nsecs(s, ns) - thread_blocktime[tid]
+ add_stats(lock_waits, (tid, thread_thislock[tid]), elapsed)
+ del thread_blocktime[tid]
+ del thread_thislock[tid]
+
def trace_begin():
- print("Press control+C to stop and show the summary")
+ print("Press control+C to stop and show the summary")
-def trace_end():
- for (tid, lock) in lock_waits:
- min, max, avg, count = lock_waits[tid, lock]
- print("%s[%d] lock %x contended %d times, %d avg ns" %
- (process_names[tid], tid, lock, count, avg))
+def trace_end():
+ for (tid, lock) in lock_waits:
+ min, max, avg, count = lock_waits[tid, lock]
+ print("%s[%d] lock %x contended %d times, %d avg ns [max: %d ns, min %d ns]" %
+ (process_names[tid], tid, lock, count, avg, max, min))
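
The Util helpers used above behave roughly as follows (a sketch of their
contract, not the exact implementation):

    def nsecs(secs, nsecs):              # timestamp pieces -> nanoseconds
        return secs * 1000000000 + nsecs

    def add_stats(d, key, value):        # track (min, max, avg, count) per key
        if key not in d:
            d[key] = (value, value, value, 1)
        else:
            mn, mx, avg, count = d[key]
            d[key] = (min(mn, value), max(mx, value),
                      (avg * count + value) / (count + 1), count + 1)
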
diff --git a/tools/perf/scripts/python/gecko.py b/tools/perf/scripts/python/gecko.py
new file mode 100644
index 000000000000..bc5a72f94bfa
--- /dev/null
+++ b/tools/perf/scripts/python/gecko.py
@@ -0,0 +1,395 @@
+# gecko.py - Convert perf record output to Firefox's gecko profile format
+# SPDX-License-Identifier: GPL-2.0
+#
+# The script converts perf.data to Gecko Profile Format,
+# which can be read by https://profiler.firefox.com/.
+#
+# Usage:
+#
+# perf record -a -g -F 99 sleep 60
+# perf script report gecko
+#
+# Combined:
+#
+# perf script gecko -F 99 -a sleep 60
+
+import os
+import sys
+import time
+import json
+import string
+import random
+import argparse
+import threading
+import webbrowser
+import urllib.parse
+from os import system
+from functools import reduce
+from dataclasses import dataclass, field
+from http.server import HTTPServer, SimpleHTTPRequestHandler, test
+from typing import List, Dict, Optional, NamedTuple, Set, Tuple, Any
+
+# Add the Perf-Trace-Util library to the Python path
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+StringID = int
+StackID = int
+FrameID = int
+CategoryID = int
+Milliseconds = float
+
+# start_time is initialized only once for all event traces.
+start_time = None
+
+# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/profile.js#L425
+# Follow Brendan Gregg's Flamegraph convention: orange for kernel and yellow for user space by default.
+CATEGORIES = None
+
+# The product name is used by the profiler UI to show the Operating system and Processor.
+PRODUCT = os.popen('uname -op').read().strip()
+
+# store the output file
+output_file = None
+
+# Here key = tid, value = Thread
+tid_to_thread = dict()
+
+# The HTTP server is used to serve the profile to the profiler UI.
+http_server_thread = None
+
+# The category index is used by the profiler UI to show the color of the flame graph.
+USER_CATEGORY_INDEX = 0
+KERNEL_CATEGORY_INDEX = 1
+
+# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156
+class Frame(NamedTuple):
+ string_id: StringID
+ relevantForJS: bool
+ innerWindowID: int
+ implementation: None
+ optimizations: None
+ line: None
+ column: None
+ category: CategoryID
+ subcategory: int
+
+# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216
+class Stack(NamedTuple):
+ prefix_id: Optional[StackID]
+ frame_id: FrameID
+
+# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90
+class Sample(NamedTuple):
+ stack_id: Optional[StackID]
+ time_ms: Milliseconds
+ responsiveness: int
+
+@dataclass
+class Thread:
+ """A builder for a profile of the thread.
+
+ Attributes:
+ comm: Thread command-line (name).
+ pid: process ID of containing process.
+ tid: thread ID.
+ samples: Timeline of profile samples.
+ frameTable: interned stack frame ID -> stack frame.
+ stringTable: interned string ID -> string.
+ stringMap: interned string -> string ID.
+ stackTable: interned stack ID -> stack.
+ stackMap: (stack prefix ID, leaf stack frame ID) -> interned Stack ID.
+ frameMap: Stack Frame string -> interned Frame ID.
+ """
+ comm: str
+ pid: int
+ tid: int
+ samples: List[Sample] = field(default_factory=list)
+ frameTable: List[Frame] = field(default_factory=list)
+ stringTable: List[str] = field(default_factory=list)
+ stringMap: Dict[str, int] = field(default_factory=dict)
+ stackTable: List[Stack] = field(default_factory=list)
+ stackMap: Dict[Tuple[Optional[int], int], int] = field(default_factory=dict)
+ frameMap: Dict[str, int] = field(default_factory=dict)
+
+ def _intern_stack(self, frame_id: int, prefix_id: Optional[int]) -> int:
+ """Gets a matching stack, or saves the new stack. Returns a Stack ID."""
+ key = f"{frame_id}" if prefix_id is None else f"{frame_id},{prefix_id}"
+ # key = (prefix_id, frame_id)
+ stack_id = self.stackMap.get(key)
+ if stack_id is None:
+ # return stack_id
+ stack_id = len(self.stackTable)
+ self.stackTable.append(Stack(prefix_id=prefix_id, frame_id=frame_id))
+ self.stackMap[key] = stack_id
+ return stack_id
+
+ def _intern_string(self, string: str) -> int:
+ """Gets a matching string, or saves the new string. Returns a String ID."""
+ string_id = self.stringMap.get(string)
+ if string_id is not None:
+ return string_id
+ string_id = len(self.stringTable)
+ self.stringTable.append(string)
+ self.stringMap[string] = string_id
+ return string_id
+
+ def _intern_frame(self, frame_str: str) -> int:
+ """Gets a matching stack frame, or saves the new frame. Returns a Frame ID."""
+ frame_id = self.frameMap.get(frame_str)
+ if frame_id is not None:
+ return frame_id
+ frame_id = len(self.frameTable)
+ self.frameMap[frame_str] = frame_id
+ string_id = self._intern_string(frame_str)
+
+ symbol_name_to_category = KERNEL_CATEGORY_INDEX if frame_str.find('kallsyms') != -1 \
+ or frame_str.find('/vmlinux') != -1 \
+ or frame_str.endswith('.ko)') \
+ else USER_CATEGORY_INDEX
+
+ self.frameTable.append(Frame(
+ string_id=string_id,
+ relevantForJS=False,
+ innerWindowID=0,
+ implementation=None,
+ optimizations=None,
+ line=None,
+ column=None,
+ category=symbol_name_to_category,
+ subcategory=None,
+ ))
+ return frame_id
+
+ def _add_sample(self, comm: str, stack: List[str], time_ms: Milliseconds) -> None:
+ """Add a timestamped stack trace sample to the thread builder.
+ Args:
+ comm: command-line (name) of the thread at this sample
+ stack: sampled stack frames. Root first, leaf last.
+ time_ms: timestamp of sample in milliseconds.
+ """
+ # Threads may not set their names right after they are created.
+ # Instead, they might do it later. In such situations, use the latest name they have set.
+ if self.comm != comm:
+ self.comm = comm
+
+ prefix_stack_id = reduce(lambda prefix_id, frame: self._intern_stack
+ (self._intern_frame(frame), prefix_id), stack, None)
+ if prefix_stack_id is not None:
+ self.samples.append(Sample(stack_id=prefix_stack_id,
+ time_ms=time_ms,
+ responsiveness=0))
+
+ def _to_json_dict(self) -> Dict:
+ """Converts current Thread to GeckoThread JSON format."""
+ # Gecko profile format is row-oriented data as List[List],
+ # And a schema for interpreting each index.
+ # Schema:
+ # https://github.com/firefox-devtools/profiler/blob/main/docs-developer/gecko-profile-format.md
+ # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L230
+ return {
+ "tid": self.tid,
+ "pid": self.pid,
+ "name": self.comm,
+ # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L51
+ "markers": {
+ "schema": {
+ "name": 0,
+ "startTime": 1,
+ "endTime": 2,
+ "phase": 3,
+ "category": 4,
+ "data": 5,
+ },
+ "data": [],
+ },
+
+ # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90
+ "samples": {
+ "schema": {
+ "stack": 0,
+ "time": 1,
+ "responsiveness": 2,
+ },
+ "data": self.samples
+ },
+
+ # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156
+ "frameTable": {
+ "schema": {
+ "location": 0,
+ "relevantForJS": 1,
+ "innerWindowID": 2,
+ "implementation": 3,
+ "optimizations": 4,
+ "line": 5,
+ "column": 6,
+ "category": 7,
+ "subcategory": 8,
+ },
+ "data": self.frameTable,
+ },
+
+ # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216
+ "stackTable": {
+ "schema": {
+ "prefix": 0,
+ "frame": 1,
+ },
+ "data": self.stackTable,
+ },
+ "stringTable": self.stringTable,
+ "registerTime": 0,
+ "unregisterTime": None,
+ "processType": "default",
+ }
+
+# Uses the perf script Python interface to parse each
+# event and store the data in the thread builder.
+def process_event(param_dict: Dict) -> None:
+ global start_time
+ global tid_to_thread
+ time_stamp = (param_dict['sample']['time'] // 1000) / 1000
+ pid = param_dict['sample']['pid']
+ tid = param_dict['sample']['tid']
+ comm = param_dict['comm']
+
+ # Start time is the time of the first sample
+ if not start_time:
+ start_time = time_stamp
+
+ # Parse and append the callchain of the current sample into a stack.
+ stack = []
+ if param_dict['callchain']:
+ for call in param_dict['callchain']:
+ if 'sym' not in call:
+ continue
+ stack.append(f'{call["sym"]["name"]} (in {call["dso"]})')
+ if len(stack) != 0:
+ # Reverse the stack, so the root comes first and the leaf last.
+ stack = stack[::-1]
+
+ # During perf record if -g is not used, the callchain is not available.
+ # In that case, the symbol and dso are available in the event parameters.
+ else:
+ func = param_dict['symbol'] if 'symbol' in param_dict else '[unknown]'
+ dso = param_dict['dso'] if 'dso' in param_dict else '[unknown]'
+ stack.append(f'{func} (in {dso})')
+
+ # Add sample to the specific thread.
+ thread = tid_to_thread.get(tid)
+ if thread is None:
+ thread = Thread(comm=comm, pid=pid, tid=tid)
+ tid_to_thread[tid] = thread
+ thread._add_sample(comm=comm, stack=stack, time_ms=time_stamp)
+
+def trace_begin() -> None:
+ global output_file
+ if (output_file is None):
+ print("Staring Firefox Profiler on your default browser...")
+ global http_server_thread
+ http_server_thread = threading.Thread(target=test, args=(CORSRequestHandler, HTTPServer,))
+ http_server_thread.daemon = True
+ http_server_thread.start()
+
+# trace_end runs at the end and aggregates the data into the final JSON
+# object, writes it to the output file, and launches the profiler unless
+# --save-only was given.
+def trace_end() -> None:
+ global output_file
+ threads = [thread._to_json_dict() for thread in tid_to_thread.values()]
+
+ # Schema: https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L305
+ gecko_profile_with_meta = {
+ "meta": {
+ "interval": 1,
+ "processType": 0,
+ "product": PRODUCT,
+ "stackwalk": 1,
+ "debug": 0,
+ "gcpoison": 0,
+ "asyncstack": 1,
+ "startTime": start_time,
+ "shutdownTime": None,
+ "version": 24,
+ "presymbolicated": True,
+ "categories": CATEGORIES,
+ "markerSchema": [],
+ },
+ "libs": [],
+ "threads": threads,
+ "processes": [],
+ "pausedRanges": [],
+ }
+ # Launch the profiler on localhost unless --save-only was specified; otherwise just write the file.
+ if (output_file is None):
+ output_file = 'gecko_profile.json'
+ with open(output_file, 'w') as f:
+ json.dump(gecko_profile_with_meta, f, indent=2)
+ launchFirefox(output_file)
+ time.sleep(1)
+ print(f'[ perf gecko: Captured and wrote into {output_file} ]')
+ else:
+ with open(output_file, 'w') as f:
+ json.dump(gecko_profile_with_meta, f, indent=2)
+ print(f'[ perf gecko: Captured and wrote into {output_file} ]')
+
+# Used to enable Cross-Origin Resource Sharing (CORS) for requests coming from 'https://profiler.firefox.com', allowing it to access resources from this server.
+class CORSRequestHandler(SimpleHTTPRequestHandler):
+ def end_headers (self):
+ self.send_header('Access-Control-Allow-Origin', 'https://profiler.firefox.com')
+ SimpleHTTPRequestHandler.end_headers(self)
+
+# Start a local server to serve the gecko_profile.json file to profiler.firefox.com.
+def launchFirefox(file):
+ safe_string = urllib.parse.quote_plus(f'http://localhost:8000/{file}')
+ url = 'https://profiler.firefox.com/from-url/' + safe_string
+ webbrowser.open(f'{url}')
+
+def main() -> None:
+ global output_file
+ global CATEGORIES
+ parser = argparse.ArgumentParser(description="Convert perf.data to Firefox\'s Gecko Profile format which can be uploaded to profiler.firefox.com for visualization")
+
+ # Add the command-line options
+ # Colors must be defined according to this:
+ # https://github.com/firefox-devtools/profiler/blob/50124adbfa488adba6e2674a8f2618cf34b59cd2/res/css/categories.css
+ parser.add_argument('--user-color', default='yellow', help='Color for the User category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta'])
+ parser.add_argument('--kernel-color', default='orange', help='Color for the Kernel category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta'])
+ # If --save-only is specified, the output will be saved to a file instead of opening Firefox's profiler directly.
+ parser.add_argument('--save-only', help='Save the output to a file instead of opening Firefox\'s profiler')
+
+ # Parse the command-line arguments
+ args = parser.parse_args()
+ # Access the values provided by the user
+ user_color = args.user_color
+ kernel_color = args.kernel_color
+ output_file = args.save_only
+
+ CATEGORIES = [
+ {
+ "name": 'User',
+ "color": user_color,
+ "subcategories": ['Other']
+ },
+ {
+ "name": 'Kernel',
+ "color": kernel_color,
+ "subcategories": ['Other']
+ },
+ ]
+
+if __name__ == '__main__':
+ main()
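
The interning scheme above is the heart of the converter: strings, frames and stacks are deduplicated into tables, and each sample stores only a small integer stack ID. A standalone sketch of that pattern (illustrative only, not part of the patch; it mirrors Thread._intern_*() and the reduce() in _add_sample()):

    #!/usr/bin/env python3
    from functools import reduce
    from typing import Optional

    string_table, string_map = [], {}   # interned strings
    frame_table, frame_map = [], {}     # frame -> string ID
    stack_table, stack_map = [], {}     # (prefix stack ID, frame ID) rows

    def intern_string(s: str) -> int:
        if s not in string_map:
            string_map[s] = len(string_table)
            string_table.append(s)
        return string_map[s]

    def intern_frame(frame: str) -> int:
        if frame not in frame_map:
            frame_map[frame] = len(frame_table)
            frame_table.append(intern_string(frame))
        return frame_map[frame]

    def intern_stack(frame_id: int, prefix_id: Optional[int]) -> int:
        key = (prefix_id, frame_id)
        if key not in stack_map:
            stack_map[key] = len(stack_table)
            stack_table.append(key)
        return stack_map[key]

    # Fold a root-first stack exactly the way _add_sample() does.
    stack = ["main (in /usr/bin/app)", "f (in /usr/bin/app)", "g (in /usr/bin/app)"]
    leaf = reduce(lambda prefix, frame: intern_stack(intern_frame(frame), prefix),
                  stack, None)
    print("sample stack ID:", leaf)     # 2
    print("stack table:", stack_table)  # [(None, 0), (0, 1), (1, 2)]

Interning keeps the emitted JSON small: a deep stack sampled thousands of times costs one table row per unique (prefix, frame) pair plus one integer per sample.
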
diff --git a/tools/perf/scripts/python/intel-pt-events.py b/tools/perf/scripts/python/intel-pt-events.py
index a73847c8f548..346c89bd16d6 100644
--- a/tools/perf/scripts/python/intel-pt-events.py
+++ b/tools/perf/scripts/python/intel-pt-events.py
@@ -1,5 +1,6 @@
-# intel-pt-events.py: Print Intel PT Power Events and PTWRITE
-# Copyright (c) 2017, Intel Corporation.
+# SPDX-License-Identifier: GPL-2.0
+# intel-pt-events.py: Print Intel PT Events including Power Events and PTWRITE
+# Copyright (c) 2017-2021, Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms and conditions of the GNU General Public License,
@@ -10,34 +11,142 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
-from __future__ import print_function
+from __future__ import division, print_function
+import io
import os
import sys
import struct
+import argparse
+import contextlib
+
+from libxed import LibXED
+from ctypes import create_string_buffer, addressof
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-# These perf imports are not used at present
-#from perf_trace_context import *
-#from Core import *
+from perf_trace_context import perf_set_itrace_options, \
+ perf_sample_insn, perf_sample_srccode
+
+try:
+ broken_pipe_exception = BrokenPipeError
+except:
+ broken_pipe_exception = IOError
+
+glb_switch_str = {}
+glb_insn = False
+glb_disassembler = None
+glb_src = False
+glb_source_file_name = None
+glb_line_number = None
+glb_dso = None
+glb_stash_dict = {}
+glb_output = None
+glb_output_pos = 0
+glb_cpu = -1
+glb_time = 0
+
+def get_optional_null(perf_dict, field):
+ if field in perf_dict:
+ return perf_dict[field]
+ return ""
+
+def get_optional_zero(perf_dict, field):
+ if field in perf_dict:
+ return perf_dict[field]
+ return 0
+
+def get_optional_bytes(perf_dict, field):
+ if field in perf_dict:
+ return perf_dict[field]
+ return bytes()
+
+def get_optional(perf_dict, field):
+ if field in perf_dict:
+ return perf_dict[field]
+ return "[unknown]"
+
+def get_offset(perf_dict, field):
+ if field in perf_dict:
+ return "+%#x" % perf_dict[field]
+ return ""
def trace_begin():
- print("Intel PT Power Events and PTWRITE")
+ ap = argparse.ArgumentParser(usage = "", add_help = False)
+ ap.add_argument("--insn-trace", action='store_true')
+ ap.add_argument("--src-trace", action='store_true')
+ ap.add_argument("--all-switch-events", action='store_true')
+ ap.add_argument("--interleave", type=int, nargs='?', const=4, default=0)
+ global glb_args
+ global glb_insn
+ global glb_src
+ glb_args = ap.parse_args()
+ if glb_args.insn_trace:
+ print("Intel PT Instruction Trace")
+ itrace = "i0nsepwxI"
+ glb_insn = True
+ elif glb_args.src_trace:
+ print("Intel PT Source Trace")
+ itrace = "i0nsepwxI"
+ glb_insn = True
+ glb_src = True
+ else:
+ print("Intel PT Branch Trace, Power Events, Event Trace and PTWRITE")
+ itrace = "bepwxI"
+ global glb_disassembler
+ try:
+ glb_disassembler = LibXED()
+ except:
+ glb_disassembler = None
+ perf_set_itrace_options(perf_script_context, itrace)
def trace_end():
+ if glb_args.interleave:
+ flush_stashed_output()
print("End")
def trace_unhandled(event_name, context, event_fields_dict):
print(' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())]))
+def stash_output():
+ global glb_stash_dict
+ global glb_output_pos
+ output_str = glb_output.getvalue()[glb_output_pos:]
+ n = len(output_str)
+ if n:
+ glb_output_pos += n
+ if glb_cpu not in glb_stash_dict:
+ glb_stash_dict[glb_cpu] = []
+ glb_stash_dict[glb_cpu].append(output_str)
+
+def flush_stashed_output():
+ global glb_stash_dict
+ while glb_stash_dict:
+ cpus = list(glb_stash_dict.keys())
+ # Output at most glb_args.interleave output strings per cpu
+ for cpu in cpus:
+ items = glb_stash_dict[cpu]
+ countdown = glb_args.interleave
+ while len(items) and countdown:
+ sys.stdout.write(items[0])
+ del items[0]
+ countdown -= 1
+ if not items:
+ del glb_stash_dict[cpu]
+
def print_ptwrite(raw_buf):
data = struct.unpack_from("<IQ", raw_buf)
flags = data[0]
payload = data[1]
exact_ip = flags & 1
- print("IP: %u payload: %#x" % (exact_ip, payload), end=' ')
+ try:
+ s = payload.to_bytes(8, "little").decode("ascii").rstrip("\x00")
+ if not s.isprintable():
+ s = ""
+ except:
+ s = ""
+ print("IP: %u payload: %#x" % (exact_ip, payload), s, end=' ')
def print_cbr(raw_buf):
data = struct.unpack_from("<BBBBII", raw_buf)
@@ -77,58 +186,309 @@ def print_pwrx(raw_buf):
print("deepest cstate: %u last cstate: %u wake reason: %#x" %
(deepest_cstate, last_cstate, wake_reason), end=' ')
-def print_common_start(comm, sample, name):
+def print_psb(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ offset = data[1]
+ print("offset: %#x" % (offset), end=' ')
+
+glb_cfe = ["", "INTR", "IRET", "SMI", "RSM", "SIPI", "INIT", "VMENTRY", "VMEXIT",
+ "VMEXIT_INTR", "SHUTDOWN", "", "UINT", "UIRET"] + [""] * 18
+glb_evd = ["", "PFA", "VMXQ", "VMXR"] + [""] * 60
+
+def print_evt(raw_buf):
+ data = struct.unpack_from("<BBH", raw_buf)
+ typ = data[0] & 0x1f
+ ip_flag = (data[0] & 0x80) >> 7
+ vector = data[1]
+ evd_cnt = data[2]
+ s = glb_cfe[typ]
+ if s:
+ print(" cfe: %s IP: %u vector: %u" % (s, ip_flag, vector), end=' ')
+ else:
+ print(" cfe: %u IP: %u vector: %u" % (typ, ip_flag, vector), end=' ')
+ pos = 4
+ for i in range(evd_cnt):
+ data = struct.unpack_from("<QQ", raw_buf)
+ et = data[0] & 0x3f
+ s = glb_evd[et]
+ if s:
+ print("%s: %#x" % (s, data[1]), end=' ')
+ else:
+ print("EVD_%u: %#x" % (et, data[1]), end=' ')
+
+def print_iflag(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ iflag = data[0] & 1
+ old_iflag = iflag ^ 1
+ via_branch = data[0] & 2
+ branch_ip = data[1]
+ if via_branch:
+ s = "via"
+ else:
+ s = "non"
+ print("IFLAG: %u->%u %s branch" % (old_iflag, iflag, s), end=' ')
+
+def common_start_str(comm, sample):
ts = sample["time"]
cpu = sample["cpu"]
pid = sample["pid"]
tid = sample["tid"]
- print("%16s %5u/%-5u [%03u] %9u.%09u %7s:" %
- (comm, pid, tid, cpu, ts / 1000000000, ts %1000000000, name),
- end=' ')
+ if "machine_pid" in sample:
+ machine_pid = sample["machine_pid"]
+ vcpu = sample["vcpu"]
+ return "VM:%5d VCPU:%03d %16s %5u/%-5u [%03u] %9u.%09u " % (machine_pid, vcpu, comm, pid, tid, cpu, ts / 1000000000, ts %1000000000)
+ else:
+ return "%16s %5u/%-5u [%03u] %9u.%09u " % (comm, pid, tid, cpu, ts / 1000000000, ts %1000000000)
+
+def print_common_start(comm, sample, name):
+ flags_disp = get_optional_null(sample, "flags_disp")
+ # Unused fields:
+ # period = sample["period"]
+ # phys_addr = sample["phys_addr"]
+ # weight = sample["weight"]
+ # transaction = sample["transaction"]
+ # cpumode = get_optional_zero(sample, "cpumode")
+ print(common_start_str(comm, sample) + "%8s %21s" % (name, flags_disp), end=' ')
+
+def print_instructions_start(comm, sample):
+ if "x" in get_optional_null(sample, "flags"):
+ print(common_start_str(comm, sample) + "x", end=' ')
+ else:
+ print(common_start_str(comm, sample), end=' ')
+
+def disassem(insn, ip):
+ inst = glb_disassembler.Instruction()
+ glb_disassembler.SetMode(inst, 0) # Assume 64-bit
+ buf = create_string_buffer(64)
+ buf.value = insn
+ return glb_disassembler.DisassembleOne(inst, addressof(buf), len(insn), ip)
+
+def print_common_ip(param_dict, sample, symbol, dso):
+ ip = sample["ip"]
+ offs = get_offset(param_dict, "symoff")
+ if "cyc_cnt" in sample:
+ cyc_cnt = sample["cyc_cnt"]
+ insn_cnt = get_optional_zero(sample, "insn_cnt")
+ ipc_str = " IPC: %#.2f (%u/%u)" % (insn_cnt / cyc_cnt, insn_cnt, cyc_cnt)
+ else:
+ ipc_str = ""
+ if glb_insn and glb_disassembler is not None:
+ insn = perf_sample_insn(perf_script_context)
+ if insn and len(insn):
+ cnt, text = disassem(insn, ip)
+ byte_str = ("%x" % ip).rjust(16)
+ if sys.version_info.major >= 3:
+ for k in range(cnt):
+ byte_str += " %02x" % insn[k]
+ else:
+ for k in xrange(cnt):
+ byte_str += " %02x" % ord(insn[k])
+ print("%-40s %-30s" % (byte_str, text), end=' ')
+ print("%s%s (%s)" % (symbol, offs, dso), end=' ')
+ else:
+ print("%16x %s%s (%s)" % (ip, symbol, offs, dso), end=' ')
+ if "addr_correlates_sym" in sample:
+ addr = sample["addr"]
+ dso = get_optional(sample, "addr_dso")
+ symbol = get_optional(sample, "addr_symbol")
+ offs = get_offset(sample, "addr_symoff")
+ print("=> %x %s%s (%s)%s" % (addr, symbol, offs, dso, ipc_str))
+ else:
+ print(ipc_str)
-def print_common_ip(sample, symbol, dso):
+def print_srccode(comm, param_dict, sample, symbol, dso, with_insn):
ip = sample["ip"]
- print("%16x %s (%s)" % (ip, symbol, dso))
+ if symbol == "[unknown]":
+ start_str = common_start_str(comm, sample) + ("%x" % ip).rjust(16).ljust(40)
+ else:
+ offs = get_offset(param_dict, "symoff")
+ start_str = common_start_str(comm, sample) + (symbol + offs).ljust(40)
-def process_event(param_dict):
- event_attr = param_dict["attr"]
- sample = param_dict["sample"]
- raw_buf = param_dict["raw_buf"]
+ if with_insn and glb_insn and glb_disassembler is not None:
+ insn = perf_sample_insn(perf_script_context)
+ if insn and len(insn):
+ cnt, text = disassem(insn, ip)
+ start_str += text.ljust(30)
+
+ global glb_source_file_name
+ global glb_line_number
+ global glb_dso
+
+ source_file_name, line_number, source_line = perf_sample_srccode(perf_script_context)
+ if source_file_name:
+ if glb_line_number == line_number and glb_source_file_name == source_file_name:
+ src_str = ""
+ else:
+ if len(source_file_name) > 40:
+ src_file = ("..." + source_file_name[-37:]) + " "
+ else:
+ src_file = source_file_name.ljust(41)
+ if source_line is None:
+ src_str = src_file + str(line_number).rjust(4) + " <source not found>"
+ else:
+ src_str = src_file + str(line_number).rjust(4) + " " + source_line
+ glb_dso = None
+ elif dso == glb_dso:
+ src_str = ""
+ else:
+ src_str = dso
+ glb_dso = dso
+
+ glb_line_number = line_number
+ glb_source_file_name = source_file_name
+
+ print(start_str, src_str)
+
+def do_process_event(param_dict):
+ sample = param_dict["sample"]
+ raw_buf = param_dict["raw_buf"]
comm = param_dict["comm"]
name = param_dict["ev_name"]
+ # Unused fields:
+ # callchain = param_dict["callchain"]
+ # brstack = param_dict["brstack"]
+ # brstacksym = param_dict["brstacksym"]
+ # event_attr = param_dict["attr"]
# Symbol and dso info are not always resolved
- if "dso" in param_dict:
- dso = param_dict["dso"]
- else:
- dso = "[unknown]"
+ dso = get_optional(param_dict, "dso")
+ symbol = get_optional(param_dict, "symbol")
- if "symbol" in param_dict:
- symbol = param_dict["symbol"]
- else:
- symbol = "[unknown]"
+ cpu = sample["cpu"]
+ if cpu in glb_switch_str:
+ print(glb_switch_str[cpu])
+ del glb_switch_str[cpu]
- if name == "ptwrite":
+ if name.startswith("instructions"):
+ if glb_src:
+ print_srccode(comm, param_dict, sample, symbol, dso, True)
+ else:
+ print_instructions_start(comm, sample)
+ print_common_ip(param_dict, sample, symbol, dso)
+ elif name.startswith("branches"):
+ if glb_src:
+ print_srccode(comm, param_dict, sample, symbol, dso, False)
+ else:
+ print_common_start(comm, sample, name)
+ print_common_ip(param_dict, sample, symbol, dso)
+ elif name == "ptwrite":
print_common_start(comm, sample, name)
print_ptwrite(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
elif name == "cbr":
print_common_start(comm, sample, name)
print_cbr(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
elif name == "mwait":
print_common_start(comm, sample, name)
print_mwait(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
elif name == "pwre":
print_common_start(comm, sample, name)
print_pwre(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
elif name == "exstop":
print_common_start(comm, sample, name)
print_exstop(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
elif name == "pwrx":
print_common_start(comm, sample, name)
print_pwrx(raw_buf)
- print_common_ip(sample, symbol, dso)
+ print_common_ip(param_dict, sample, symbol, dso)
+ elif name == "psb":
+ print_common_start(comm, sample, name)
+ print_psb(raw_buf)
+ print_common_ip(param_dict, sample, symbol, dso)
+ elif name == "evt":
+ print_common_start(comm, sample, name)
+ print_evt(raw_buf)
+ print_common_ip(param_dict, sample, symbol, dso)
+ elif name == "iflag":
+ print_common_start(comm, sample, name)
+ print_iflag(raw_buf)
+ print_common_ip(param_dict, sample, symbol, dso)
+ else:
+ print_common_start(comm, sample, name)
+ print_common_ip(param_dict, sample, symbol, dso)
+
+def interleave_events(param_dict):
+ global glb_cpu
+ global glb_time
+ global glb_output
+ global glb_output_pos
+
+ sample = param_dict["sample"]
+ glb_cpu = sample["cpu"]
+ ts = sample["time"]
+
+ if glb_time != ts:
+ glb_time = ts
+ flush_stashed_output()
+
+ glb_output_pos = 0
+ with contextlib.redirect_stdout(io.StringIO()) as glb_output:
+ do_process_event(param_dict)
+
+ stash_output()
+
+def process_event(param_dict):
+ try:
+ if glb_args.interleave:
+ interleave_events(param_dict)
+ else:
+ do_process_event(param_dict)
+ except broken_pipe_exception:
+ # Stop python printing broken pipe errors and traceback
+ sys.stdout = open(os.devnull, 'w')
+ sys.exit(1)
+
+def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x):
+ if glb_args.interleave:
+ flush_stashed_output()
+ if len(x) >= 2 and x[0]:
+ machine_pid = x[0]
+ vcpu = x[1]
+ else:
+ machine_pid = 0
+ vcpu = -1
+ try:
+ if machine_pid:
+ print("VM:%5d VCPU:%03d %16s %5u/%-5u [%03u] %9u.%09u error type %u code %u: %s ip 0x%16x" %
+ (machine_pid, vcpu, "Trace error", pid, tid, cpu, ts / 1000000000, ts %1000000000, typ, code, msg, ip))
+ else:
+ print("%16s %5u/%-5u [%03u] %9u.%09u error type %u code %u: %s ip 0x%16x" %
+ ("Trace error", pid, tid, cpu, ts / 1000000000, ts %1000000000, typ, code, msg, ip))
+ except broken_pipe_exception:
+ # Stop python printing broken pipe errors and traceback
+ sys.stdout = open(os.devnull, 'w')
+ sys.exit(1)
+
+def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_preempt, *x):
+ if glb_args.interleave:
+ flush_stashed_output()
+ if out:
+ out_str = "Switch out "
+ else:
+ out_str = "Switch In "
+ if out_preempt:
+ preempt_str = "preempt"
+ else:
+ preempt_str = ""
+ if len(x) >= 2 and x[0]:
+ machine_pid = x[0]
+ vcpu = x[1]
+ else:
+ vcpu = None
+ if machine_pid == -1:
+ machine_str = ""
+ elif vcpu is None:
+ machine_str = "machine PID %d" % machine_pid
+ else:
+ machine_str = "machine PID %d VCPU %d" % (machine_pid, vcpu)
+ switch_str = "%16s %5d/%-5d [%03u] %9u.%09u %5d/%-5d %s %s" % \
+ (out_str, pid, tid, cpu, ts / 1000000000, ts %1000000000, np_pid, np_tid, machine_str, preempt_str)
+ if glb_args.all_switch_events:
+ print(switch_str)
+ else:
+ global glb_switch_str
+ glb_switch_str[cpu] = switch_str
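
The --interleave option above captures each event's output with contextlib.redirect_stdout() into a StringIO, stashes the text per CPU, and emits a few chunks per CPU whenever the timestamp changes. A minimal sketch of that technique outside perf (illustrative; the real script also flushes at trace_end() and on context switches and errors):

    #!/usr/bin/env python3
    import contextlib
    import io
    import sys

    stash = {}       # cpu -> list of captured output strings
    INTERLEAVE = 2   # chunks to emit per CPU per flush round

    def emit(cpu, text):
        # Stand-in for do_process_event(): capture whatever it prints.
        with contextlib.redirect_stdout(io.StringIO()) as out:
            print(text)
        stash.setdefault(cpu, []).append(out.getvalue())

    def flush():
        while stash:
            for cpu in list(stash):
                items = stash[cpu]
                for chunk in items[:INTERLEAVE]:
                    sys.stdout.write(chunk)
                del items[:INTERLEAVE]
                if not items:
                    del stash[cpu]

    for i in range(3):
        emit(0, f"cpu0 event {i}")
        emit(1, f"cpu1 event {i}")
    flush()   # two cpu0 lines, two cpu1 lines, then the remainder
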
diff --git a/tools/perf/scripts/python/libxed.py b/tools/perf/scripts/python/libxed.py
new file mode 100644
index 000000000000..2c70a5a7eb9c
--- /dev/null
+++ b/tools/perf/scripts/python/libxed.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: GPL-2.0
+# libxed.py: Python wrapper for libxed.so
+# Copyright (c) 2014-2021, Intel Corporation.
+
+# To use Intel XED, libxed.so must be present. To build and install
+# libxed.so:
+# git clone https://github.com/intelxed/mbuild.git mbuild
+# git clone https://github.com/intelxed/xed
+# cd xed
+# ./mfile.py --share
+# sudo ./mfile.py --prefix=/usr/local install
+# sudo ldconfig
+#
+
+import sys
+
+from ctypes import CDLL, Structure, create_string_buffer, addressof, sizeof, \
+ c_void_p, c_bool, c_byte, c_char, c_int, c_uint, c_longlong, c_ulonglong
+
+# XED Disassembler
+
+class xed_state_t(Structure):
+
+ _fields_ = [
+ ("mode", c_int),
+ ("width", c_int)
+ ]
+
+class XEDInstruction():
+
+ def __init__(self, libxed):
+ # Current xed_decoded_inst_t structure is 192 bytes. Use 512 to allow for future expansion
+ xedd_t = c_byte * 512
+ self.xedd = xedd_t()
+ self.xedp = addressof(self.xedd)
+ libxed.xed_decoded_inst_zero(self.xedp)
+ self.state = xed_state_t()
+ self.statep = addressof(self.state)
+ # Buffer for disassembled instruction text
+ self.buffer = create_string_buffer(256)
+ self.bufferp = addressof(self.buffer)
+
+class LibXED():
+
+ def __init__(self):
+ try:
+ self.libxed = CDLL("libxed.so")
+ except:
+ self.libxed = None
+ if not self.libxed:
+ self.libxed = CDLL("/usr/local/lib/libxed.so")
+
+ self.xed_tables_init = self.libxed.xed_tables_init
+ self.xed_tables_init.restype = None
+ self.xed_tables_init.argtypes = []
+
+ self.xed_decoded_inst_zero = self.libxed.xed_decoded_inst_zero
+ self.xed_decoded_inst_zero.restype = None
+ self.xed_decoded_inst_zero.argtypes = [ c_void_p ]
+
+ self.xed_operand_values_set_mode = self.libxed.xed_operand_values_set_mode
+ self.xed_operand_values_set_mode.restype = None
+ self.xed_operand_values_set_mode.argtypes = [ c_void_p, c_void_p ]
+
+ self.xed_decoded_inst_zero_keep_mode = self.libxed.xed_decoded_inst_zero_keep_mode
+ self.xed_decoded_inst_zero_keep_mode.restype = None
+ self.xed_decoded_inst_zero_keep_mode.argtypes = [ c_void_p ]
+
+ self.xed_decode = self.libxed.xed_decode
+ self.xed_decode.restype = c_int
+ self.xed_decode.argtypes = [ c_void_p, c_void_p, c_uint ]
+
+ self.xed_format_context = self.libxed.xed_format_context
+ self.xed_format_context.restype = c_uint
+ self.xed_format_context.argtypes = [ c_int, c_void_p, c_void_p, c_int, c_ulonglong, c_void_p, c_void_p ]
+
+ self.xed_tables_init()
+
+ def Instruction(self):
+ return XEDInstruction(self)
+
+ def SetMode(self, inst, mode):
+ if mode:
+ inst.state.mode = 4 # 32-bit
+ inst.state.width = 4 # 4 bytes
+ else:
+ inst.state.mode = 1 # 64-bit
+ inst.state.width = 8 # 8 bytes
+ self.xed_operand_values_set_mode(inst.xedp, inst.statep)
+
+ def DisassembleOne(self, inst, bytes_ptr, bytes_cnt, ip):
+ self.xed_decoded_inst_zero_keep_mode(inst.xedp)
+ err = self.xed_decode(inst.xedp, bytes_ptr, bytes_cnt)
+ if err:
+ return 0, ""
+ # Use AT&T mode (2), alternative is Intel (3)
+ ok = self.xed_format_context(2, inst.xedp, inst.bufferp, sizeof(inst.buffer), ip, 0, 0)
+ if not ok:
+ return 0, ""
+ if sys.version_info[0] == 2:
+ result = inst.buffer.value
+ else:
+ result = inst.buffer.value.decode()
+ # Return instruction length and the disassembled instruction text
+ # For now, assume the length is in byte 166
+ return inst.xedd[166], result
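
A short usage sketch for the wrapper (assuming libxed.so is installed as described in the header comment and that this file is importable as libxed); it follows the same call sequence that disassem() in intel-pt-events.py uses:

    #!/usr/bin/env python3
    from ctypes import create_string_buffer, addressof
    from libxed import LibXED

    xed = LibXED()
    inst = xed.Instruction()
    xed.SetMode(inst, 0)        # 0 selects 64-bit mode

    code = b"\x48\x89\xe5"      # encoding of "mov %rsp, %rbp"
    buf = create_string_buffer(64)
    buf.value = code
    cnt, text = xed.DisassembleOne(inst, addressof(buf), len(code), 0x1000)
    print(cnt, text)            # e.g. "3 mov %rsp, %rbp"
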
diff --git a/tools/perf/scripts/python/mem-phys-addr.py b/tools/perf/scripts/python/mem-phys-addr.py
index 1f332e72b9b0..5e237a5a5f1b 100644
--- a/tools/perf/scripts/python/mem-phys-addr.py
+++ b/tools/perf/scripts/python/mem-phys-addr.py
@@ -3,98 +3,125 @@
#
# Copyright (c) 2018, Intel Corporation.
-from __future__ import division
-from __future__ import print_function
-
import os
import sys
-import struct
import re
import bisect
import collections
+from dataclasses import dataclass
+from typing import (Dict, Optional)
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+ '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+@dataclass(frozen=True)
+class IomemEntry:
+ """Read from a line in /proc/iomem"""
+ begin: int
+ end: int
+ indent: int
+ label: str
-#physical address ranges for System RAM
-system_ram = []
-#physical address ranges for Persistent Memory
-pmem = []
-#file object for proc iomem
-f = None
-#Count for each type of memory
-load_mem_type_cnt = collections.Counter()
-#perf event name
-event_name = None
+# Physical memory layout from /proc/iomem. Key is the indent level,
+# value is a list of ranges at that level.
+iomem: Dict[int, list[IomemEntry]] = collections.defaultdict(list)
+# Child nodes from the iomem parent.
+children: Dict[IomemEntry, set[IomemEntry]] = collections.defaultdict(set)
+# Maximum indent seen before an entry in the iomem file.
+max_indent: int = 0
+# Count for each range of memory.
+load_mem_type_cnt: Dict[IomemEntry, int] = collections.Counter()
+# Perf event name set from the first sample in the data.
+event_name: Optional[str] = None
def parse_iomem():
- global f
- f = open('/proc/iomem', 'r')
- for i, j in enumerate(f):
- m = re.split('-|:',j,2)
- if m[2].strip() == 'System RAM':
- system_ram.append(int(m[0], 16))
- system_ram.append(int(m[1], 16))
- if m[2].strip() == 'Persistent Memory':
- pmem.append(int(m[0], 16))
- pmem.append(int(m[1], 16))
+ """Populate iomem from /proc/iomem file"""
+ global iomem
+ global max_indent
+ global children
+ with open('/proc/iomem', 'r', encoding='ascii') as f:
+ for line in f:
+ indent = 0
+ while line[indent] == ' ':
+ indent += 1
+ if indent > max_indent:
+ max_indent = indent
+ m = re.split('-|:', line, 2)
+ begin = int(m[0], 16)
+ end = int(m[1], 16)
+ label = m[2].strip()
+ entry = IomemEntry(begin, end, indent, label)
+ # Before adding entry, search for a parent node using its begin.
+ if indent > 0:
+ parent = find_memory_type(begin)
+ assert parent, f"Expected a parent for indented entry {label}"
+ children[parent].add(entry)
+ iomem[indent].append(entry)
-def print_memory_type():
- print("Event: %s" % (event_name))
- print("%-40s %10s %10s\n" % ("Memory type", "count", "percentage"), end='')
- print("%-40s %10s %10s\n" % ("----------------------------------------",
- "-----------", "-----------"),
- end='');
- total = sum(load_mem_type_cnt.values())
- for mem_type, count in sorted(load_mem_type_cnt.most_common(), \
- key = lambda kv: (kv[1], kv[0]), reverse = True):
- print("%-40s %10d %10.1f%%\n" %
- (mem_type, count, 100 * count / total),
- end='')
+def find_memory_type(phys_addr) -> Optional[IomemEntry]:
+ """Search iomem for the range containing phys_addr with the maximum indent"""
+ for i in range(max_indent, -1, -1):
+ if i not in iomem:
+ continue
+ position = bisect.bisect_right(iomem[i], phys_addr,
+ key=lambda entry: entry.begin)
+ if position == 0:
+ continue
+ iomem_entry = iomem[i][position-1]
+ if iomem_entry.begin <= phys_addr <= iomem_entry.end:
+ return iomem_entry
+ print(f"Didn't find {phys_addr}")
+ return None
-def trace_begin():
- parse_iomem()
+def print_memory_type():
+ print(f"Event: {event_name}")
+ print(f"{'Memory type':<40} {'count':>10} {'percentage':>10}")
+ print(f"{'-' * 40:<40} {'-' * 10:>10} {'-' * 10:>10}")
+ total = sum(load_mem_type_cnt.values())
+ # Add count from children into the parent.
+ for i in range(max_indent, -1, -1):
+ if i not in iomem:
+ continue
+ for entry in iomem[i]:
+ global children
+ for child in children[entry]:
+ if load_mem_type_cnt[child] > 0:
+ load_mem_type_cnt[entry] += load_mem_type_cnt[child]
-def trace_end():
- print_memory_type()
- f.close()
+ def print_entries(entries):
+ """Print counts from parents down to their children"""
+ global children
+ for entry in sorted(entries,
+ key = lambda entry: load_mem_type_cnt[entry],
+ reverse = True):
+ count = load_mem_type_cnt[entry]
+ if count > 0:
+ mem_type = ' ' * entry.indent + f"{entry.begin:x}-{entry.end:x} : {entry.label}"
+ percent = 100 * count / total
+ print(f"{mem_type:<40} {count:>10} {percent:>10.1f}")
+ print_entries(children[entry])
-def is_system_ram(phys_addr):
- #/proc/iomem is sorted
- position = bisect.bisect(system_ram, phys_addr)
- if position % 2 == 0:
- return False
- return True
+ print_entries(iomem[0])
-def is_persistent_mem(phys_addr):
- position = bisect.bisect(pmem, phys_addr)
- if position % 2 == 0:
- return False
- return True
+def trace_begin():
+ parse_iomem()
-def find_memory_type(phys_addr):
- if phys_addr == 0:
- return "N/A"
- if is_system_ram(phys_addr):
- return "System RAM"
+def trace_end():
+ print_memory_type()
- if is_persistent_mem(phys_addr):
- return "Persistent Memory"
+def process_event(param_dict):
+ if "sample" not in param_dict:
+ return
- #slow path, search all
- f.seek(0, 0)
- for j in f:
- m = re.split('-|:',j,2)
- if int(m[0], 16) <= phys_addr <= int(m[1], 16):
- return m[2]
- return "N/A"
+ sample = param_dict["sample"]
+ if "phys_addr" not in sample:
+ return
-def process_event(param_dict):
- name = param_dict["ev_name"]
- sample = param_dict["sample"]
- phys_addr = sample["phys_addr"]
+ phys_addr = sample["phys_addr"]
+ entry = find_memory_type(phys_addr)
+ if entry:
+ load_mem_type_cnt[entry] += 1
- global event_name
- if event_name == None:
- event_name = name
- load_mem_type_cnt[find_memory_type(phys_addr)] += 1
+ global event_name
+ if event_name is None:
+ event_name = param_dict["ev_name"]
diff --git a/tools/perf/scripts/python/net_dropmonitor.py b/tools/perf/scripts/python/net_dropmonitor.py
index 101059971738..a97e7a6e0940 100755
--- a/tools/perf/scripts/python/net_dropmonitor.py
+++ b/tools/perf/scripts/python/net_dropmonitor.py
@@ -68,9 +68,9 @@ def trace_end():
get_kallsyms_table()
print_drop_table()
-# called from perf, when it finds a correspoinding event
+# called from perf, when it finds a corresponding event
def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, callchain,
- skbaddr, location, protocol):
+ skbaddr, location, protocol, reason):
slocation = str(location)
try:
drop_log[slocation] = drop_log[slocation] + 1
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
index ea0c8b90a783..30c4bccee5b2 100644
--- a/tools/perf/scripts/python/netdev-times.py
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -288,12 +288,13 @@ def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm, callchain,
all_event_list.append(event_info)
def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, callchain,
- skbaddr, protocol, location):
+ skbaddr, location, protocol, reason):
event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
- skbaddr, protocol, location)
+ skbaddr, location, protocol, reason)
all_event_list.append(event_info)
-def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, callchain, skbaddr):
+def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, callchain,
+ skbaddr, location):
event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
skbaddr)
all_event_list.append(event_info)
@@ -356,7 +357,7 @@ def handle_irq_softirq_exit(event_info):
return
rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
'irq_list':irq_list, 'event_list':event_list}
- # merge information realted to a NET_RX softirq
+ # merge information related to a NET_RX softirq
receive_hunk_list.append(rec_data)
def handle_napi_poll(event_info):
@@ -430,7 +431,7 @@ def handle_net_dev_xmit(event_info):
def handle_kfree_skb(event_info):
(name, context, cpu, time, pid, comm,
- skbaddr, protocol, location) = event_info
+ skbaddr, location, protocol, reason) = event_info
for i in range(len(tx_queue_list)):
skb = tx_queue_list[i]
if skb['skbaddr'] == skbaddr:
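
Both handler updates above follow from the same rule: perf script passes a tracepoint's fields to the handler as positional arguments in the order the tracepoint declares them, so when skb:kfree_skb gained a 'reason' field every handler signature had to grow to match. A minimal standalone handler of the same shape (illustrative; the body is made up):

    def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, callchain,
                       skbaddr, location, protocol, reason):
        print("%s: drop at %#x proto=%d reason=%d"
              % (comm, location, protocol, reason))
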
diff --git a/tools/perf/scripts/python/parallel-perf.py b/tools/perf/scripts/python/parallel-perf.py
new file mode 100755
index 000000000000..be85fd7f6632
--- /dev/null
+++ b/tools/perf/scripts/python/parallel-perf.py
@@ -0,0 +1,989 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a perf script command multiple times in parallel, using perf script
+# options --cpu and --time so that each job processes a different chunk
+# of the data.
+#
+# Copyright (c) 2024, Intel Corporation.
+
+import subprocess
+import argparse
+import pathlib
+import shlex
+import time
+import copy
+import sys
+import os
+import re
+
+glb_prog_name = "parallel-perf.py"
+glb_min_interval = 10.0
+glb_min_samples = 64
+
+class Verbosity():
+
+ def __init__(self, quiet=False, verbose=False, debug=False):
+ self.normal = True
+ self.verbose = verbose
+ self.debug = debug
+ self.self_test = True
+ if self.debug:
+ self.verbose = True
+ if self.verbose:
+ quiet = False
+ if quiet:
+ self.normal = False
+
+# Manage work (Start/Wait/Kill), as represented by a subprocess.Popen command
+class Work():
+
+ def __init__(self, cmd, pipe_to, output_dir="."):
+ self.popen = None
+ self.consumer = None
+ self.cmd = cmd
+ self.pipe_to = pipe_to
+ self.output_dir = output_dir
+ self.cmdout_name = f"{output_dir}/cmd.txt"
+ self.stdout_name = f"{output_dir}/out.txt"
+ self.stderr_name = f"{output_dir}/err.txt"
+
+ def Command(self):
+ sh_cmd = [ shlex.quote(x) for x in self.cmd ]
+ return " ".join(self.cmd)
+
+ def Stdout(self):
+ return open(self.stdout_name, "w")
+
+ def Stderr(self):
+ return open(self.stderr_name, "w")
+
+ def CreateOutputDir(self):
+ pathlib.Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+ def Start(self):
+ if self.popen:
+ return
+ self.CreateOutputDir()
+ with open(self.cmdout_name, "w") as f:
+ f.write(self.Command())
+ f.write("\n")
+ stdout = self.Stdout()
+ stderr = self.Stderr()
+ if self.pipe_to:
+ self.popen = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=stderr)
+ args = shlex.split(self.pipe_to)
+ self.consumer = subprocess.Popen(args, stdin=self.popen.stdout, stdout=stdout, stderr=stderr)
+ else:
+ self.popen = subprocess.Popen(self.cmd, stdout=stdout, stderr=stderr)
+
+ def RemoveEmptyErrFile(self):
+ if os.path.exists(self.stderr_name):
+ if os.path.getsize(self.stderr_name) == 0:
+ os.unlink(self.stderr_name)
+
+ def Errors(self):
+ if os.path.exists(self.stderr_name):
+ if os.path.getsize(self.stderr_name) != 0:
+ return [ f"Non-empty error file {self.stderr_name}" ]
+ return []
+
+ def TidyUp(self):
+ self.RemoveEmptyErrFile()
+
+ def RawPollWait(self, p, wait):
+ if wait:
+ return p.wait()
+ return p.poll()
+
+ def Poll(self, wait=False):
+ if not self.popen:
+ return None
+ result = self.RawPollWait(self.popen, wait)
+ if self.consumer:
+ res = result
+ result = self.RawPollWait(self.consumer, wait)
+ if result != None and res == None:
+ self.popen.kill()
+ result = None
+ elif result == 0 and res != None and res != 0:
+ result = res
+ if result != None:
+ self.TidyUp()
+ return result
+
+ def Wait(self):
+ return self.Poll(wait=True)
+
+ def Kill(self):
+ if not self.popen:
+ return
+ self.popen.kill()
+ if self.consumer:
+ self.consumer.kill()
+
+def KillWork(worklist, verbosity):
+ for w in worklist:
+ w.Kill()
+ for w in worklist:
+ w.Wait()
+
+def NumberOfCPUs():
+ return os.sysconf("SC_NPROCESSORS_ONLN")
+
+def NanoSecsToSecsStr(x):
+ if x == None:
+ return ""
+ x = str(x)
+ if len(x) < 10:
+ x = "0" * (10 - len(x)) + x
+ return x[:len(x) - 9] + "." + x[-9:]
+
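+# Example (illustrative): NanoSecsToSecsStr(123500000000) -> "123.500000000",
+# the seconds.nanoseconds form used below to build perf script --time options.
+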
+def InsertOptionAfter(cmd, option, after):
+ try:
+ pos = cmd.index(after)
+ cmd.insert(pos + 1, option)
+ except:
+ cmd.append(option)
+
+def CreateWorkList(cmd, pipe_to, output_dir, cpus, time_ranges_by_cpu):
+ max_len = len(str(cpus[-1]))
+ cpu_dir_fmt = f"cpu-%.{max_len}u"
+ worklist = []
+ pos = 0
+ for cpu in cpus:
+ if cpu >= 0:
+ cpu_dir = os.path.join(output_dir, cpu_dir_fmt % cpu)
+ cpu_option = f"--cpu={cpu}"
+ else:
+ cpu_dir = output_dir
+ cpu_option = None
+
+ tr_dir_fmt = "time-range"
+
+ if len(time_ranges_by_cpu) > 1:
+ time_ranges = time_ranges_by_cpu[pos]
+ tr_dir_fmt += f"-{pos}"
+ pos += 1
+ else:
+ time_ranges = time_ranges_by_cpu[0]
+
+ max_len = len(str(len(time_ranges)))
+ tr_dir_fmt += f"-%.{max_len}u"
+
+ i = 0
+ for r in time_ranges:
+ if r == [None, None]:
+ time_option = None
+ work_output_dir = cpu_dir
+ else:
+ time_option = "--time=" + NanoSecsToSecsStr(r[0]) + "," + NanoSecsToSecsStr(r[1])
+ work_output_dir = os.path.join(cpu_dir, tr_dir_fmt % i)
+ i += 1
+ work_cmd = list(cmd)
+ if time_option != None:
+ InsertOptionAfter(work_cmd, time_option, "script")
+ if cpu_option != None:
+ InsertOptionAfter(work_cmd, cpu_option, "script")
+ w = Work(work_cmd, pipe_to, work_output_dir)
+ worklist.append(w)
+ return worklist
+
+def DoRunWork(worklist, nr_jobs, verbosity):
+ nr_to_do = len(worklist)
+ not_started = list(worklist)
+ running = []
+ done = []
+ chg = False
+ while True:
+ nr_done = len(done)
+ if chg and verbosity.normal:
+ nr_run = len(running)
+ print(f"\rThere are {nr_to_do} jobs: {nr_done} completed, {nr_run} running", flush=True, end=" ")
+ if verbosity.verbose:
+ print()
+ chg = False
+ if nr_done == nr_to_do:
+ break
+ while len(running) < nr_jobs and len(not_started):
+ w = not_started.pop(0)
+ running.append(w)
+ if verbosity.verbose:
+ print("Starting:", w.Command())
+ w.Start()
+ chg = True
+ if len(running):
+ time.sleep(0.1)
+ finished = []
+ not_finished = []
+ while len(running):
+ w = running.pop(0)
+ r = w.Poll()
+ if r == None:
+ not_finished.append(w)
+ continue
+ if r == 0:
+ if verbosity.verbose:
+ print("Finished:", w.Command())
+ finished.append(w)
+ chg = True
+ continue
+ if verbosity.normal and not verbosity.verbose:
+ print()
+ print("Job failed!\n return code:", r, "\n command: ", w.Command())
+ if w.pipe_to:
+ print(" piped to: ", w.pipe_to)
+ print("Killing outstanding jobs")
+ KillWork(not_finished, verbosity)
+ KillWork(running, verbosity)
+ return False
+ running = not_finished
+ done += finished
+ errorlist = []
+ for w in worklist:
+ errorlist += w.Errors()
+ if len(errorlist):
+ print("Errors:")
+ for e in errorlist:
+ print(e)
+ elif verbosity.normal:
+ print("\r"," "*50, "\rAll jobs finished successfully", flush=True)
+ return True
+
+def RunWork(worklist, nr_jobs=NumberOfCPUs(), verbosity=Verbosity()):
+ try:
+ return DoRunWork(worklist, nr_jobs, verbosity)
+ except:
+ for w in worklist:
+ w.Kill()
+ raise
+
+def ReadHeader(perf, file_name):
+ return subprocess.Popen([perf, "script", "--header-only", "--input", file_name], stdout=subprocess.PIPE).stdout.read().decode("utf-8")
+
+def ParseHeader(hdr):
+ result = {}
+ lines = hdr.split("\n")
+ for line in lines:
+ if ":" in line and line[0] == "#":
+ pos = line.index(":")
+ name = line[1:pos-1].strip()
+ value = line[pos+1:].strip()
+ if name in result:
+ orig_name = name
+ nr = 2
+ while True:
+ name = f"{orig_name} {nr}"
+ if name not in result:
+ break
+ nr += 1
+ result[name] = value
+ return result
+
+def HeaderField(hdr_dict, hdr_fld):
+ if hdr_fld not in hdr_dict:
+ raise Exception(f"'{hdr_fld}' missing from header information")
+ return hdr_dict[hdr_fld]
+
+# Represent the position of an option within a command string
+# and provide the option value and/or remove the option
+class OptPos():
+
+ def Init(self, opt_element=-1, value_element=-1, opt_pos=-1, value_pos=-1, error=None):
+ self.opt_element = opt_element # list element that contains option
+ self.value_element = value_element # list element that contains option value
+ self.opt_pos = opt_pos # string position of option
+ self.value_pos = value_pos # string position of value
+ self.error = error # error message string
+
+ def __init__(self, args, short_name, long_name, default=None):
+ self.args = list(args)
+ self.default = default
+ n = 2 + len(long_name)
+ m = len(short_name)
+ pos = -1
+ for opt in args:
+ pos += 1
+ if m and opt[:2] == f"-{short_name}":
+ if len(opt) == 2:
+ if pos + 1 < len(args):
+ self.Init(pos, pos + 1, 0, 0)
+ else:
+ self.Init(error = f"-{short_name} option missing value")
+ else:
+ self.Init(pos, pos, 0, 2)
+ return
+ if opt[:n] == f"--{long_name}":
+ if len(opt) == n:
+ if pos + 1 < len(args):
+ self.Init(pos, pos + 1, 0, 0)
+ else:
+ self.Init(error = f"--{long_name} option missing value")
+ elif opt[n] == "=":
+ self.Init(pos, pos, 0, n + 1)
+ else:
+ self.Init(error = f"--{long_name} option expected '='")
+ return
+ if m and opt[:1] == "-" and opt[:2] != "--" and short_name in opt:
+ ipos = opt.index(short_name)
+ if "-" in opt[1:]:
+ hpos = opt[1:].index("-")
+ if hpos < ipos:
+ continue
+ if ipos + 1 == len(opt):
+ if pos + 1 < len(args):
+ self.Init(pos, pos + 1, ipos, 0)
+ else:
+ self.Init(error = f"-{short_name} option missing value")
+ else:
+ self.Init(pos, pos, ipos, ipos + 1)
+ return
+ self.Init()
+
+ def Value(self):
+ if self.opt_element >= 0:
+ if self.opt_element != self.value_element:
+ return self.args[self.value_element]
+ else:
+ return self.args[self.value_element][self.value_pos:]
+ return self.default
+
+ def Remove(self, args):
+ if self.opt_element == -1:
+ return
+ if self.opt_element != self.value_element:
+ del args[self.value_element]
+ if self.opt_pos:
+ args[self.opt_element] = args[self.opt_element][:self.opt_pos]
+ else:
+ del args[self.opt_element]
+
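+# Example (illustrative) of reading and stripping an option with OptPos:
+#   p = OptPos(["perf", "script", "-i", "perf.data"], "i", "input")
+#   p.Value()      # -> "perf.data"
+#   args = ["perf", "script", "-i", "perf.data"]
+#   p.Remove(args) # args becomes ["perf", "script"]
+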
+def DetermineInputFileName(cmd):
+ p = OptPos(cmd, "i", "input", "perf.data")
+ if p.error:
+ raise Exception(f"perf command {p.error}")
+ file_name = p.Value()
+ if not os.path.exists(file_name):
+ raise Exception(f"perf command input file '{file_name}' not found")
+ return file_name
+
+def ReadOption(args, short_name, long_name, err_prefix, remove=False):
+ p = OptPos(args, short_name, long_name)
+ if p.error:
+ raise Exception(f"{err_prefix}{p.error}")
+ value = p.Value()
+ if remove:
+ p.Remove(args)
+ return value
+
+def ExtractOption(args, short_name, long_name, err_prefix):
+ return ReadOption(args, short_name, long_name, err_prefix, True)
+
+def ReadPerfOption(args, short_name, long_name):
+ return ReadOption(args, short_name, long_name, "perf command ")
+
+def ExtractPerfOption(args, short_name, long_name):
+ return ExtractOption(args, short_name, long_name, "perf command ")
+
+def PerfDoubleQuickCommands(cmd, file_name):
+ cpu_str = ReadPerfOption(cmd, "C", "cpu")
+ time_str = ReadPerfOption(cmd, "", "time")
+ # Use double-quick sampling to determine trace data density
+ times_cmd = ["perf", "script", "--ns", "--input", file_name, "--itrace=qqi"]
+ if cpu_str != None and cpu_str != "":
+ times_cmd.append(f"--cpu={cpu_str}")
+ if time_str != None and time_str != "":
+ times_cmd.append(f"--time={time_str}")
+ cnts_cmd = list(times_cmd)
+ cnts_cmd.append("-Fcpu")
+ times_cmd.append("-Fcpu,time")
+ return cnts_cmd, times_cmd
+
+class CPUTimeRange():
+ def __init__(self, cpu):
+ self.cpu = cpu
+ self.sample_cnt = 0
+ self.time_ranges = None
+ self.interval = 0
+ self.interval_remaining = 0
+ self.remaining = 0
+ self.tr_pos = 0
+
+def CalcTimeRangesByCPU(line, cpu, cpu_time_ranges, max_time):
+ cpu_time_range = cpu_time_ranges[cpu]
+ cpu_time_range.remaining -= 1
+ cpu_time_range.interval_remaining -= 1
+ if cpu_time_range.remaining == 0:
+ cpu_time_range.time_ranges[cpu_time_range.tr_pos][1] = max_time
+ return
+ if cpu_time_range.interval_remaining == 0:
+ time = TimeVal(line[1][:-1], 0)
+ time_ranges = cpu_time_range.time_ranges
+ time_ranges[cpu_time_range.tr_pos][1] = time - 1
+ time_ranges.append([time, max_time])
+ cpu_time_range.tr_pos += 1
+ cpu_time_range.interval_remaining = cpu_time_range.interval
+
+def CountSamplesByCPU(line, cpu, cpu_time_ranges):
+ try:
+ cpu_time_ranges[cpu].sample_cnt += 1
+ except:
+ print("exception")
+ print("cpu", cpu)
+ print("len(cpu_time_ranges)", len(cpu_time_ranges))
+ raise
+
+def ProcessCommandOutputLines(cmd, per_cpu, fn, *x):
+ # Assume CPU number is at beginning of line and enclosed by []
+ pat = re.compile(r"\s*\[[0-9]+\]")
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ while True:
+ line = p.stdout.readline()
+ if line:
+ line = line.decode("utf-8")
+ if pat.match(line):
+ line = line.split()
+ if per_cpu:
+ # Assumes CPU number is enclosed by []
+ cpu = int(line[0][1:-1])
+ else:
+ cpu = 0
+ fn(line, cpu, *x)
+ else:
+ break
+ p.wait()
+
+def IntersectTimeRanges(new_time_ranges, time_ranges):
+ pos = 0
+ new_pos = 0
+ # Can assume len(time_ranges) != 0 and len(new_time_ranges) != 0
+ # Note also, there *must* be at least one intersection.
+ while pos < len(time_ranges) and new_pos < len(new_time_ranges):
+ # new end < old start => no intersection, remove new
+ if new_time_ranges[new_pos][1] < time_ranges[pos][0]:
+ del new_time_ranges[new_pos]
+ continue
+ # new start > old end => no intersection, check next
+ if new_time_ranges[new_pos][0] > time_ranges[pos][1]:
+ pos += 1
+ if pos < len(time_ranges):
+ continue
+ # no next, so remove remaining
+ while new_pos < len(new_time_ranges):
+ del new_time_ranges[new_pos]
+ return
+ # Found an intersection
+ # new start < old start => adjust new start = old start
+ if new_time_ranges[new_pos][0] < time_ranges[pos][0]:
+ new_time_ranges[new_pos][0] = time_ranges[pos][0]
+ # new end > old end => keep the overlap, insert the remainder
+ if new_time_ranges[new_pos][1] > time_ranges[pos][1]:
+ r = [ time_ranges[pos][1] + 1, new_time_ranges[new_pos][1] ]
+ new_time_ranges[new_pos][1] = time_ranges[pos][1]
+ new_pos += 1
+ new_time_ranges.insert(new_pos, r)
+ continue
+ # new [start, end] is within old [start, end]
+ new_pos += 1
+
+def SplitTimeRangesByTraceDataDensity(time_ranges, cpus, nr, cmd, file_name, per_cpu, min_size, min_interval, verbosity):
+ if verbosity.normal:
+ print("\rAnalyzing...", flush=True, end=" ")
+ if verbosity.verbose:
+ print()
+ cnts_cmd, times_cmd = PerfDoubleQuickCommands(cmd, file_name)
+
+ if per_cpu:
+ nr_cpus = cpus[-1] + 1
+ cpu_time_ranges = [ CPUTimeRange(cpu) for cpu in range(nr_cpus) ]
+ else:
+ nr_cpus = 1
+ cpu_time_ranges = [ CPUTimeRange(-1) ]
+
+ if verbosity.debug:
+ print("nr_cpus", nr_cpus)
+ print("cnts_cmd", cnts_cmd)
+ print("times_cmd", times_cmd)
+
+ # Count the number of "double quick" samples per CPU
+ ProcessCommandOutputLines(cnts_cmd, per_cpu, CountSamplesByCPU, cpu_time_ranges)
+
+ tot = 0
+ mx = 0
+ for cpu_time_range in cpu_time_ranges:
+ cnt = cpu_time_range.sample_cnt
+ tot += cnt
+ if cnt > mx:
+ mx = cnt
+ if verbosity.debug:
+ print("cpu:", cpu_time_range.cpu, "sample_cnt", cnt)
+
+ if min_size < 1:
+ min_size = 1
+
+ if mx < min_size:
+ # Too little data to be worth splitting
+ if verbosity.debug:
+ print("Too little data to split by time")
+ if nr == 0:
+ nr = 1
+ return [ SplitTimeRangesIntoN(time_ranges, nr, min_interval) ]
+
+ if nr:
+ divisor = nr
+ min_size = 1
+ else:
+ divisor = NumberOfCPUs()
+
+ interval = int(round(tot / divisor, 0))
+ if interval < min_size:
+ interval = min_size
+
+ if verbosity.debug:
+ print("divisor", divisor)
+ print("min_size", min_size)
+ print("interval", interval)
+
+ min_time = time_ranges[0][0]
+ max_time = time_ranges[-1][1]
+
+ for cpu_time_range in cpu_time_ranges:
+ cnt = cpu_time_range.sample_cnt
+ if cnt == 0:
+ cpu_time_range.time_ranges = copy.deepcopy(time_ranges)
+ continue
+ # Adjust target interval for CPU to give approximately equal interval sizes
+ # Determine number of intervals, rounding to nearest integer
+ n = int(round(cnt / interval, 0))
+ if n < 1:
+ n = 1
+ # Determine interval size, rounding up
+ d, m = divmod(cnt, n)
+ if m:
+ d += 1
+ cpu_time_range.interval = d
+ cpu_time_range.interval_remaining = d
+ cpu_time_range.remaining = cnt
+ # Init. time ranges for each CPU with the start time
+ cpu_time_range.time_ranges = [ [min_time, max_time] ]
+
+ # Set time ranges so that the same number of "double quick" samples
+ # will fall into each time range.
+ ProcessCommandOutputLines(times_cmd, per_cpu, CalcTimeRangesByCPU, cpu_time_ranges, max_time)
+
+ for cpu_time_range in cpu_time_ranges:
+ if cpu_time_range.sample_cnt:
+ IntersectTimeRanges(cpu_time_range.time_ranges, time_ranges)
+
+ return [cpu_time_ranges[cpu].time_ranges for cpu in cpus]
+
+def SplitSingleTimeRangeIntoN(time_range, n):
+ if n <= 1:
+ return [time_range]
+ start = time_range[0]
+ end = time_range[1]
+ duration = int((end - start + 1) / n)
+ if duration < 1:
+ return [time_range]
+ time_ranges = []
+ for i in range(n):
+ time_ranges.append([start, start + duration - 1])
+ start += duration
+ time_ranges[-1][1] = end
+ return time_ranges
+
+def TimeRangeDuration(r):
+ return r[1] - r[0] + 1
+
+def TotalDuration(time_ranges):
+ duration = 0
+ for r in time_ranges:
+ duration += TimeRangeDuration(r)
+ return duration
+
+def SplitTimeRangesByInterval(time_ranges, interval):
+ new_ranges = []
+ for r in time_ranges:
+ duration = TimeRangeDuration(r)
+ n = duration / interval
+ n = int(round(n, 0))
+ new_ranges += SplitSingleTimeRangeIntoN(r, n)
+ return new_ranges
+
+def SplitTimeRangesIntoN(time_ranges, n, min_interval):
+ if n <= len(time_ranges):
+ return time_ranges
+ duration = TotalDuration(time_ranges)
+ interval = duration / n
+ if interval < min_interval:
+ interval = min_interval
+ return SplitTimeRangesByInterval(time_ranges, interval)
+
+def RecombineTimeRanges(tr):
+ new_tr = copy.deepcopy(tr)
+ n = len(new_tr)
+ i = 1
+ while i < len(new_tr):
+ # if prev end + 1 == cur start, combine them
+ if new_tr[i - 1][1] + 1 == new_tr[i][0]:
+ new_tr[i][0] = new_tr[i - 1][0]
+ del new_tr[i - 1]
+ else:
+ i += 1
+ return new_tr
+
+def OpenTimeRangeEnds(time_ranges, min_time, max_time):
+ if time_ranges[0][0] <= min_time:
+ time_ranges[0][0] = None
+ if time_ranges[-1][1] >= max_time:
+ time_ranges[-1][1] = None
+
+def BadTimeStr(time_str):
+ raise Exception(f"perf command bad time option: '{time_str}'\nCheck also 'time of first sample' and 'time of last sample' in perf script --header-only")
+
+def ValidateTimeRanges(time_ranges, time_str):
+ n = len(time_ranges)
+ for i in range(n):
+ start = time_ranges[i][0]
+ end = time_ranges[i][1]
+ if i != 0 and start <= time_ranges[i - 1][1]:
+ BadTimeStr(time_str)
+ if start > end:
+ BadTimeStr(time_str)
+
+def TimeVal(s, dflt):
+ s = s.strip()
+ if s == "":
+ return dflt
+ a = s.split(".")
+ if len(a) > 2:
+ raise Exception(f"Bad time value'{s}'")
+ x = int(a[0])
+ if x < 0:
+ raise Exception("Negative time not allowed")
+ x *= 1000000000
+ if len(a) > 1:
+ x += int((a[1] + "000000000")[:9])
+ return x
+
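+# Examples (illustrative): TimeVal("123.5", 0) -> 123500000000 (nanoseconds),
+# TimeVal("", 7) -> 7 (the supplied default).
+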
+def BadCPUStr(cpu_str):
+ raise Exception(f"perf command bad cpu option: '{cpu_str}'\nCheck also 'nrcpus avail' in perf script --header-only")
+
+def ParseTimeStr(time_str, min_time, max_time):
+ if time_str == None or time_str == "":
+ return [[min_time, max_time]]
+ time_ranges = []
+ for r in time_str.split():
+ a = r.split(",")
+ if len(a) != 2:
+ BadTimeStr(time_str)
+ try:
+ start = TimeVal(a[0], min_time)
+ end = TimeVal(a[1], max_time)
+ except:
+ BadTimeStr(time_str)
+ time_ranges.append([start, end])
+ ValidateTimeRanges(time_ranges, time_str)
+ return time_ranges
+
+def ParseCPUStr(cpu_str, nr_cpus):
+ if cpu_str == None or cpu_str == "":
+ return [-1]
+ cpus = []
+ for r in cpu_str.split(","):
+ a = r.split("-")
+ if len(a) < 1 or len(a) > 2:
+ BadCPUStr(cpu_str)
+ try:
+ start = int(a[0].strip())
+ if len(a) > 1:
+ end = int(a[1].strip())
+ else:
+ end = start
+ except:
+ BadCPUStr(cpu_str)
+ if start < 0 or end < 0 or end < start or end >= nr_cpus:
+ BadCPUStr(cpu_str)
+ cpus.extend(range(start, end + 1))
+ cpus = list(set(cpus)) # Remove duplicates
+ cpus.sort()
+ return cpus
+
+class ParallelPerf():
+
+ def __init__(self, a):
+ for arg_name in vars(a):
+ setattr(self, arg_name, getattr(a, arg_name))
+ self.orig_nr = self.nr
+ self.orig_cmd = list(self.cmd)
+ self.perf = self.cmd[0]
+ if os.path.exists(self.output_dir):
+ raise Exception(f"Output '{self.output_dir}' already exists")
+ if self.jobs < 0 or self.nr < 0 or self.interval < 0:
+ raise Exception("Bad options (negative values): try -h option for help")
+ if self.nr != 0 and self.interval != 0:
+ raise Exception("Cannot specify number of time subdivisions and time interval")
+ if self.jobs == 0:
+ self.jobs = NumberOfCPUs()
+ if self.nr == 0 and self.interval == 0:
+ if self.per_cpu:
+ self.nr = 1
+ else:
+ self.nr = self.jobs
+
+ def Init(self):
+ if self.verbosity.debug:
+ print("cmd", self.cmd)
+ self.file_name = DetermineInputFileName(self.cmd)
+ self.hdr = ReadHeader(self.perf, self.file_name)
+ self.hdr_dict = ParseHeader(self.hdr)
+ self.cmd_line = HeaderField(self.hdr_dict, "cmdline")
+
+ def ExtractTimeInfo(self):
+ self.min_time = TimeVal(HeaderField(self.hdr_dict, "time of first sample"), 0)
+ self.max_time = TimeVal(HeaderField(self.hdr_dict, "time of last sample"), 0)
+ self.time_str = ExtractPerfOption(self.cmd, "", "time")
+ self.time_ranges = ParseTimeStr(self.time_str, self.min_time, self.max_time)
+ if self.verbosity.debug:
+ print("time_ranges", self.time_ranges)
+
+ def ExtractCPUInfo(self):
+ if self.per_cpu:
+ nr_cpus = int(HeaderField(self.hdr_dict, "nrcpus avail"))
+ self.cpu_str = ExtractPerfOption(self.cmd, "C", "cpu")
+ if self.cpu_str is None or self.cpu_str == "":
+ self.cpus = list(range(nr_cpus))
+ else:
+ self.cpus = ParseCPUStr(self.cpu_str, nr_cpus)
+ else:
+ self.cpu_str = None
+ self.cpus = [-1]
+ if self.verbosity.debug:
+ print("cpus", self.cpus)
+
+ def IsIntelPT(self):
+ return "intel_pt" in self.cmd_line
+
+ def SplitTimeRanges(self):
+ if self.IsIntelPT() and self.interval == 0:
+ self.split_time_ranges_for_each_cpu = \
+ SplitTimeRangesByTraceDataDensity(self.time_ranges, self.cpus, self.orig_nr,
+ self.orig_cmd, self.file_name, self.per_cpu,
+ self.min_size, self.min_interval, self.verbosity)
+ elif self.nr:
+ self.split_time_ranges_for_each_cpu = [ SplitTimeRangesIntoN(self.time_ranges, self.nr, self.min_interval) ]
+ else:
+ self.split_time_ranges_for_each_cpu = [ SplitTimeRangesByInterval(self.time_ranges, self.interval) ]
+
+ def CheckTimeRanges(self):
+ for tr in self.split_time_ranges_for_each_cpu:
+ # Re-combined time ranges should be the same
+ new_tr = RecombineTimeRanges(tr)
+ if new_tr != self.time_ranges:
+ if self.verbosity.debug:
+ print("tr", tr)
+ print("new_tr", new_tr)
+ raise Exception("Self test failed!")
+
+ def OpenTimeRangeEnds(self):
+ for time_ranges in self.split_time_ranges_for_each_cpu:
+ OpenTimeRangeEnds(time_ranges, self.min_time, self.max_time)
+
+ def CreateWorkList(self):
+ self.worklist = CreateWorkList(self.cmd, self.pipe_to, self.output_dir, self.cpus, self.split_time_ranges_for_each_cpu)
+
+ def PerfDataRecordedPerCPU(self):
+ if "--per-thread" in self.cmd_line.split():
+ return False
+ return True
+
+ def DefaultToPerCPU(self):
+ # --no-per-cpu option takes precedence
+ if self.no_per_cpu:
+ return False
+ if not self.PerfDataRecordedPerCPU():
+ return False
+ # Default to per-cpu for Intel PT data that was recorded per-cpu,
+ # because decoding can be done for each CPU separately.
+ if self.IsIntelPT():
+ return True
+ return False
+
+ def Config(self):
+ self.Init()
+ self.ExtractTimeInfo()
+ if not self.per_cpu:
+ self.per_cpu = self.DefaultToPerCPU()
+ if self.verbosity.debug:
+ print("per_cpu", self.per_cpu)
+ self.ExtractCPUInfo()
+ self.SplitTimeRanges()
+ if self.verbosity.self_test:
+ self.CheckTimeRanges()
+ # Prefer open-ended time ranges to ranges that explicitly start at min_time or end at max_time
+ self.OpenTimeRangeEnds()
+ self.CreateWorkList()
+
+ def Run(self):
+ if self.dry_run:
+ print(len(self.worklist),"jobs:")
+ for w in self.worklist:
+ print(w.Command())
+ return True
+ result = RunWork(self.worklist, self.jobs, verbosity=self.verbosity)
+ if self.verbosity.verbose:
+ print(glb_prog_name, "done")
+ return result
+
+def RunParallelPerf(a):
+ pp = ParallelPerf(a)
+ pp.Config()
+ return pp.Run()
+
+def Main(args):
+ ap = argparse.ArgumentParser(
+ prog=glb_prog_name, formatter_class = argparse.RawDescriptionHelpFormatter,
+ description =
+"""
+Run a perf script command multiple times in parallel, using perf script options
+--cpu and --time so that each job processes a different chunk of the data.
+""",
+ epilog =
+"""
+Follow the options with '--' and then the perf script command, e.g.
+
+ $ perf record -a -- sleep 10
+ $ parallel-perf.py --nr=4 -- perf script --ns
+ All jobs finished successfully
+ $ tree parallel-perf-output/
+ parallel-perf-output/
+ ├── time-range-0
+ │   ├── cmd.txt
+ │   └── out.txt
+ ├── time-range-1
+ │   ├── cmd.txt
+ │   └── out.txt
+ ├── time-range-2
+ │   ├── cmd.txt
+ │   └── out.txt
+ └── time-range-3
+ ├── cmd.txt
+ └── out.txt
+ $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H .
+ parallel-perf-output/time-range-0/cmd.txt:perf script --time=,9466.504461499 --ns
+ parallel-perf-output/time-range-1/cmd.txt:perf script --time=9466.504461500,9469.005396999 --ns
+ parallel-perf-output/time-range-2/cmd.txt:perf script --time=9469.005397000,9471.506332499 --ns
+ parallel-perf-output/time-range-3/cmd.txt:perf script --time=9471.506332500, --ns
+
+Any perf script command can be used, including the use of perf script options
+--dlfilter and --script, so that the benefit of running parallel jobs
+naturally extends to them also.
+
+If option --pipe-to is used, standard output is first piped through that
+command. Beware, if the command fails (e.g. grep with no matches), it will be
+considered a fatal error.
+
+Final standard output is redirected to files named out.txt in separate
+subdirectories under the output directory. Similarly, standard error is
+written to files named err.txt. In addition, files named cmd.txt contain the
+corresponding perf script command. After processing, err.txt files are removed
+if they are empty.
+
+If any job exits with a non-zero exit code, then all jobs are killed and no
+more are started. A message is printed if any job results in a non-empty
+err.txt file.
+
+There is a separate output subdirectory for each time range. If the --per-cpu
+option is used, these are further grouped under cpu-n subdirectories, e.g.
+
+ $ parallel-perf.py --per-cpu --nr=2 -- perf script --ns --cpu=0,1
+ All jobs finished successfully
+ $ tree parallel-perf-output
+ parallel-perf-output/
+ ├── cpu-0
+ │   ├── time-range-0
+ │   │   ├── cmd.txt
+ │   │   └── out.txt
+ │   └── time-range-1
+ │   ├── cmd.txt
+ │   └── out.txt
+ └── cpu-1
+ ├── time-range-0
+ │   ├── cmd.txt
+ │   └── out.txt
+ └── time-range-1
+ ├── cmd.txt
+ └── out.txt
+ $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H .
+ parallel-perf-output/cpu-0/time-range-0/cmd.txt:perf script --cpu=0 --time=,9469.005396999 --ns
+ parallel-perf-output/cpu-0/time-range-1/cmd.txt:perf script --cpu=0 --time=9469.005397000, --ns
+ parallel-perf-output/cpu-1/time-range-0/cmd.txt:perf script --cpu=1 --time=,9469.005396999 --ns
+ parallel-perf-output/cpu-1/time-range-1/cmd.txt:perf script --cpu=1 --time=9469.005397000, --ns
+
+Subdivisions of time range, and cpus if the --per-cpu option is used, are
+expressed by the --time and --cpu perf script options respectively. If the
+supplied perf script command has a --time option, then that time range is
+subdivided, otherwise the time range given by 'time of first sample' to
+'time of last sample' is used (refer to perf script --header-only). Similarly, the
+supplied perf script command may provide a --cpu option, and only those CPUs
+will be processed.
+
+To prevent time intervals from becoming too small, the --min-interval option can
+be used.
+
+Note there is special handling for processing Intel PT traces. If an interval is
+not specified and the perf record command contained the intel_pt event, then the
+time range will be subdivided in order to produce subdivisions that contain
+approximately the same amount of trace data. That is accomplished by counting
+double-quick (--itrace=qqi) samples, and choosing time ranges that encompass
+approximately the same number of samples. In that case, time ranges may not be
+the same for each CPU processed. For Intel PT, --per-cpu is the default, but
+that can be overridden by --no-per-cpu. Note, for Intel PT, double-quick
+decoding produces 1 sample for each PSB synchronization packet; PSB packets in
+turn come after a certain number of bytes of output, determined by psb_period
+(refer to the perf Intel PT documentation). The minimum number of double-quick samples that
+will define a time range can be set by the --min_size option, which defaults to
+64.
+""")
+ ap.add_argument("-o", "--output-dir", default="parallel-perf-output", help="output directory (default 'parallel-perf-output')")
+ ap.add_argument("-j", "--jobs", type=int, default=0, help="maximum number of jobs to run in parallel at one time (default is the number of CPUs)")
+ ap.add_argument("-n", "--nr", type=int, default=0, help="number of time subdivisions (default is the number of jobs)")
+ ap.add_argument("-i", "--interval", type=float, default=0, help="subdivide the time range using this time interval (in seconds e.g. 0.1 for a tenth of a second)")
+ ap.add_argument("-c", "--per-cpu", action="store_true", help="process data for each CPU in parallel")
+ ap.add_argument("-m", "--min-interval", type=float, default=glb_min_interval, help=f"minimum interval (default {glb_min_interval} seconds)")
+ ap.add_argument("-p", "--pipe-to", help="command to pipe output to (optional)")
+ ap.add_argument("-N", "--no-per-cpu", action="store_true", help="do not process data for each CPU in parallel")
+ ap.add_argument("-b", "--min_size", type=int, default=glb_min_samples, help="minimum data size (for Intel PT in PSBs)")
+ ap.add_argument("-D", "--dry-run", action="store_true", help="do not run any jobs, just show the perf script commands")
+ ap.add_argument("-q", "--quiet", action="store_true", help="do not print any messages except errors")
+ ap.add_argument("-v", "--verbose", action="store_true", help="print more messages")
+ ap.add_argument("-d", "--debug", action="store_true", help="print debugging messages")
+ cmd_line = list(args)
+ try:
+ split_pos = cmd_line.index("--")
+ cmd = cmd_line[split_pos + 1:]
+ args = cmd_line[:split_pos]
+ except ValueError:
+ cmd = None
+ args = cmd_line
+ a = ap.parse_args(args=args[1:])
+ a.cmd = cmd
+ a.verbosity = Verbosity(a.quiet, a.verbose, a.debug)
+ try:
+ if a.cmd is None:
+ if len(args) <= 1:
+ ap.print_help()
+ return True
+ raise Exception("Command line must contain '--' before perf command")
+ return RunParallelPerf(a)
+ except Exception as e:
+ print("Fatal error: ", str(e))
+ if a.debug:
+ raise
+ return False
+
+if __name__ == "__main__":
+ if not Main(sys.argv):
+ sys.exit(1)
diff --git a/tools/perf/scripts/python/task-analyzer.py b/tools/perf/scripts/python/task-analyzer.py
new file mode 100755
index 000000000000..3f1df9894246
--- /dev/null
+++ b/tools/perf/scripts/python/task-analyzer.py
@@ -0,0 +1,934 @@
+# task-analyzer.py - comprehensive perf tasks analysis
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022, Hagen Paul Pfeifer <hagen@jauu.net>
+# Licensed under the terms of the GNU GPL License version 2
+#
+# Usage:
+#
+# perf record -e sched:sched_switch -a -- sleep 10
+# perf script report task-analyzer
+#
+
+from __future__ import print_function
+import sys
+import os
+import string
+import argparse
+import decimal
+
+
+sys.path.append(
+ os.environ["PERF_EXEC_PATH"] + "/scripts/python/Perf-Trace-Util/lib/Perf/Trace"
+)
+from perf_trace_context import *
+from Core import *
+
+# Definition of possible ASCII color codes
+_COLORS = {
+ "grey": "\033[90m",
+ "red": "\033[91m",
+ "green": "\033[92m",
+ "yellow": "\033[93m",
+ "blue": "\033[94m",
+ "violet": "\033[95m",
+ "reset": "\033[0m",
+}
+
+# Columns will have a static size to align everything properly
+# Support for 116 days of active uptime with nano precision
+LEN_SWITCHED_IN = len("9999999.999999999") # 17
+LEN_SWITCHED_OUT = len("9999999.999999999") # 17
+LEN_CPU = len("000")
+LEN_PID = len("maxvalue") # 8
+LEN_TID = len("maxvalue") # 8
+LEN_COMM = len("max-comms-length") # 16
+LEN_RUNTIME = len("999999.999") # 10
+# Support of 3.45 hours of timespans
+LEN_OUT_IN = len("99999999999.999") # 15
+LEN_OUT_OUT = len("99999999999.999") # 15
+LEN_IN_IN = len("99999999999.999") # 15
+LEN_IN_OUT = len("99999999999.999") # 15
+
+
+# py2/py3 compatibility layer, see PEP469
+try:
+ dict.iteritems
+except AttributeError:
+ # py3
+ def itervalues(d):
+ return iter(d.values())
+
+ def iteritems(d):
+ return iter(d.items())
+
+else:
+ # py2
+ def itervalues(d):
+ return d.itervalues()
+
+ def iteritems(d):
+ return d.iteritems()
+
+
+def _check_color():
+ """disable colors if the user enforced no-color or stdout is not a tty"""
+ global _COLORS
+ if sys.stdout.isatty() and args.stdio_color != "never":
+ return
+ _COLORS = {
+ "grey": "",
+ "red": "",
+ "green": "",
+ "yellow": "",
+ "blue": "",
+ "violet": "",
+ "reset": "",
+ }
+
+
+def _parse_args():
+ global args
+ parser = argparse.ArgumentParser(description="Analyze tasks behavior")
+ parser.add_argument(
+ "--time-limit",
+ default=[],
+ help=
+ "print tasks only in time[s] window e.g"
+ " --time-limit 123.111:789.222(print all between 123.111 and 789.222)"
+ " --time-limit 123: (print all from 123)"
+ " --time-limit :456 (print all until incl. 456)",
+ )
+ parser.add_argument(
+ "--summary", action="store_true", help="print addtional runtime information"
+ )
+ parser.add_argument(
+ "--summary-only", action="store_true", help="print only summary without traces"
+ )
+ parser.add_argument(
+ "--summary-extended",
+ action="store_true",
+ help="print the summary with additional information of max inter task times"
+ " relative to the prev task",
+ )
+ parser.add_argument(
+ "--ns", action="store_true", help="show timestamps in nanoseconds"
+ )
+ parser.add_argument(
+ "--ms", action="store_true", help="show timestamps in milliseconds"
+ )
+ parser.add_argument(
+ "--extended-times",
+ action="store_true",
+ help="Show the elapsed times between schedule in/schedule out"
+ " of this task and the schedule in/schedule out of previous occurrence"
+ " of the same task",
+ )
+ parser.add_argument(
+ "--filter-tasks",
+ default=[],
+ help="filter out unneeded tasks by tid, pid or processname."
+ " E.g --filter-task 1337,/sbin/init ",
+ )
+ parser.add_argument(
+ "--limit-to-tasks",
+ default=[],
+ help="limit output to selected task by tid, pid, processname."
+ " E.g --limit-to-tasks 1337,/sbin/init",
+ )
+ parser.add_argument(
+ "--highlight-tasks",
+ default="",
+ help="colorize special tasks by their pid/tid/comm."
+ " E.g. --highlight-tasks 1:red,mutt:yellow"
+ " Colors available: red,grey,yellow,blue,violet,green",
+ )
+ parser.add_argument(
+ "--rename-comms-by-tids",
+ default="",
+ help="rename task names by using tid (<tid>:<newname>,<tid>:<newname>)"
+ " This option is handy for inexpressive processnames like python interpreted"
+ " process. E.g --rename 1337:my-python-app",
+ )
+ parser.add_argument(
+ "--stdio-color",
+ default="auto",
+ choices=["always", "never", "auto"],
+ help="always, never or auto, allowing configuring color output"
+ " via the command line",
+ )
+ parser.add_argument(
+ "--csv",
+ default="",
+ help="Write trace to file selected by user. Options, like --ns or --extended"
+ "-times are used.",
+ )
+ parser.add_argument(
+ "--csv-summary",
+ default="",
+ help="Write summary to file selected by user. Options, like --ns or"
+ " --summary-extended are used.",
+ )
+ args = parser.parse_args()
+ args.tid_renames = dict()
+
+ _argument_filter_sanity_check()
+ _argument_prepare_check()
+
+
+def time_uniter(unit):
+ picker = {
+ "s": 1,
+ "ms": 1e3,
+ "us": 1e6,
+ "ns": 1e9,
+ }
+ return picker[unit]
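+
+# For example, 2.5 (seconds) * time_uniter("ms") == 2500.0, i.e. the returned
+# factor converts from seconds to the requested unit.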
+
+
+def _init_db():
+ global db
+ db = dict()
+ db["running"] = dict()
+ db["cpu"] = dict()
+ db["tid"] = dict()
+ db["global"] = []
+ if args.summary or args.summary_extended or args.summary_only:
+ db["task_info"] = dict()
+ db["runtime_info"] = dict()
+ # min values for summary depending on the header
+ db["task_info"]["pid"] = len("PID")
+ db["task_info"]["tid"] = len("TID")
+ db["task_info"]["comm"] = len("Comm")
+ db["runtime_info"]["runs"] = len("Runs")
+ db["runtime_info"]["acc"] = len("Accumulated")
+ db["runtime_info"]["max"] = len("Max")
+ db["runtime_info"]["max_at"] = len("Max At")
+ db["runtime_info"]["min"] = len("Min")
+ db["runtime_info"]["mean"] = len("Mean")
+ db["runtime_info"]["median"] = len("Median")
+ if args.summary_extended:
+ db["inter_times"] = dict()
+ db["inter_times"]["out_in"] = len("Out-In")
+ db["inter_times"]["inter_at"] = len("At")
+ db["inter_times"]["out_out"] = len("Out-Out")
+ db["inter_times"]["in_in"] = len("In-In")
+ db["inter_times"]["in_out"] = len("In-Out")
+
+
+def _median(numbers):
+ """phython3 hat statistics module - we have nothing"""
+ n = len(numbers)
+ index = n // 2
+ if n % 2:
+ return sorted(numbers)[index]
+ return sum(sorted(numbers)[index - 1 : index + 1]) / 2
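+
+# For example, _median([3, 1, 2]) == 2, and _median([1, 2, 3, 4]) == 2.5
+# (the mean of the two middle values of an even-sized list).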
+
+
+def _mean(numbers):
+ return sum(numbers) / len(numbers)
+
+
+class Timespans(object):
+ """
+ The elapsed time between two occurrences of the same task is being tracked with the
+ help of this class. There are 4 of those Timespans Out-Out, In-Out, Out-In and
+ In-In.
+ The first half of the name signals the first time point of the
+ first task. The second half of the name represents the second
+ timepoint of the second task.
+ """
+
+ def __init__(self):
+ self._last_start = None
+ self._last_finish = None
+ self.out_out = -1
+ self.in_out = -1
+ self.out_in = -1
+ self.in_in = -1
+ if args.summary_extended:
+ self._time_in = -1
+ self.max_out_in = -1
+ self.max_at = -1
+ self.max_in_out = -1
+ self.max_in_in = -1
+ self.max_out_out = -1
+
+ def feed(self, task):
+ """
+ Called for every recorded trace event to find process pair and calculate the
+ task timespans. Chronological ordering, feed does not do reordering
+ """
+ if not self._last_finish:
+ self._last_start = task.time_in(time_unit)
+ self._last_finish = task.time_out(time_unit)
+ return
+ self._time_in = task.time_in()
+ time_in = task.time_in(time_unit)
+ time_out = task.time_out(time_unit)
+ self.in_in = time_in - self._last_start
+ self.out_in = time_in - self._last_finish
+ self.in_out = time_out - self._last_start
+ self.out_out = time_out - self._last_finish
+ if args.summary_extended:
+ self._update_max_entries()
+ self._last_finish = task.time_out(time_unit)
+ self._last_start = task.time_in(time_unit)
+
+ def _update_max_entries(self):
+ if self.in_in > self.max_in_in:
+ self.max_in_in = self.in_in
+ if self.out_out > self.max_out_out:
+ self.max_out_out = self.out_out
+ if self.in_out > self.max_in_out:
+ self.max_in_out = self.in_out
+ if self.out_in > self.max_out_in:
+ self.max_out_in = self.out_in
+ self.max_at = self._time_in
+
+
+
+class Summary(object):
+ """
+ Primary instance for calculating the summary output. Processes the whole trace to
+ find and memorize relevant data such as mean, max et cetera. This instance handles
+ dynamic alignment aspects for summary output.
+ """
+
+ def __init__(self):
+ self._body = []
+
+ class AlignmentHelper:
+ """
+ Used to calculate the alignment for the output of the summary.
+ """
+ def __init__(self, pid, tid, comm, runs, acc, mean,
+ median, min, max, max_at):
+ self.pid = pid
+ self.tid = tid
+ self.comm = comm
+ self.runs = runs
+ self.acc = acc
+ self.mean = mean
+ self.median = median
+ self.min = min
+ self.max = max
+ self.max_at = max_at
+ if args.summary_extended:
+ self.out_in = None
+ self.inter_at = None
+ self.out_out = None
+ self.in_in = None
+ self.in_out = None
+
+ def _print_header(self):
+ '''
+ Output is trimmed in _format_stats, thus an additional adjustment in the
+ header is needed, depending on the chosen time unit. The adjustment
+ corresponds to the number of column titles adjusted in _column_titles.
+ '''
+ decimal_precision = 6 if not args.ns else 9
+ fmt = " {{:^{}}}".format(sum(db["task_info"].values()))
+ fmt += " {{:^{}}}".format(
+ sum(db["runtime_info"].values()) - 2 * decimal_precision
+ )
+ _header = ("Task Information", "Runtime Information")
+
+ if args.summary_extended:
+ fmt += " {{:^{}}}".format(
+ sum(db["inter_times"].values()) - 4 * decimal_precision
+ )
+ _header += ("Max Inter Task Times",)
+ fd_sum.write(fmt.format(*_header) + "\n")
+
+ def _column_titles(self):
+ """
+ Cells are being processed and displayed in different way so an alignment adjust
+ is implemented depeding on the choice of the timeunit. The positions of the max
+ values are being displayed in grey. Thus in their format two additional {},
+ are placed for color set and reset.
+ """
+ separator, fix_csv_align = _prepare_fmt_sep()
+ decimal_precision, time_precision = _prepare_fmt_precision()
+ fmt = "{{:>{}}}".format(db["task_info"]["pid"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, db["task_info"]["tid"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, db["task_info"]["comm"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["runs"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["acc"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["mean"] * fix_csv_align)
+ fmt += "{}{{:>{}}}".format(
+ separator, db["runtime_info"]["median"] * fix_csv_align
+ )
+ fmt += "{}{{:>{}}}".format(
+ separator, (db["runtime_info"]["min"] - decimal_precision) * fix_csv_align
+ )
+ fmt += "{}{{:>{}}}".format(
+ separator, (db["runtime_info"]["max"] - decimal_precision) * fix_csv_align
+ )
+ fmt += "{}{{}}{{:>{}}}{{}}".format(
+ separator, (db["runtime_info"]["max_at"] - time_precision) * fix_csv_align
+ )
+
+ column_titles = ("PID", "TID", "Comm")
+ column_titles += ("Runs", "Accumulated", "Mean", "Median", "Min", "Max")
+ column_titles += (_COLORS["grey"], "Max At", _COLORS["reset"])
+
+ if args.summary_extended:
+ fmt += "{}{{:>{}}}".format(
+ separator,
+ (db["inter_times"]["out_in"] - decimal_precision) * fix_csv_align
+ )
+ fmt += "{}{{}}{{:>{}}}{{}}".format(
+ separator,
+ (db["inter_times"]["inter_at"] - time_precision) * fix_csv_align
+ )
+ fmt += "{}{{:>{}}}".format(
+ separator,
+ (db["inter_times"]["out_out"] - decimal_precision) * fix_csv_align
+ )
+ fmt += "{}{{:>{}}}".format(
+ separator,
+ (db["inter_times"]["in_in"] - decimal_precision) * fix_csv_align
+ )
+ fmt += "{}{{:>{}}}".format(
+ separator,
+ (db["inter_times"]["in_out"] - decimal_precision) * fix_csv_align
+ )
+
+ column_titles += ("Out-In", _COLORS["grey"], "Max At", _COLORS["reset"],
+ "Out-Out", "In-In", "In-Out")
+
+ fd_sum.write(fmt.format(*column_titles) + "\n")
+
+
+ def _task_stats(self):
+ """calculates the stats of every task and constructs the printable summary"""
+ for tid in sorted(db["tid"]):
+ color_one_sample = _COLORS["grey"]
+ color_reset = _COLORS["reset"]
+ no_executed = 0
+ runtimes = []
+ time_in = []
+ timespans = Timespans()
+ for task in db["tid"][tid]:
+ pid = task.pid
+ comm = task.comm
+ no_executed += 1
+ runtimes.append(task.runtime(time_unit))
+ time_in.append(task.time_in())
+ timespans.feed(task)
+ if len(runtimes) > 1:
+ color_one_sample = ""
+ color_reset = ""
+ time_max = max(runtimes)
+ time_min = min(runtimes)
+ max_at = time_in[runtimes.index(max(runtimes))]
+
+ # The number of decimal places after sum, mean and median varies, so we
+ # cut it by rounding. This has no impact on the output, because the
+ # output has a fixed decimal precision anyway.
+ time_sum = round(sum(runtimes), 3)
+ time_mean = round(_mean(runtimes), 3)
+ time_median = round(_median(runtimes), 3)
+
+ align_helper = self.AlignmentHelper(pid, tid, comm, no_executed, time_sum,
+ time_mean, time_median, time_min, time_max, max_at)
+ self._body.append([pid, tid, comm, no_executed, time_sum, color_one_sample,
+ time_mean, time_median, time_min, time_max,
+ _COLORS["grey"], max_at, _COLORS["reset"], color_reset])
+ if args.summary_extended:
+ self._body[-1].extend([timespans.max_out_in,
+ _COLORS["grey"], timespans.max_at,
+ _COLORS["reset"], timespans.max_out_out,
+ timespans.max_in_in,
+ timespans.max_in_out])
+ align_helper.out_in = timespans.max_out_in
+ align_helper.inter_at = timespans.max_at
+ align_helper.out_out = timespans.max_out_out
+ align_helper.in_in = timespans.max_in_in
+ align_helper.in_out = timespans.max_in_out
+ self._calc_alignments_summary(align_helper)
+
+ def _format_stats(self):
+ separator, fix_csv_align = _prepare_fmt_sep()
+ decimal_precision, time_precision = _prepare_fmt_precision()
+ len_pid = db["task_info"]["pid"] * fix_csv_align
+ len_tid = db["task_info"]["tid"] * fix_csv_align
+ len_comm = db["task_info"]["comm"] * fix_csv_align
+ len_runs = db["runtime_info"]["runs"] * fix_csv_align
+ len_acc = db["runtime_info"]["acc"] * fix_csv_align
+ len_mean = db["runtime_info"]["mean"] * fix_csv_align
+ len_median = db["runtime_info"]["median"] * fix_csv_align
+ len_min = (db["runtime_info"]["min"] - decimal_precision) * fix_csv_align
+ len_max = (db["runtime_info"]["max"] - decimal_precision) * fix_csv_align
+ len_max_at = (db["runtime_info"]["max_at"] - time_precision) * fix_csv_align
+ if args.summary_extended:
+ len_out_in = (
+ db["inter_times"]["out_in"] - decimal_precision
+ ) * fix_csv_align
+ len_inter_at = (
+ db["inter_times"]["inter_at"] - time_precision
+ ) * fix_csv_align
+ len_out_out = (
+ db["inter_times"]["out_out"] - decimal_precision
+ ) * fix_csv_align
+ len_in_in = (db["inter_times"]["in_in"] - decimal_precision) * fix_csv_align
+ len_in_out = (
+ db["inter_times"]["in_out"] - decimal_precision
+ ) * fix_csv_align
+
+ fmt = "{{:{}d}}".format(len_pid)
+ fmt += "{}{{:{}d}}".format(separator, len_tid)
+ fmt += "{}{{:>{}}}".format(separator, len_comm)
+ fmt += "{}{{:{}d}}".format(separator, len_runs)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_acc, time_precision)
+ fmt += "{}{{}}{{:{}.{}f}}".format(separator, len_mean, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_median, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_min, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_max, time_precision)
+ fmt += "{}{{}}{{:{}.{}f}}{{}}{{}}".format(
+ separator, len_max_at, decimal_precision
+ )
+ if args.summary_extended:
+ fmt += "{}{{:{}.{}f}}".format(separator, len_out_in, time_precision)
+ fmt += "{}{{}}{{:{}.{}f}}{{}}".format(
+ separator, len_inter_at, decimal_precision
+ )
+ fmt += "{}{{:{}.{}f}}".format(separator, len_out_out, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_in_in, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, len_in_out, time_precision)
+ return fmt
+
+
+ def _calc_alignments_summary(self, align_helper):
+ # The lengths are kept in 3 groups so that further additions are easy to
+ # handle. Every attribute of the alignment helper is checked against the
+ # longest value seen so far, and the maximum length is saved.
+ for key in db["task_info"]:
+ if len(str(getattr(align_helper, key))) > db["task_info"][key]:
+ db["task_info"][key] = len(str(getattr(align_helper, key)))
+ for key in db["runtime_info"]:
+ if len(str(getattr(align_helper, key))) > db["runtime_info"][key]:
+ db["runtime_info"][key] = len(str(getattr(align_helper, key)))
+ if args.summary_extended:
+ for key in db["inter_times"]:
+ if len(str(getattr(align_helper, key))) > db["inter_times"][key]:
+ db["inter_times"][key] = len(str(getattr(align_helper, key)))
+
+
+ def print(self):
+ self._task_stats()
+ fmt = self._format_stats()
+
+ if not args.csv_summary:
+ print("\nSummary")
+ self._print_header()
+ self._column_titles()
+ for i in range(len(self._body)):
+ fd_sum.write(fmt.format(*tuple(self._body[i])) + "\n")
+
+
+class Task(object):
+ """ The class is used to handle the information of a given task."""
+
+ def __init__(self, id, tid, cpu, comm):
+ self.id = id
+ self.tid = tid
+ self.cpu = cpu
+ self.comm = comm
+ self.pid = None
+ self._time_in = None
+ self._time_out = None
+
+ def schedule_in_at(self, time):
+ """set the time where the task was scheduled in"""
+ self._time_in = time
+
+ def schedule_out_at(self, time):
+ """set the time where the task was scheduled out"""
+ self._time_out = time
+
+ def time_out(self, unit="s"):
+ """return time where a given task was scheduled out"""
+ factor = time_uniter(unit)
+ return self._time_out * decimal.Decimal(factor)
+
+ def time_in(self, unit="s"):
+ """return time where a given task was scheduled in"""
+ factor = time_uniter(unit)
+ return self._time_in * decimal.Decimal(factor)
+
+ def runtime(self, unit="us"):
+ factor = time_uniter(unit)
+ return (self._time_out - self._time_in) * decimal.Decimal(factor)
+
+ def update_pid(self, pid):
+ self.pid = pid
+
+
+def _task_id(pid, cpu):
+ """returns a "unique-enough" identifier, please do not change"""
+ return "{}-{}".format(pid, cpu)
+
+
+def _filter_non_printable(unfiltered):
+ """comm names may contain loony chars like '\x00000'"""
+ filtered = ""
+ for char in unfiltered:
+ if char not in string.printable:
+ continue
+ filtered += char
+ return filtered
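+
+# For example, _filter_non_printable("foo\x00bar") -> "foobar".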
+
+
+def _fmt_header():
+ separator, fix_csv_align = _prepare_fmt_sep()
+ fmt = "{{:>{}}}".format(LEN_SWITCHED_IN*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_SWITCHED_OUT*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_CPU*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_PID*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_TID*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_COMM*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_RUNTIME*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_OUT_IN*fix_csv_align)
+ if args.extended_times:
+ fmt += "{}{{:>{}}}".format(separator, LEN_OUT_OUT*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_IN_IN*fix_csv_align)
+ fmt += "{}{{:>{}}}".format(separator, LEN_IN_OUT*fix_csv_align)
+ return fmt
+
+
+def _fmt_body():
+ separator, fix_csv_align = _prepare_fmt_sep()
+ decimal_precision, time_precision = _prepare_fmt_precision()
+ fmt = "{{}}{{:{}.{}f}}".format(LEN_SWITCHED_IN*fix_csv_align, decimal_precision)
+ fmt += "{}{{:{}.{}f}}".format(
+ separator, LEN_SWITCHED_OUT*fix_csv_align, decimal_precision
+ )
+ fmt += "{}{{:{}d}}".format(separator, LEN_CPU*fix_csv_align)
+ fmt += "{}{{:{}d}}".format(separator, LEN_PID*fix_csv_align)
+ fmt += "{}{{}}{{:{}d}}{{}}".format(separator, LEN_TID*fix_csv_align)
+ fmt += "{}{{}}{{:>{}}}".format(separator, LEN_COMM*fix_csv_align)
+ fmt += "{}{{:{}.{}f}}".format(separator, LEN_RUNTIME*fix_csv_align, time_precision)
+ if args.extended_times:
+ fmt += "{}{{:{}.{}f}}".format(separator, LEN_OUT_IN*fix_csv_align, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, LEN_OUT_OUT*fix_csv_align, time_precision)
+ fmt += "{}{{:{}.{}f}}".format(separator, LEN_IN_IN*fix_csv_align, time_precision)
+ fmt += "{}{{:{}.{}f}}{{}}".format(
+ separator, LEN_IN_OUT*fix_csv_align, time_precision
+ )
+ else:
+ fmt += "{}{{:{}.{}f}}{{}}".format(
+ separator, LEN_OUT_IN*fix_csv_align, time_precision
+ )
+ return fmt
+
+
+def _print_header():
+ fmt = _fmt_header()
+ header = ("Switched-In", "Switched-Out", "CPU", "PID", "TID", "Comm", "Runtime",
+ "Time Out-In")
+ if args.extended_times:
+ header += ("Time Out-Out", "Time In-In", "Time In-Out")
+ fd_task.write(fmt.format(*header) + "\n")
+
+
+
+def _print_task_finish(task):
+ """calculating every entry of a row and printing it immediately"""
+ c_row_set = ""
+ c_row_reset = ""
+ out_in = -1
+ out_out = -1
+ in_in = -1
+ in_out = -1
+ fmt = _fmt_body()
+ # depending on the user-provided highlight option we change the color
+ # of particular tasks
+ if str(task.tid) in args.highlight_tasks_map:
+ c_row_set = _COLORS[args.highlight_tasks_map[str(task.tid)]]
+ c_row_reset = _COLORS["reset"]
+ if task.comm in args.highlight_tasks_map:
+ c_row_set = _COLORS[args.highlight_tasks_map[task.comm]]
+ c_row_reset = _COLORS["reset"]
+ # grey out entries if PID == TID: they are identical, meaning no
+ # threading model, so the thread id (tid) does not matter
+ c_tid_set = ""
+ c_tid_reset = ""
+ if task.pid == task.tid:
+ c_tid_set = _COLORS["grey"]
+ c_tid_reset = _COLORS["reset"]
+ if task.tid in db["tid"]:
+ # get last task of tid
+ last_tid_task = db["tid"][task.tid][-1]
+ # feed the timespan calculator: first the last task in the tid db,
+ # then the current one
+ timespan_gap_tid = Timespans()
+ timespan_gap_tid.feed(last_tid_task)
+ timespan_gap_tid.feed(task)
+ out_in = timespan_gap_tid.out_in
+ out_out = timespan_gap_tid.out_out
+ in_in = timespan_gap_tid.in_in
+ in_out = timespan_gap_tid.in_out
+
+ if args.extended_times:
+ line_out = fmt.format(c_row_set, task.time_in(), task.time_out(), task.cpu,
+ task.pid, c_tid_set, task.tid, c_tid_reset, c_row_set, task.comm,
+ task.runtime(time_unit), out_in, out_out, in_in, in_out,
+ c_row_reset) + "\n"
+ else:
+ line_out = fmt.format(c_row_set, task.time_in(), task.time_out(), task.cpu,
+ task.pid, c_tid_set, task.tid, c_tid_reset, c_row_set, task.comm,
+ task.runtime(time_unit), out_in, c_row_reset) + "\n"
+ try:
+ fd_task.write(line_out)
+ except IOError:
+ # don't mangle the output if user SIGINT this script
+ sys.exit()
+
+def _record_cleanup(_list):
+ """
+ no need to store more then one element if --summarize
+ is not enabled
+ """
+ if not args.summary and len(_list) > 1:
+ _list = _list[len(_list) - 1 :]
+
+
+def _record_by_tid(task):
+ tid = task.tid
+ if tid not in db["tid"]:
+ db["tid"][tid] = []
+ db["tid"][tid].append(task)
+ _record_cleanup(db["tid"][tid])
+
+
+def _record_by_cpu(task):
+ cpu = task.cpu
+ if cpu not in db["cpu"]:
+ db["cpu"][cpu] = []
+ db["cpu"][cpu].append(task)
+ _record_cleanup(db["cpu"][cpu])
+
+
+def _record_global(task):
+ """record all executed task, ordered by finish chronological"""
+ db["global"].append(task)
+ _record_cleanup(db["global"])
+
+
+def _handle_task_finish(tid, cpu, time, perf_sample_dict):
+ if tid == 0:
+ return
+ _id = _task_id(tid, cpu)
+ if _id not in db["running"]:
+ # may happen if we missed the switch-in
+ # event. Seen in combination with --exclude-perf
+ # where the switch-in is filtered out, but not
+ # the switch-out. Probably a bug in the
+ # exclude-perf option.
+ return
+ task = db["running"][_id]
+ task.schedule_out_at(time)
+
+ # record the pid; during switch-in the pid
+ # is not available, so update it now
+ pid = int(perf_sample_dict["sample"]["pid"])
+
+ task.update_pid(pid)
+ del db["running"][_id]
+
+ # print only tasks which are not being filtered and no print of trace
+ # for summary only, but record every task.
+ if not _limit_filtered(tid, pid, task.comm) and not args.summary_only:
+ _print_task_finish(task)
+ _record_by_tid(task)
+ _record_by_cpu(task)
+ _record_global(task)
+
+
+def _handle_task_start(tid, cpu, comm, time):
+ if tid == 0:
+ return
+ if tid in args.tid_renames:
+ comm = args.tid_renames[tid]
+ _id = _task_id(tid, cpu)
+ if _id in db["running"]:
+ # handle corner cases where already running tasks
+ # are switched-to again - saw this with --exclude-perf
+ # recorded traces. We simply ignore this "second start"
+ # event.
+ return
+ assert _id not in db["running"]
+ task = Task(_id, tid, cpu, comm)
+ task.schedule_in_at(time)
+ db["running"][_id] = task
+
+
+def _time_to_internal(time_ns):
+ """
+ To prevent float rounding errors we use Decimal internally
+ """
+ return decimal.Decimal(time_ns) / decimal.Decimal(1e9)
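+
+# For example, _time_to_internal(1500000000) -> Decimal('1.5') seconds, with
+# no binary floating point rounding error.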
+
+
+def _limit_filtered(tid, pid, comm):
+ if args.filter_tasks:
+ if str(tid) in args.filter_tasks or comm in args.filter_tasks:
+ return True
+ else:
+ return False
+ if args.limit_to_tasks:
+ if str(tid) in args.limit_to_tasks or comm in args.limit_to_tasks:
+ return False
+ else:
+ return True
+
+
+def _argument_filter_sanity_check():
+ if args.limit_to_tasks and args.filter_tasks:
+ sys.exit("Error: Filter and Limit at the same time active.")
+ if args.extended_times and args.summary_only:
+ sys.exit("Error: Summary only and extended times active.")
+ if args.time_limit and ":" not in args.time_limit:
+ sys.exit(
+ "Error: No bound set for time limit. Please set bound by ':' e.g :123."
+ )
+ if args.time_limit and (args.summary or args.summary_only or args.summary_extended):
+ sys.exit("Error: Cannot set time limit and print summary")
+ if args.csv_summary:
+ args.summary = True
+ if args.csv == args.csv_summary:
+ sys.exit("Error: Chosen files for csv and csv summary are the same")
+ if args.csv and (args.summary_extended or args.summary) and not args.csv_summary:
+ sys.exit("Error: No file chosen to write summary to. Choose with --csv-summary "
+ "<file>")
+ if args.csv and args.summary_only:
+ sys.exit("Error: --csv chosen and --summary-only. Standard task would not be"
+ "written to csv file.")
+
+def _argument_prepare_check():
+ global time_unit, fd_task, fd_sum
+ if args.filter_tasks:
+ args.filter_tasks = args.filter_tasks.split(",")
+ if args.limit_to_tasks:
+ args.limit_to_tasks = args.limit_to_tasks.split(",")
+ if args.time_limit:
+ args.time_limit = args.time_limit.split(":")
+ for rename_tuple in args.rename_comms_by_tids.split(","):
+ tid_name = rename_tuple.split(":")
+ if len(tid_name) != 2:
+ continue
+ args.tid_renames[int(tid_name[0])] = tid_name[1]
+ args.highlight_tasks_map = dict()
+ for highlight_tasks_tuple in args.highlight_tasks.split(","):
+ tasks_color_map = highlight_tasks_tuple.split(":")
+ # default highlight color to red if no color set by user
+ if len(tasks_color_map) == 1:
+ tasks_color_map.append("red")
+ if args.highlight_tasks and tasks_color_map[1].lower() not in _COLORS:
+ sys.exit(
+ "Error: Color not defined, please choose from grey,red,green,yellow,blue,"
+ "violet"
+ )
+ if len(tasks_color_map) != 2:
+ continue
+ args.highlight_tasks_map[tasks_color_map[0]] = tasks_color_map[1]
+ time_unit = "us"
+ if args.ns:
+ time_unit = "ns"
+ elif args.ms:
+ time_unit = "ms"
+
+ fd_task = sys.stdout
+ if args.csv:
+ args.stdio_color = "never"
+ fd_task = open(args.csv, "w")
+ print("generating csv at",args.csv,)
+
+ fd_sum = sys.stdout
+ if args.csv_summary:
+ args.stdio_color = "never"
+ fd_sum = open(args.csv_summary, "w")
+ print("generating csv summary at",args.csv_summary)
+ if not args.csv:
+ args.summary_only = True
+
+
+def _is_within_timelimit(time):
+ """
+ Check if a time limit was given on the command line; if so, process only
+ events within it. If not, process the recorded trace in its entirety.
+ """
+ if not args.time_limit:
+ return True
+ lower_time_limit = args.time_limit[0]
+ upper_time_limit = args.time_limit[1]
+ # check for upper limit
+ if upper_time_limit == "":
+ if time >= decimal.Decimal(lower_time_limit):
+ return True
+ # check for lower limit
+ if lower_time_limit == "":
+ if time <= decimal.Decimal(upper_time_limit):
+ return True
+ # quit if time exceeds upper limit. Good for big datasets
+ else:
+ sys.exit()
+ if lower_time_limit != "" and upper_time_limit != "":
+ if (time >= decimal.Decimal(lower_time_limit) and
+ time <= decimal.Decimal(upper_time_limit)):
+ return True
+ # quit if time exceeds upper limit. Good for big datasets
+ elif time > decimal.Decimal(upper_time_limit):
+ sys.exit()
+
+def _prepare_fmt_precision():
+ decimal_precision = 6
+ time_precision = 3
+ if args.ns:
+ decimal_precision = 9
+ time_precision = 0
+ return decimal_precision, time_precision
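+
+# For example, with --ns timestamps are printed with all 9 fractional digits
+# and runtimes as integers (0 decimal places); otherwise timestamps get 6 and
+# runtimes 3 decimal places.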
+
+def _prepare_fmt_sep():
+ separator = " "
+ fix_csv_align = 1
+ if args.csv or args.csv_summary:
+ separator = ";"
+ fix_csv_align = 0
+ return separator, fix_csv_align
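+
+# For example, in CSV mode this returns (";", 0): fields become
+# semicolon-separated and the alignment widths collapse to zero, i.e. no
+# padding; otherwise (" ", 1) keeps the padded column layout.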
+
+def trace_unhandled(event_name, context, event_fields_dict, perf_sample_dict):
+ pass
+
+
+def trace_begin():
+ _parse_args()
+ _check_color()
+ _init_db()
+ if not args.summary_only:
+ _print_header()
+
+def trace_end():
+ if args.summary or args.summary_extended or args.summary_only:
+ Summary().print()
+
+def sched__sched_switch(event_name, context, common_cpu, common_secs, common_nsecs,
+ common_pid, common_comm, common_callchain, prev_comm,
+ prev_pid, prev_prio, prev_state, next_comm, next_pid,
+ next_prio, perf_sample_dict):
+ # ignore common_secs & common_nsecs because we need the
+ # high-resolution timestamp anyway; using the raw value
+ # is faster
+ time = _time_to_internal(perf_sample_dict["sample"]["time"])
+ if not _is_within_timelimit(time):
+ # a user-specified --time-limit a:b excludes this sample
+ return
+
+ next_comm = _filter_non_printable(next_comm)
+ _handle_task_finish(prev_pid, common_cpu, time, perf_sample_dict)
+ _handle_task_start(next_pid, common_cpu, next_comm, time)