aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/workqueue/wq_monitor.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/workqueue/wq_monitor.py')
-rw-r--r--tools/workqueue/wq_monitor.py175
1 files changed, 175 insertions, 0 deletions
diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py
new file mode 100644
index 000000000000..a8856a9c45dc
--- /dev/null
+++ b/tools/workqueue/wq_monitor.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
+# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
+
+desc = """
+This is a drgn script to monitor workqueues. For more info on drgn, visit
+https://github.com/osandov/drgn.
+
+ total Total number of work items executed by the workqueue.
+
+ infl The number of currently in-flight work items.
+
+ CPUtime Total CPU time consumed by the workqueue in seconds. This is
+ sampled from scheduler ticks and only provides ballpark
+ measurement. "nohz_full=" CPUs are excluded from measurement.
+
+ CPUitsv The number of times a concurrency-managed work item hogged CPU
+ longer than the threshold (workqueue.cpu_intensive_thresh_us)
+ and got excluded from concurrency management to avoid stalling
+ other work items.
+
+ CMW/RPR For per-cpu workqueues, the number of concurrency-management
+ wake-ups while executing a work item of the workqueue. For
+ unbound workqueues, the number of times a worker was repatriated
+ to its affinity scope after being migrated to an off-scope CPU by
+ the scheduler.
+
+ mayday The number of times the rescuer was requested while waiting for
+ new worker creation.
+
+ rescued The number of work items executed by the rescuer.
+"""
+
+import sys
+import signal
+import os
+import re
+import time
+import json
+
+import drgn
+from drgn.helpers.linux.list import list_for_each_entry,list_empty
+from drgn.helpers.linux.cpumask import for_each_possible_cpu
+
+import argparse
+parser = argparse.ArgumentParser(description=desc,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('workqueue', metavar='REGEX', nargs='*',
+ help='Target workqueue name patterns (all if empty)')
+parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
+ help='Monitoring interval (0 to print once and exit)')
+parser.add_argument('-j', '--json', action='store_true',
+ help='Output in json')
+args = parser.parse_args()
+
+def err(s):
+ print(s, file=sys.stderr, flush=True)
+ sys.exit(1)
+
+workqueues = prog['workqueues']
+
+WQ_UNBOUND = prog['WQ_UNBOUND']
+WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']
+
+PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
+PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
+PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed
+PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
+PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
+PWQ_STAT_REPATRIATED = prog['PWQ_STAT_REPATRIATED'] # unbound workers brought back into scope
+PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
+PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
+PWQ_NR_STATS = prog['PWQ_NR_STATS']
+
+class WqStats:
+ def __init__(self, wq):
+ self.name = wq.name.string_().decode()
+ self.unbound = wq.flags & WQ_UNBOUND != 0
+ self.mem_reclaim = wq.flags & WQ_MEM_RECLAIM != 0
+ self.stats = [0] * PWQ_NR_STATS
+ for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
+ for i in range(PWQ_NR_STATS):
+ self.stats[i] += int(pwq.stats[i])
+
+ def dict(self, now):
+ return { 'timestamp' : now,
+ 'name' : self.name,
+ 'unbound' : self.unbound,
+ 'mem_reclaim' : self.mem_reclaim,
+ 'started' : self.stats[PWQ_STAT_STARTED],
+ 'completed' : self.stats[PWQ_STAT_COMPLETED],
+ 'cpu_time' : self.stats[PWQ_STAT_CPU_TIME],
+ 'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
+ 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
+ 'repatriated' : self.stats[PWQ_STAT_REPATRIATED],
+ 'mayday' : self.stats[PWQ_STAT_MAYDAY],
+ 'rescued' : self.stats[PWQ_STAT_RESCUED], }
+
+ def table_header_str():
+ return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
+ f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'
+
+ def table_row_str(self):
+ cpu_intensive = '-'
+ cmw_rpr = '-'
+ mayday = '-'
+ rescued = '-'
+
+ if self.unbound:
+ cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED]);
+ else:
+ cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
+ cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])
+
+ if self.mem_reclaim:
+ mayday = str(self.stats[PWQ_STAT_MAYDAY])
+ rescued = str(self.stats[PWQ_STAT_RESCUED])
+
+ out = f'{self.name[-24:]:24} ' \
+ f'{self.stats[PWQ_STAT_STARTED]:8} ' \
+ f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
+ f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
+ f'{cpu_intensive:>7} ' \
+ f'{cmw_rpr:>7} ' \
+ f'{mayday:>7} ' \
+ f'{rescued:>7} '
+ return out.rstrip(':')
+
+exit_req = False
+
+def sigint_handler(signr, frame):
+ global exit_req
+ exit_req = True
+
+def main():
+ # handle args
+ table_fmt = not args.json
+ interval = args.interval
+
+ re_str = None
+ if args.workqueue:
+ for r in args.workqueue:
+ if re_str is None:
+ re_str = r
+ else:
+ re_str += '|' + r
+
+ filter_re = re.compile(re_str) if re_str else None
+
+ # monitoring loop
+ signal.signal(signal.SIGINT, sigint_handler)
+
+ while not exit_req:
+ now = time.time()
+
+ if table_fmt:
+ print()
+ print(WqStats.table_header_str())
+
+ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
+ stats = WqStats(wq)
+ if filter_re and not filter_re.search(stats.name):
+ continue
+ if table_fmt:
+ print(stats.table_row_str())
+ else:
+ print(stats.dict(now))
+
+ if interval == 0:
+ break
+ time.sleep(interval)
+
+if __name__ == "__main__":
+ main()