diff options
Diffstat (limited to 'tools/perf/bench')
29 files changed, 3267 insertions, 647 deletions
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 768e408757a0..b558ab98719f 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -1,18 +1,26 @@ -perf-y += sched-messaging.o -perf-y += sched-pipe.o -perf-y += mem-functions.o -perf-y += futex-hash.o -perf-y += futex-wake.o -perf-y += futex-wake-parallel.o -perf-y += futex-requeue.o -perf-y += futex-lock-pi.o -perf-y += epoll-wait.o -perf-y += epoll-ctl.o -perf-y += synthesize.o -perf-y += kallsyms-parse.o +perf-bench-y += sched-messaging.o +perf-bench-y += sched-pipe.o +perf-bench-y += sched-seccomp-notify.o +perf-bench-y += syscall.o +perf-bench-y += mem-functions.o +perf-bench-y += futex.o +perf-bench-y += futex-hash.o +perf-bench-y += futex-wake.o +perf-bench-y += futex-wake-parallel.o +perf-bench-y += futex-requeue.o +perf-bench-y += futex-lock-pi.o +perf-bench-y += epoll-wait.o +perf-bench-y += epoll-ctl.o +perf-bench-y += synthesize.o +perf-bench-y += kallsyms-parse.o +perf-bench-y += find-bit-bench.o +perf-bench-y += inject-buildid.o +perf-bench-y += evlist-open-close.o +perf-bench-y += breakpoint.o +perf-bench-y += pmu-scan.o +perf-bench-y += uprobe.o -perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o -perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o -perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o +perf-bench-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o +perf-bench-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o -perf-$(CONFIG_NUMA) += numa.o +perf-bench-$(CONFIG_NUMA) += numa.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 61cae4966cae..9f736423af53 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -10,31 +10,25 @@ extern struct timeval bench__start, bench__end, bench__runtime; * The madvise transparent hugepage constants were added in glibc * 2.13. For compatibility with older versions of glibc, define these * tokens if they are not already defined. - * - * PA-RISC uses different madvise values from other architectures and - * needs to be special-cased. */ -#ifdef __hppa__ -# ifndef MADV_HUGEPAGE -# define MADV_HUGEPAGE 67 -# endif -# ifndef MADV_NOHUGEPAGE -# define MADV_NOHUGEPAGE 68 -# endif -#else # ifndef MADV_HUGEPAGE # define MADV_HUGEPAGE 14 # endif # ifndef MADV_NOHUGEPAGE # define MADV_NOHUGEPAGE 15 # endif -#endif int bench_numa(int argc, const char **argv); int bench_sched_messaging(int argc, const char **argv); int bench_sched_pipe(int argc, const char **argv); +int bench_sched_seccomp_notify(int argc, const char **argv); +int bench_syscall_basic(int argc, const char **argv); +int bench_syscall_getpgid(int argc, const char **argv); +int bench_syscall_fork(int argc, const char **argv); +int bench_syscall_execve(int argc, const char **argv); int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); +int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); int bench_futex_wake(int argc, const char **argv); int bench_futex_wake_parallel(int argc, const char **argv); @@ -45,6 +39,16 @@ int bench_epoll_wait(int argc, const char **argv); int bench_epoll_ctl(int argc, const char **argv); int bench_synthesize(int argc, const char **argv); int bench_kallsyms_parse(int argc, const char **argv); +int bench_inject_build_id(int argc, const char **argv); +int bench_evlist_open_close(int argc, const char **argv); +int bench_breakpoint_thread(int argc, const char **argv); +int bench_breakpoint_enable(int argc, const char **argv); +int bench_uprobe_baseline(int argc, const char **argv); +int bench_uprobe_empty(int argc, const char **argv); +int bench_uprobe_trace_printk(int argc, const char **argv); +int bench_uprobe_empty_ret(int argc, const char **argv); +int bench_uprobe_trace_printk_ret(int argc, const char **argv); +int bench_pmu_scan(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/breakpoint.c b/tools/perf/bench/breakpoint.c new file mode 100644 index 000000000000..dfd18f5db97d --- /dev/null +++ b/tools/perf/bench/breakpoint.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <subcmd/parse-options.h> +#include <linux/hw_breakpoint.h> +#include <linux/perf_event.h> +#include <linux/time64.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> +#include "bench.h" +#include "futex.h" + +struct { + unsigned int nbreakpoints; + unsigned int nparallel; + unsigned int nthreads; +} thread_params = { + .nbreakpoints = 1, + .nparallel = 1, + .nthreads = 1, +}; + +static const struct option thread_options[] = { + OPT_UINTEGER('b', "breakpoints", &thread_params.nbreakpoints, + "Specify amount of breakpoints"), + OPT_UINTEGER('p', "parallelism", &thread_params.nparallel, "Specify amount of parallelism"), + OPT_UINTEGER('t', "threads", &thread_params.nthreads, "Specify amount of threads"), + OPT_END() +}; + +static const char * const thread_usage[] = { + "perf bench breakpoint thread <options>", + NULL +}; + +struct breakpoint { + int fd; + char watched; +}; + +static int breakpoint_setup(void *addr) +{ + struct perf_event_attr attr = { .size = 0, }; + int fd; + + attr.type = PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.inherit = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.bp_addr = (unsigned long)addr; + attr.bp_type = HW_BREAKPOINT_RW; + attr.bp_len = HW_BREAKPOINT_LEN_1; + fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0); + + if (fd < 0) + fd = -errno; + + return fd; +} + +static void *passive_thread(void *arg) +{ + unsigned int *done = (unsigned int *)arg; + + while (!__atomic_load_n(done, __ATOMIC_RELAXED)) + futex_wait(done, 0, NULL, 0); + return NULL; +} + +static void *active_thread(void *arg) +{ + unsigned int *done = (unsigned int *)arg; + + while (!__atomic_load_n(done, __ATOMIC_RELAXED)); + return NULL; +} + +static void *breakpoint_thread(void *arg) +{ + unsigned int i, done; + int *repeat = (int *)arg; + pthread_t *threads; + + threads = calloc(thread_params.nthreads, sizeof(threads[0])); + if (!threads) + exit((perror("calloc"), EXIT_FAILURE)); + + while (__atomic_fetch_sub(repeat, 1, __ATOMIC_RELAXED) > 0) { + done = 0; + for (i = 0; i < thread_params.nthreads; i++) { + if (pthread_create(&threads[i], NULL, passive_thread, &done)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); + futex_wake(&done, thread_params.nthreads, 0); + for (i = 0; i < thread_params.nthreads; i++) + pthread_join(threads[i], NULL); + } + free(threads); + return NULL; +} + +// The benchmark creates nbreakpoints inheritable breakpoints, +// then starts nparallel threads which create and join bench_repeat batches of nthreads threads. +int bench_breakpoint_thread(int argc, const char **argv) +{ + unsigned int i, result_usec; + int repeat = bench_repeat; + struct breakpoint *breakpoints; + pthread_t *parallel; + struct timeval start, stop, diff; + + if (parse_options(argc, argv, thread_options, thread_usage, 0)) { + usage_with_options(thread_usage, thread_options); + exit(EXIT_FAILURE); + } + breakpoints = calloc(thread_params.nbreakpoints, sizeof(breakpoints[0])); + parallel = calloc(thread_params.nparallel, sizeof(parallel[0])); + if (!breakpoints || !parallel) + exit((perror("calloc"), EXIT_FAILURE)); + + for (i = 0; i < thread_params.nbreakpoints; i++) { + breakpoints[i].fd = breakpoint_setup(&breakpoints[i].watched); + + if (breakpoints[i].fd < 0) { + if (breakpoints[i].fd == -ENODEV) { + printf("Skipping perf bench breakpoint thread: No hardware support\n"); + return 0; + } + exit((perror("perf_event_open"), EXIT_FAILURE)); + } + } + gettimeofday(&start, NULL); + for (i = 0; i < thread_params.nparallel; i++) { + if (pthread_create(¶llel[i], NULL, breakpoint_thread, &repeat)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + for (i = 0; i < thread_params.nparallel; i++) + pthread_join(parallel[i], NULL); + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + for (i = 0; i < thread_params.nbreakpoints; i++) + close(breakpoints[i].fd); + free(parallel); + free(breakpoints); + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Created/joined %d threads with %d breakpoints and %d parallelism\n", + bench_repeat, thread_params.nbreakpoints, thread_params.nparallel); + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + printf(" %14lf usecs/op\n", + (double)result_usec / bench_repeat / thread_params.nthreads); + printf(" %14lf usecs/op/cpu\n", + (double)result_usec / bench_repeat / + thread_params.nthreads * thread_params.nparallel); + break; + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + break; + default: + fprintf(stderr, "Unknown format: %d\n", bench_format); + exit(EXIT_FAILURE); + } + return 0; +} + +struct { + unsigned int npassive; + unsigned int nactive; +} enable_params = { + .nactive = 0, + .npassive = 0, +}; + +static const struct option enable_options[] = { + OPT_UINTEGER('p', "passive", &enable_params.npassive, "Specify amount of passive threads"), + OPT_UINTEGER('a', "active", &enable_params.nactive, "Specify amount of active threads"), + OPT_END() +}; + +static const char * const enable_usage[] = { + "perf bench breakpoint enable <options>", + NULL +}; + +// The benchmark creates an inheritable breakpoint, +// then starts npassive threads that block and nactive threads that actively spin +// and then disables and enables the breakpoint bench_repeat times. +int bench_breakpoint_enable(int argc, const char **argv) +{ + unsigned int i, nthreads, result_usec, done = 0; + char watched; + int fd; + pthread_t *threads; + struct timeval start, stop, diff; + + if (parse_options(argc, argv, enable_options, enable_usage, 0)) { + usage_with_options(enable_usage, enable_options); + exit(EXIT_FAILURE); + } + fd = breakpoint_setup(&watched); + + if (fd < 0) { + if (fd == -ENODEV) { + printf("Skipping perf bench breakpoint enable: No hardware support\n"); + return 0; + } + exit((perror("perf_event_open"), EXIT_FAILURE)); + } + nthreads = enable_params.npassive + enable_params.nactive; + threads = calloc(nthreads, sizeof(threads[0])); + if (!threads) + exit((perror("calloc"), EXIT_FAILURE)); + + for (i = 0; i < nthreads; i++) { + if (pthread_create(&threads[i], NULL, + i < enable_params.npassive ? passive_thread : active_thread, &done)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + usleep(10000); // let the threads block + gettimeofday(&start, NULL); + for (i = 0; i < bench_repeat; i++) { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0)) + exit((perror("ioctl(PERF_EVENT_IOC_DISABLE)"), EXIT_FAILURE)); + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) + exit((perror("ioctl(PERF_EVENT_IOC_ENABLE)"), EXIT_FAILURE)); + } + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); + futex_wake(&done, enable_params.npassive, 0); + for (i = 0; i < nthreads; i++) + pthread_join(threads[i], NULL); + free(threads); + close(fd); + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Enabled/disabled breakpoint %d time with %d passive and %d active threads\n", + bench_repeat, enable_params.npassive, enable_params.nactive); + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + printf(" %14lf usecs/op\n", (double)result_usec / bench_repeat); + break; + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + break; + default: + fprintf(stderr, "Unknown format: %d\n", bench_format); + exit(EXIT_FAILURE); + } + return 0; +} diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index ca2d591aad8a..d66d852b90e4 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -21,9 +21,9 @@ #include <sys/resource.h> #include <sys/epoll.h> #include <sys/eventfd.h> -#include <internal/cpumap.h> #include <perf/cpumap.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -59,10 +59,10 @@ static unsigned int nested = 0; /* amount of fds to monitor, per thread */ static unsigned int nfds = 64; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats all_stats[EPOLL_NR_OPS]; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -107,7 +107,7 @@ static void nest_epollfd(void) printinfo("Nesting level(s): %d\n", nested); epollfdp = calloc(nested, sizeof(int)); - if (!epollfd) + if (!epollfdp) err(EXIT_FAILURE, "calloc"); for (i = 0; i < nested; i++) { @@ -175,12 +175,12 @@ static void *workerfn(void *arg) struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 }; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); /* Let 'em loose */ do { @@ -223,13 +223,20 @@ static void init_fdmaps(struct worker *w, int pct) static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) { pthread_attr_t thread_attr, *attrp = NULL; - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i, j; int ret = 0; + int nrcpus; + size_t size; if (!noaffinity) pthread_attr_init(&thread_attr); + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + for (i = 0; i < nthreads; i++) { struct worker *w = &worker[i]; @@ -253,22 +260,28 @@ static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) init_fdmaps(w, 50); if (!noaffinity) { - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, + size, cpuset); - ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset); - if (ret) + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } attrp = &thread_attr; } ret = pthread_create(&w->thread, attrp, workerfn, (void *)(struct worker *) w); - if (ret) + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } } + CPU_FREE(cpuset); if (!noaffinity) pthread_attr_destroy(&thread_attr); @@ -317,7 +330,7 @@ int bench_epoll_ctl(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; @@ -334,7 +347,7 @@ int bench_epoll_ctl(int argc, const char **argv) /* default to the number of CPUs */ if (!nthreads) - nthreads = cpu->nr; + nthreads = perf_cpu_map__nr(cpu); worker = calloc(nthreads, sizeof(*worker)); if (!worker) @@ -355,9 +368,9 @@ int bench_epoll_ctl(int argc, const char **argv) for (i = 0; i < EPOLL_NR_OPS; i++) init_stats(&all_stats[i]); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = nthreads; @@ -365,11 +378,11 @@ int bench_epoll_ctl(int argc, const char **argv) do_threads(worker, cpu); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); sleep(nsecs); toggle_done(0, NULL, NULL); @@ -382,9 +395,9 @@ int bench_epoll_ctl(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { unsigned long t[EPOLL_NR_OPS]; @@ -408,6 +421,11 @@ int bench_epoll_ctl(int argc, const char **argv) print_summary(); close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); return ret; errmem: err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 75dca9773186..20fe4f72b4af 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -17,7 +17,7 @@ * While the second model, enabled via --multiq option, uses multiple * queueing (which refers to one epoll instance per worker). For example, * short lived tcp connections in a high throughput httpd server will - * ditribute the accept()'ing connections across CPUs. In this case each + * distribute the accept()'ing connections across CPUs. In this case each * worker does a limited amount of processing. * * [queue A] ---> [worker] @@ -76,10 +76,10 @@ #include <sys/epoll.h> #include <sys/eventfd.h> #include <sys/types.h> -#include <internal/cpumap.h> #include <perf/cpumap.h> #include "../util/stat.h" +#include "../util/mutex.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -110,10 +110,10 @@ static bool multiq; /* use an epoll instance per thread */ /* amount of fds to monitor, per thread */ static unsigned int nfds = 64; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -190,16 +190,16 @@ static void *workerfn(void *arg) int to = nonblocking? 0 : -1; int efd = multiq ? w->epollfd : epollfd; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { /* - * Block undefinitely waiting for the IN event. + * Block indefinitely waiting for the IN event. * In order to stress the epoll_wait(2) syscall, * call it event per event, instead of a larger * batch (max)limit. @@ -292,9 +292,11 @@ static void print_summary(void) static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) { pthread_attr_t thread_attr, *attrp = NULL; - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i, j; int ret = 0, events = EPOLLIN; + int nrcpus; + size_t size; if (oneshot) events |= EPOLLONESHOT; @@ -307,6 +309,11 @@ static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) if (!noaffinity) pthread_attr_init(&thread_attr); + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + for (i = 0; i < nthreads; i++) { struct worker *w = &worker[i]; @@ -342,22 +349,28 @@ static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) } if (!noaffinity) { - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, + size, cpuset); - ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset); - if (ret) + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } attrp = &thread_attr; } ret = pthread_create(&w->thread, attrp, workerfn, (void *)(struct worker *) w); - if (ret) + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } } + CPU_FREE(cpuset); if (!noaffinity) pthread_attr_destroy(&thread_attr); @@ -407,7 +420,12 @@ static int cmpworker(const void *p1, const void *p2) struct worker *w1 = (struct worker *) p1; struct worker *w2 = (struct worker *) p2; - return w1->tid > w2->tid; + + if (w1->tid > w2->tid) + return 1; + if (w1->tid < w2->tid) + return -1; + return 0; } int bench_epoll_wait(int argc, const char **argv) @@ -431,7 +449,7 @@ int bench_epoll_wait(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; @@ -453,7 +471,7 @@ int bench_epoll_wait(int argc, const char **argv) /* default to the number of CPUs and leave one for the writer pthread */ if (!nthreads) - nthreads = cpu->nr - 1; + nthreads = perf_cpu_map__nr(cpu) - 1; worker = calloc(nthreads, sizeof(*worker)); if (!worker) { @@ -473,9 +491,9 @@ int bench_epoll_wait(int argc, const char **argv) getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = nthreads; @@ -483,11 +501,11 @@ int bench_epoll_wait(int argc, const char **argv) do_threads(worker, cpu); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); /* * At this point the workers should be blocked waiting for read events @@ -510,9 +528,9 @@ int bench_epoll_wait(int argc, const char **argv) err(EXIT_FAILURE, "pthread_join"); /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); /* sort the array back before reporting */ if (randomize) @@ -536,6 +554,11 @@ int bench_epoll_wait(int argc, const char **argv) print_summary(); close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); return ret; errmem: err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c new file mode 100644 index 000000000000..79cedcf94a39 --- /dev/null +++ b/tools/perf/bench/evlist-open-close.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include "bench.h" +#include "../util/debug.h" +#include "../util/stat.h" +#include "../util/evlist.h" +#include "../util/evsel.h" +#include "../util/strbuf.h" +#include "../util/record.h" +#include "../util/parse-events.h" +#include "internal/threadmap.h" +#include "internal/cpumap.h" +#include <linux/perf_event.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <linux/string.h> +#include <subcmd/parse-options.h> + +#define MMAP_FLUSH_DEFAULT 1 + +static int iterations = 100; +static int nr_events = 1; +static const char *event_string = "dummy"; + +static inline u64 timeval2usec(struct timeval *tv) +{ + return tv->tv_sec * USEC_PER_SEC + tv->tv_usec; +} + +static struct record_opts opts = { + .sample_time = true, + .mmap_pages = UINT_MAX, + .user_freq = UINT_MAX, + .user_interval = ULLONG_MAX, + .freq = 4000, + .target = { + .uses_mmap = true, + .default_per_cpu = true, + }, + .mmap_flush = MMAP_FLUSH_DEFAULT, + .nr_threads_synthesize = 1, + .ctl_fd = -1, + .ctl_fd_ack = -1, +}; + +static int evlist__count_evsel_fds(struct evlist *evlist) +{ + struct evsel *evsel; + int cnt = 0; + + evlist__for_each_entry(evlist, evsel) + cnt += evsel->core.threads->nr * perf_cpu_map__nr(evsel->core.cpus); + + return cnt; +} + +static struct evlist *bench__create_evlist(char *evstr) +{ + struct parse_events_error err; + struct evlist *evlist = evlist__new(); + int ret; + + if (!evlist) { + pr_err("Not enough memory to create evlist\n"); + return NULL; + } + + parse_events_error__init(&err); + ret = parse_events(evlist, evstr, &err); + if (ret) { + parse_events_error__print(&err, evstr); + parse_events_error__exit(&err); + pr_err("Run 'perf list' for a list of valid events\n"); + ret = 1; + goto out_delete_evlist; + } + parse_events_error__exit(&err); + ret = evlist__create_maps(evlist, &opts.target); + if (ret < 0) { + pr_err("Not enough memory to create thread/cpu maps\n"); + goto out_delete_evlist; + } + + evlist__config(evlist, &opts, NULL); + + return evlist; + +out_delete_evlist: + evlist__delete(evlist); + return NULL; +} + +static int bench__do_evlist_open_close(struct evlist *evlist) +{ + char sbuf[STRERR_BUFSIZE]; + int err = evlist__open(evlist); + + if (err < 0) { + pr_err("evlist__open: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + err = evlist__mmap(evlist, opts.mmap_pages); + if (err < 0) { + pr_err("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + evlist__enable(evlist); + evlist__disable(evlist); + evlist__munmap(evlist); + evlist__close(evlist); + + return 0; +} + +static int bench_evlist_open_close__run(char *evstr) +{ + // used to print statistics only + struct evlist *evlist = bench__create_evlist(evstr); + double time_average, time_stddev; + struct timeval start, end, diff; + struct stats time_stats; + u64 runtime_us; + int i, err; + + if (!evlist) + return -ENOMEM; + + init_stats(&time_stats); + + printf(" Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.user_requested_cpus)); + printf(" Number of threads:\t%d\n", evlist->core.threads->nr); + printf(" Number of events:\t%d (%d fds)\n", + evlist->core.nr_entries, evlist__count_evsel_fds(evlist)); + printf(" Number of iterations:\t%d\n", iterations); + + evlist__delete(evlist); + + for (i = 0; i < iterations; i++) { + pr_debug("Started iteration %d\n", i); + evlist = bench__create_evlist(evstr); + if (!evlist) + return -ENOMEM; + + gettimeofday(&start, NULL); + err = bench__do_evlist_open_close(evlist); + if (err) { + evlist__delete(evlist); + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = timeval2usec(&diff); + update_stats(&time_stats, runtime_us); + + evlist__delete(evlist); + pr_debug("Iteration %d took:\t%" PRIu64 "us\n", i, runtime_us); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average open-close took: %.3f usec (+- %.3f usec)\n", time_average, time_stddev); + + return 0; +} + +static char *bench__repeat_event_string(const char *evstr, int n) +{ + char sbuf[STRERR_BUFSIZE]; + struct strbuf buf; + int i, str_size = strlen(evstr), + final_size = str_size * n + n, + err = strbuf_init(&buf, final_size); + + if (err) { + pr_err("strbuf_init: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + + for (i = 0; i < n; i++) { + err = strbuf_add(&buf, evstr, str_size); + if (err) { + pr_err("strbuf_add: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + + err = strbuf_addch(&buf, i == n-1 ? '\0' : ','); + if (err) { + pr_err("strbuf_addch: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + } + + return strbuf_detach(&buf, NULL); + +out_error: + strbuf_release(&buf); + return NULL; +} + + +int bench_evlist_open_close(int argc, const char **argv) +{ + const struct option options[] = { + OPT_STRING('e', "event", &event_string, "event", + "event selector. use 'perf list' to list available events"), + OPT_INTEGER('n', "nr-events", &nr_events, + "number of dummy events to create (default 1). If used with -e, it clones those events n times (1 = no change)"), + OPT_INTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average (default=100)"), + OPT_BOOLEAN('a', "all-cpus", &opts.target.system_wide, + "system-wide collection from all CPUs"), + OPT_STRING('C', "cpu", &opts.target.cpu_list, "cpu", + "list of cpus where to open events"), + OPT_STRING('p', "pid", &opts.target.pid, "pid", + "record events on existing process id"), + OPT_STRING('t', "tid", &opts.target.tid, "tid", + "record events on existing thread id"), + OPT_STRING('u', "uid", &opts.target.uid_str, "user", "user to profile"), + OPT_BOOLEAN(0, "per-thread", &opts.target.per_thread, "use per-thread mmaps"), + OPT_END() + }; + const char *const bench_usage[] = { + "perf bench internals evlist-open-close <options>", + NULL + }; + char *evstr, errbuf[BUFSIZ]; + int err; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + err = target__validate(&opts.target); + if (err) { + target__strerror(&opts.target, err, errbuf, sizeof(errbuf)); + pr_err("%s\n", errbuf); + goto out; + } + + err = target__parse_uid(&opts.target); + if (err) { + target__strerror(&opts.target, err, errbuf, sizeof(errbuf)); + pr_err("%s", errbuf); + goto out; + } + + /* Enable ignoring missing threads when -u/-p option is defined. */ + opts.ignore_missing_thread = opts.target.uid != UINT_MAX || opts.target.pid; + + evstr = bench__repeat_event_string(event_string, nr_events); + if (!evstr) { + err = -ENOMEM; + goto out; + } + + err = bench_evlist_open_close__run(evstr); + + free(evstr); +out: + return err; +} diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c new file mode 100644 index 000000000000..7e25b0e413f6 --- /dev/null +++ b/tools/perf/bench/find-bit-bench.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark find_next_bit and related bit operations. + * + * Copyright 2020 Google LLC. + */ +#include <stdlib.h> +#include "bench.h" +#include "../util/stat.h" +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int outer_iterations = 5; +static unsigned int inner_iterations = 100000; + +static const struct option options[] = { + OPT_UINTEGER('i', "outer-iterations", &outer_iterations, + "Number of outer iterations used"), + OPT_UINTEGER('j', "inner-iterations", &inner_iterations, + "Number of inner iterations used"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench mem find_bit <options>", + NULL +}; + +static unsigned int accumulator; +static unsigned int use_of_val; + +static noinline void workload(int val) +{ + use_of_val += val; + accumulator++; +} + +#if (defined(__i386__) || defined(__x86_64__)) && defined(__GCC_ASM_FLAG_OUTPUTS__) +static bool asm_test_bit(long nr, const unsigned long *addr) +{ + bool oldbit; + + asm volatile("bt %2,%1" + : "=@ccc" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); + + return oldbit; +} +#else +#define asm_test_bit test_bit +#endif + +static int do_for_each_set_bit(unsigned int num_bits) +{ + unsigned long *to_test = bitmap_zalloc(num_bits); + struct timeval start, end, diff; + u64 runtime_us; + struct stats fb_time_stats, tb_time_stats; + double time_average, time_stddev; + unsigned int bit, i, j; + unsigned int set_bits, skip; + + init_stats(&fb_time_stats); + init_stats(&tb_time_stats); + + for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) { + bitmap_zero(to_test, num_bits); + skip = num_bits / set_bits; + for (i = 0; i < num_bits; i += skip) + __set_bit(i, to_test); + + for (i = 0; i < outer_iterations; i++) { +#ifndef NDEBUG + unsigned int old = accumulator; +#endif + + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for_each_set_bit(bit, to_test, num_bits) + workload(bit); + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&fb_time_stats, runtime_us); + +#ifndef NDEBUG + old = accumulator; +#endif + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for (bit = 0; bit < num_bits; bit++) { + if (asm_test_bit(bit, to_test)) + workload(bit); + } + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&tb_time_stats, runtime_us); + } + + printf("%d operations %d bits set of %d bits\n", + inner_iterations, set_bits, num_bits); + time_average = avg_stats(&fb_time_stats); + time_stddev = stddev_stats(&fb_time_stats); + printf(" Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + time_average = avg_stats(&tb_time_stats); + time_stddev = stddev_stats(&tb_time_stats); + printf(" Average test_bit loop took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + if (use_of_val == accumulator) /* Try to avoid compiler tricks. */ + printf("\n"); + } + bitmap_free(to_test); + return 0; +} + +int bench_mem_find_bit(int argc, const char **argv) +{ + int err = 0, i; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + for (i = 1; i <= 2048; i <<= 1) + do_for_each_set_bit(i); + + return err; +} diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 915bf3da7ce2..d2d6d7f3ea33 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -20,9 +20,11 @@ #include <linux/kernel.h> #include <linux/zalloc.h> #include <sys/time.h> -#include <internal/cpumap.h> +#include <sys/mman.h> +#include <sys/prctl.h> #include <perf/cpumap.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -30,18 +32,14 @@ #include <err.h> -static unsigned int nthreads = 0; -static unsigned int nsecs = 10; -/* amount of futexes per thread */ -static unsigned int nfutexes = 1024; -static bool fshared = false, done = false, silent = false; +static bool done = false; static int futex_flag = 0; struct timeval bench__start, bench__end, bench__runtime; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -50,12 +48,21 @@ struct worker { unsigned long ops; }; +static struct bench_futex_parameters params = { + .nfutexes = 1024, + .runtime = 10, + .nbuckets = -1, +}; + static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), - OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), - OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_BOOLEAN( 'I', "immutable", ¶ms.buckets_immutable, "Make the hash buckets immutable"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", ¶ms.runtime, "Specify runtime (in seconds)"), + OPT_UINTEGER('f', "futexes", ¶ms.nfutexes, "Specify amount of futexes per threads"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), OPT_END() }; @@ -71,15 +78,15 @@ static void *workerfn(void *arg) unsigned int i; unsigned long ops = w->ops; /* avoid cacheline bouncing */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { - for (i = 0; i < nfutexes; i++, ops++) { + for (i = 0; i < params.nfutexes; i++, ops++) { /* * We want the futex calls to fail in order to stress * the hashing of uaddr and not measure other steps, @@ -87,7 +94,7 @@ static void *workerfn(void *arg) * the critical region protected by hb->lock. */ ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag); - if (!silent && + if (!params.silent && (!ret || errno != EAGAIN || errno != EWOULDBLOCK)) warn("Non-expected futex return call"); } @@ -113,19 +120,22 @@ static void print_summary(void) double stddev = stddev_stats(&throughput_stats); printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", - !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), (int)bench__runtime.tv_sec); + futex_print_nbuckets(¶ms); } int bench_futex_hash(int argc, const char **argv) { int ret = 0; - cpu_set_t cpuset; + cpu_set_t *cpuset; struct sigaction act; unsigned int i; pthread_attr_t thread_attr; struct worker *worker = NULL; struct perf_cpu_map *cpu; + int nrcpus; + size_t size; argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0); if (argc) { @@ -133,7 +143,7 @@ int bench_futex_hash(int argc, const char **argv) exit(EXIT_FAILURE); } - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; @@ -142,80 +152,96 @@ int bench_futex_hash(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - if (!nthreads) /* default to the number of CPUs */ - nthreads = cpu->nr; + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) /* default to the number of CPUs */ + params.nthreads = perf_cpu_map__nr(cpu); - worker = calloc(nthreads, sizeof(*worker)); + worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) goto errmem; - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; + futex_set_nbuckets_param(¶ms); printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", - getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs); + getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); - threads_starting = nthreads; + threads_starting = params.nthreads; pthread_attr_init(&thread_attr); gettimeofday(&bench__start, NULL); - for (i = 0; i < nthreads; i++) { + + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + for (i = 0; i < params.nthreads; i++) { worker[i].tid = i; - worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex)); + worker[i].futex = calloc(params.nfutexes, sizeof(*worker[i].futex)); if (!worker[i].futex) goto errmem; - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + CPU_ZERO_S(size, cpuset); - ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset); - if (ret) + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); - + } ret = pthread_create(&worker[i].thread, &thread_attr, workerfn, (void *)(struct worker *) &worker[i]); - if (ret) + if (ret) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } } + CPU_FREE(cpuset); pthread_attr_destroy(&thread_attr); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); - sleep(nsecs); + sleep(params.runtime); toggle_done(0, NULL, NULL); - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(worker[i].thread, NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { unsigned long t = bench__runtime.tv_sec > 0 ? worker[i].ops / bench__runtime.tv_sec : 0; update_stats(&throughput_stats, t); - if (!silent) { - if (nfutexes == 1) + if (!params.silent) { + if (params.nfutexes == 1) printf("[thread %2d] futex: %p [ %ld ops/sec ]\n", worker[i].tid, &worker[i].futex[0], t); else printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n", worker[i].tid, &worker[i].futex[0], - &worker[i].futex[nfutexes-1], t); + &worker[i].futex[params.nfutexes-1], t); } zfree(&worker[i].futex); diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index bb25d8beb3b8..5144a158512c 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -8,13 +8,13 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/zalloc.h> #include <errno.h> -#include <internal/cpumap.h> #include <perf/cpumap.h> #include "bench.h" #include "futex.h" @@ -22,6 +22,7 @@ #include <err.h> #include <stdlib.h> #include <sys/time.h> +#include <sys/mman.h> struct worker { int tid; @@ -32,22 +33,27 @@ struct worker { static u_int32_t global_futex = 0; static struct worker *worker; -static unsigned int nsecs = 10; -static bool silent = false, multi = false; -static bool done = false, fshared = false; -static unsigned int nthreads = 0; +static bool done = false; static int futex_flag = 0; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; + +static struct bench_futex_parameters params = { + .nbuckets = -1, + .runtime = 10, +}; static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), - OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), - OPT_BOOLEAN( 'M', "multi", &multi, "Use multiple futexes"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_BOOLEAN( 'I', "immutable", ¶ms.buckets_immutable, "Make the hash buckets immutable"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", ¶ms.runtime, "Specify runtime (in seconds)"), + OPT_BOOLEAN( 'M', "multi", ¶ms.multi, "Use multiple futexes"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), OPT_END() }; @@ -62,8 +68,9 @@ static void print_summary(void) double stddev = stddev_stats(&throughput_stats); printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", - !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), (int)bench__runtime.tv_sec); + futex_print_nbuckets(¶ms); } static void toggle_done(int sig __maybe_unused, @@ -81,12 +88,12 @@ static void *workerfn(void *arg) struct worker *w = (struct worker *) arg; unsigned long ops = w->ops; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { int ret; @@ -94,7 +101,7 @@ static void *workerfn(void *arg) ret = futex_lock_pi(w->futex, NULL, futex_flag); if (ret) { /* handle lock acquisition */ - if (!silent) + if (!params.silent) warn("thread %d: Could not lock pi-lock for %p (%d)", w->tid, w->futex, ret); if (done) @@ -105,7 +112,7 @@ static void *workerfn(void *arg) usleep(1); ret = futex_unlock_pi(w->futex, futex_flag); - if (ret && !silent) + if (ret && !params.silent) warn("thread %d: Could not unlock pi-lock for %p (%d)", w->tid, w->futex, ret); ops++; /* account for thread's share of work */ @@ -115,33 +122,47 @@ static void *workerfn(void *arg) return NULL; } -static void create_threads(struct worker *w, pthread_attr_t thread_attr, - struct perf_cpu_map *cpu) +static void create_threads(struct worker *w, struct perf_cpu_map *cpu) { - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; + + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); - threads_starting = nthreads; + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; - for (i = 0; i < nthreads; i++) { + pthread_attr_init(&thread_attr); worker[i].tid = i; - if (multi) { + if (params.multi) { worker[i].futex = calloc(1, sizeof(u_int32_t)); if (!worker[i].futex) err(EXIT_FAILURE, "calloc"); } else worker[i].futex = &global_futex; - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); - if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset)) + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } - if (pthread_create(&w[i].thread, &thread_attr, workerfn, &worker[i])) + if (pthread_create(&w[i].thread, &thread_attr, workerfn, &worker[i])) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); } + CPU_FREE(cpuset); } int bench_futex_lock_pi(int argc, const char **argv) @@ -149,14 +170,13 @@ int bench_futex_lock_pi(int argc, const char **argv) int ret = 0; unsigned int i; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0); if (argc) goto err; - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); @@ -165,67 +185,72 @@ int bench_futex_lock_pi(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - if (!nthreads) - nthreads = cpu->nr; + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); - worker = calloc(nthreads, sizeof(*worker)); + worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) err(EXIT_FAILURE, "calloc"); - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; printf("Run summary [PID %d]: %d threads doing pi lock/unlock pairing for %d secs.\n\n", - getpid(), nthreads, nsecs); + getpid(), params.nthreads, params.runtime); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + futex_set_nbuckets_param(¶ms); - threads_starting = nthreads; - pthread_attr_init(&thread_attr); + threads_starting = params.nthreads; gettimeofday(&bench__start, NULL); - create_threads(worker, thread_attr, cpu); - pthread_attr_destroy(&thread_attr); + create_threads(worker, cpu); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); - sleep(nsecs); + sleep(params.runtime); toggle_done(0, NULL, NULL); - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(worker[i].thread, NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { unsigned long t = bench__runtime.tv_sec > 0 ? worker[i].ops / bench__runtime.tv_sec : 0; update_stats(&throughput_stats, t); - if (!silent) + if (!params.silent) printf("[thread %3d] futex: %p [ %ld ops/sec ]\n", worker[i].tid, worker[i].futex, t); - if (multi) + if (params.multi) zfree(&worker[i].futex); } print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; err: usage_with_options(bench_futex_lock_pi_usage, options); diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index 7a15c2e61022..a2f91ee1950b 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -6,7 +6,8 @@ * on futex2, N at a time. * * This program is particularly useful to measure the latency of nthread - * requeues without waking up any tasks -- thus mimicking a regular futex_wait. + * requeues without waking up any tasks (in the non-pi case) -- thus + * mimicking a regular futex_wait. */ /* For the CLR_() macros */ @@ -14,13 +15,13 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/time64.h> #include <errno.h> -#include <internal/cpumap.h> #include <perf/cpumap.h> #include "bench.h" #include "futex.h" @@ -28,28 +29,38 @@ #include <err.h> #include <stdlib.h> #include <sys/time.h> +#include <sys/mman.h> static u_int32_t futex1 = 0, futex2 = 0; -/* - * How many tasks to requeue at a time. - * Default to 1 in order to make the kernel work more. - */ -static unsigned int nrequeue = 1; - static pthread_t *worker; -static bool done = false, silent = false, fshared = false; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; static struct stats requeuetime_stats, requeued_stats; -static unsigned int threads_starting, nthreads = 0; +static unsigned int threads_starting; static int futex_flag = 0; +static struct bench_futex_parameters params = { + .nbuckets = -1, + /* + * How many tasks to requeue at a time. + * Default to 1 in order to make the kernel work more. + */ + .nrequeue = 1, +}; + static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), - OPT_UINTEGER('q', "nrequeue", &nrequeue, "Specify amount of threads to requeue at once"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_BOOLEAN( 'I', "immutable", ¶ms.buckets_immutable, "Make the hash buckets immutable"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('q', "nrequeue", ¶ms.nrequeue, "Specify amount of threads to requeue at once"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_BOOLEAN( 'B', "broadcast", ¶ms.broadcast, "Requeue all threads at once"), + OPT_BOOLEAN( 'p', "pi", ¶ms.pi, "Use PI-aware variants of FUTEX_CMP_REQUEUE"), + OPT_END() }; @@ -66,43 +77,87 @@ static void print_summary(void) printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n", requeued_avg, - nthreads, + params.nthreads, requeuetime_avg / USEC_PER_MSEC, rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); + futex_print_nbuckets(¶ms); } static void *workerfn(void *arg __maybe_unused) { - pthread_mutex_lock(&thread_lock); + int ret; + + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + while (1) { + if (!params.pi) { + ret = futex_wait(&futex1, 0, NULL, futex_flag); + if (!ret) + break; + + if (ret && errno != EAGAIN) { + if (!params.silent) + warnx("futex_wait"); + break; + } + } else { + ret = futex_wait_requeue_pi(&futex1, 0, &futex2, + NULL, futex_flag); + if (!ret) { + /* got the lock at futex2 */ + futex_unlock_pi(&futex2, futex_flag); + break; + } + + if (ret && errno != EAGAIN) { + if (!params.silent) + warnx("futex_wait_requeue_pi"); + break; + } + } + } - futex_wait(&futex1, 0, NULL, futex_flag); return NULL; } -static void block_threads(pthread_t *w, - pthread_attr_t thread_attr, struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; + + threads_starting = params.nthreads; - threads_starting = nthreads; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); /* create and block all threads */ - for (i = 0; i < nthreads; i++) { - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; - if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset)) + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } - if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); } + CPU_FREE(cpuset); } static void toggle_done(int sig __maybe_unused, @@ -117,14 +172,13 @@ int bench_futex_requeue(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0); if (argc) goto err; - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "cpu_map__new"); @@ -133,55 +187,83 @@ int bench_futex_requeue(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - if (!nthreads) - nthreads = cpu->nr; + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); - worker = calloc(nthreads, sizeof(*worker)); + worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) err(EXIT_FAILURE, "calloc"); - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; - if (nrequeue > nthreads) - nrequeue = nthreads; + if (params.nrequeue > params.nthreads) + params.nrequeue = params.nthreads; - printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %p), " - "%d at a time.\n\n", getpid(), nthreads, - fshared ? "shared":"private", &futex1, &futex2, nrequeue); + if (params.broadcast) + params.nrequeue = params.nthreads; + + futex_set_nbuckets_param(¶ms); + + printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), " + "%d at a time.\n\n", getpid(), params.nthreads, + params.fshared ? "shared":"private", &futex1, + params.pi ? "PI ": "", &futex2, params.nrequeue); init_stats(&requeued_stats); init_stats(&requeuetime_stats); - pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { - unsigned int nrequeued = 0; + unsigned int nrequeued = 0, wakeups = 0; struct timeval start, end, runtime; /* create, launch & block all threads */ - block_threads(worker, thread_attr, cpu); + block_threads(worker, cpu); /* make sure all threads are already blocked */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); usleep(100000); /* Ok, all threads are patiently blocked, start requeueing */ gettimeofday(&start, NULL); - while (nrequeued < nthreads) { + while (nrequeued < params.nthreads) { + int r; + /* - * Do not wakeup any tasks blocked on futex1, allowing - * us to really measure futex_wait functionality. + * For the regular non-pi case, do not wakeup any tasks + * blocked on futex1, allowing us to really measure + * futex_wait functionality. For the PI case the first + * waiter is always awoken. */ - nrequeued += futex_cmp_requeue(&futex1, 0, &futex2, 0, - nrequeue, futex_flag); + if (!params.pi) { + r = futex_cmp_requeue(&futex1, 0, &futex2, 0, + params.nrequeue, + futex_flag); + } else { + r = futex_cmp_requeue_pi(&futex1, 0, &futex2, + params.nrequeue, + futex_flag); + wakeups++; /* assume no error */ + } + + if (r < 0) + err(EXIT_FAILURE, "couldn't requeue from %p to %p", + &futex1, &futex2); + + nrequeued += r; } gettimeofday(&end, NULL); @@ -190,17 +272,32 @@ int bench_futex_requeue(int argc, const char **argv) update_stats(&requeued_stats, nrequeued); update_stats(&requeuetime_stats, runtime.tv_usec); - if (!silent) { - printf("[Run %d]: Requeued %d of %d threads in %.4f ms\n", - j + 1, nrequeued, nthreads, runtime.tv_usec / (double)USEC_PER_MSEC); + if (!params.silent) { + if (!params.pi) + printf("[Run %d]: Requeued %d of %d threads in " + "%.4f ms\n", j + 1, nrequeued, + params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); + else { + nrequeued -= wakeups; + printf("[Run %d]: Awoke and Requeued (%d+%d) of " + "%d threads in %.4f ms\n", + j + 1, wakeups, nrequeued, + params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); + } + } - /* everybody should be blocked on futex2, wake'em up */ - nrequeued = futex_wake(&futex2, nrequeued, futex_flag); - if (nthreads != nrequeued) - warnx("couldn't wakeup all tasks (%d/%d)", nrequeued, nthreads); + if (!params.pi) { + /* everybody should be blocked on futex2, wake'em up */ + nrequeued = futex_wake(&futex2, nrequeued, futex_flag); + if (params.nthreads != nrequeued) + warnx("couldn't wakeup all tasks (%d/%d)", + nrequeued, params.nthreads); + } - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(worker[i], NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); @@ -208,14 +305,14 @@ int bench_futex_requeue(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; err: usage_with_options(bench_futex_requeue_usage, options); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index cd2b81a845ac..ee66482c29fd 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -10,6 +10,7 @@ #include "bench.h" #include <linux/compiler.h> #include "../util/debug.h" +#include "../util/mutex.h" #ifndef HAVE_PTHREAD_BARRIER int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused) @@ -29,12 +30,12 @@ int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe #include <linux/time64.h> #include <errno.h> #include "futex.h" -#include <internal/cpumap.h> #include <perf/cpumap.h> #include <err.h> #include <stdlib.h> #include <sys/time.h> +#include <sys/mman.h> struct thread_data { pthread_t worker; @@ -48,20 +49,27 @@ static unsigned int nwakes = 1; static u_int32_t futex = 0; static pthread_t *blocked_worker; -static bool done = false, silent = false, fshared = false; -static unsigned int nblocked_threads = 0, nwaking_threads = 0; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; static pthread_barrier_t barrier; static struct stats waketime_stats, wakeup_stats; static unsigned int threads_starting; static int futex_flag = 0; +static struct bench_futex_parameters params = { + .nbuckets = -1, +}; + static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"), - OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_BOOLEAN( 'I', "immutable", ¶ms.buckets_immutable, "Make the hash buckets immutable"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakers", ¶ms.nwakes, "Specify amount of waking threads"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_END() }; @@ -91,16 +99,18 @@ static void *waking_workerfn(void *arg) return NULL; } -static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) +static void wakeup_threads(struct thread_data *td) { unsigned int i; + pthread_attr_t thread_attr; + pthread_attr_init(&thread_attr); pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); - pthread_barrier_init(&barrier, NULL, nwaking_threads + 1); + pthread_barrier_init(&barrier, NULL, params.nwakes + 1); /* create and block all threads */ - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { /* * Thread creation order will impact per-thread latency * as it will affect the order to acquire the hb spinlock. @@ -113,21 +123,22 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) pthread_barrier_wait(&barrier); - for (i = 0; i < nwaking_threads; i++) + for (i = 0; i < params.nwakes; i++) if (pthread_join(td[i].worker, NULL)) err(EXIT_FAILURE, "pthread_join"); pthread_barrier_destroy(&barrier); + pthread_attr_destroy(&thread_attr); } static void *blocked_workerfn(void *arg __maybe_unused) { - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); while (1) { /* handle spurious wakeups */ if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) @@ -138,25 +149,39 @@ static void *blocked_workerfn(void *arg __maybe_unused) return NULL; } -static void block_threads(pthread_t *w, pthread_attr_t thread_attr, - struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; - threads_starting = nblocked_threads; + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); /* create and block all threads */ - for (i = 0; i < nblocked_threads; i++) { - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); - if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset)) + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } - if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL)) + if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); } + CPU_FREE(cpuset); } static void print_run(struct thread_data *waking_worker, unsigned int run_num) @@ -168,7 +193,7 @@ static void print_run(struct thread_data *waking_worker, unsigned int run_num) init_stats(&__wakeup_stats); init_stats(&__waketime_stats); - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec); update_stats(&__wakeup_stats, waking_worker[i].nwoken); } @@ -179,7 +204,7 @@ static void print_run(struct thread_data *waking_worker, unsigned int run_num) printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) " "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg, - nblocked_threads, waketime_avg / USEC_PER_MSEC, + params.nthreads, waketime_avg / USEC_PER_MSEC, rel_stddev_stats(waketime_stddev, waketime_avg)); } @@ -194,9 +219,10 @@ static void print_summary(void) printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n", wakeup_avg, - nblocked_threads, + params.nthreads, waketime_avg / USEC_PER_MSEC, rel_stddev_stats(waketime_stddev, waketime_avg)); + futex_print_nbuckets(¶ms); } @@ -204,7 +230,7 @@ static void do_run_stats(struct thread_data *waking_worker) { unsigned int i; - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec); update_stats(&wakeup_stats, waking_worker[i].nwoken); } @@ -223,7 +249,6 @@ int bench_futex_wake_parallel(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct thread_data *waking_worker; struct perf_cpu_map *cpu; @@ -239,87 +264,94 @@ int bench_futex_wake_parallel(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - cpu = perf_cpu_map__new(NULL); + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); - if (!nblocked_threads) - nblocked_threads = cpu->nr; + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); /* some sanity checks */ - if (nwaking_threads > nblocked_threads || !nwaking_threads) - nwaking_threads = nblocked_threads; + if (params.nwakes > params.nthreads || + !params.nwakes) + params.nwakes = params.nthreads; - if (nblocked_threads % nwaking_threads) + if (params.nthreads % params.nwakes) errx(EXIT_FAILURE, "Must be perfectly divisible"); /* * Each thread will wakeup nwakes tasks in * a single futex_wait call. */ - nwakes = nblocked_threads/nwaking_threads; + nwakes = params.nthreads/params.nwakes; - blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker)); + blocked_worker = calloc(params.nthreads, sizeof(*blocked_worker)); if (!blocked_worker) err(EXIT_FAILURE, "calloc"); - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; + futex_set_nbuckets_param(¶ms); + printf("Run summary [PID %d]: blocking on %d threads (at [%s] " "futex %p), %d threads waking up %d at a time.\n\n", - getpid(), nblocked_threads, fshared ? "shared":"private", - &futex, nwaking_threads, nwakes); + getpid(), params.nthreads, params.fshared ? "shared":"private", + &futex, params.nwakes, nwakes); init_stats(&wakeup_stats); init_stats(&waketime_stats); - pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { - waking_worker = calloc(nwaking_threads, sizeof(*waking_worker)); + waking_worker = calloc(params.nwakes, sizeof(*waking_worker)); if (!waking_worker) err(EXIT_FAILURE, "calloc"); /* create, launch & block all threads */ - block_threads(blocked_worker, thread_attr, cpu); + block_threads(blocked_worker, cpu); /* make sure all threads are already blocked */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); - usleep(100000); + usleep(200000); /* Ok, all threads are patiently blocked, start waking folks up */ - wakeup_threads(waking_worker, thread_attr); + wakeup_threads(waking_worker); - for (i = 0; i < nblocked_threads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(blocked_worker[i], NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); } do_run_stats(waking_worker); - if (!silent) + if (!params.silent) print_run(waking_worker, j); free(waking_worker); } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); print_summary(); free(blocked_worker); + perf_cpu_map__put(cpu); return ret; } #endif /* HAVE_PTHREAD_BARRIER */ diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 2dfcef3e371e..8d6107f7cd94 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -14,13 +14,13 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/time64.h> #include <errno.h> -#include <internal/cpumap.h> #include <perf/cpumap.h> #include "bench.h" #include "futex.h" @@ -28,29 +28,37 @@ #include <err.h> #include <stdlib.h> #include <sys/time.h> +#include <sys/mman.h> /* all threads will block on the same futex */ static u_int32_t futex1 = 0; -/* - * How many wakeups to do at a time. - * Default to 1 in order to make the kernel work more. - */ -static unsigned int nwakes = 1; - -pthread_t *worker; -static bool done = false, silent = false, fshared = false; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static pthread_t *worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; static struct stats waketime_stats, wakeup_stats; -static unsigned int threads_starting, nthreads = 0; +static unsigned int threads_starting; static int futex_flag = 0; +static struct bench_futex_parameters params = { + .nbuckets = -1, + /* + * How many wakeups to do at a time. + * Default to 1 in order to make the kernel work more. + */ + .nwakes = 1, +}; + static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), - OPT_UINTEGER('w', "nwakes", &nwakes, "Specify amount of threads to wake at once"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_BOOLEAN( 'I', "immutable", ¶ms.buckets_immutable, "Make the hash buckets immutable"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakes", ¶ms.nwakes, "Specify amount of threads to wake at once"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_END() }; @@ -61,12 +69,12 @@ static const char * const bench_futex_wake_usage[] = { static void *workerfn(void *arg __maybe_unused) { - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); while (1) { if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR) @@ -85,30 +93,44 @@ static void print_summary(void) printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n", wakeup_avg, - nthreads, + params.nthreads, waketime_avg / USEC_PER_MSEC, rel_stddev_stats(waketime_stddev, waketime_avg)); + futex_print_nbuckets(¶ms); } -static void block_threads(pthread_t *w, - pthread_attr_t thread_attr, struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { - cpu_set_t cpuset; + cpu_set_t *cpuset; unsigned int i; + size_t size; + int nrcpus = cpu__max_cpu().cpu; + threads_starting = params.nthreads; - threads_starting = nthreads; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); /* create and block all threads */ - for (i = 0; i < nthreads; i++) { - CPU_ZERO(&cpuset); - CPU_SET(cpu->map[i % cpu->nr], &cpuset); + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; - if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset)) + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } - if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) { + CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); } + CPU_FREE(cpuset); } static void toggle_done(int sig __maybe_unused, @@ -123,7 +145,6 @@ int bench_futex_wake(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0); @@ -132,7 +153,7 @@ int bench_futex_wake(int argc, const char **argv) exit(EXIT_FAILURE); } - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); @@ -141,59 +162,66 @@ int bench_futex_wake(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - if (!nthreads) - nthreads = cpu->nr; + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); - worker = calloc(nthreads, sizeof(*worker)); + worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) err(EXIT_FAILURE, "calloc"); - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; printf("Run summary [PID %d]: blocking on %d threads (at [%s] futex %p), " "waking up %d at a time.\n\n", - getpid(), nthreads, fshared ? "shared":"private", &futex1, nwakes); + getpid(), params.nthreads, params.fshared ? "shared":"private", + &futex1, params.nwakes); init_stats(&wakeup_stats); init_stats(&waketime_stats); - pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { unsigned int nwoken = 0; struct timeval start, end, runtime; /* create, launch & block all threads */ - block_threads(worker, thread_attr, cpu); + block_threads(worker, cpu); /* make sure all threads are already blocked */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); usleep(100000); /* Ok, all threads are patiently blocked, start waking folks up */ gettimeofday(&start, NULL); - while (nwoken != nthreads) - nwoken += futex_wake(&futex1, nwakes, futex_flag); + while (nwoken != params.nthreads) + nwoken += futex_wake(&futex1, + params.nwakes, futex_flag); gettimeofday(&end, NULL); timersub(&end, &start, &runtime); update_stats(&wakeup_stats, nwoken); update_stats(&waketime_stats, runtime.tv_usec); - if (!silent) { + if (!params.silent) { printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n", - j + 1, nwoken, nthreads, runtime.tv_usec / (double)USEC_PER_MSEC); + j + 1, nwoken, params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); } - for (i = 0; i < nthreads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(worker[i], NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); @@ -202,13 +230,13 @@ int bench_futex_wake(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; } diff --git a/tools/perf/bench/futex.c b/tools/perf/bench/futex.c new file mode 100644 index 000000000000..4c4fee107e59 --- /dev/null +++ b/tools/perf/bench/futex.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <err.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/prctl.h> + +#include "futex.h" + +#ifndef PR_FUTEX_HASH +#define PR_FUTEX_HASH 78 +# define PR_FUTEX_HASH_SET_SLOTS 1 +# define FH_FLAG_IMMUTABLE (1ULL << 0) +# define PR_FUTEX_HASH_GET_SLOTS 2 +# define PR_FUTEX_HASH_GET_IMMUTABLE 3 +#endif // PR_FUTEX_HASH + +void futex_set_nbuckets_param(struct bench_futex_parameters *params) +{ + unsigned long flags; + int ret; + + if (params->nbuckets < 0) + return; + + flags = params->buckets_immutable ? FH_FLAG_IMMUTABLE : 0; + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, params->nbuckets, flags); + if (ret) { + printf("Requesting %d hash buckets failed: %d/%m\n", + params->nbuckets, ret); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } +} + +void futex_print_nbuckets(struct bench_futex_parameters *params) +{ + char *futex_hash_mode; + int ret; + + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); + if (params->nbuckets >= 0) { + if (ret != params->nbuckets) { + if (ret < 0) { + printf("Can't query number of buckets: %m\n"); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } + printf("Requested number of hash buckets does not currently used.\n"); + printf("Requested: %d in usage: %d\n", params->nbuckets, ret); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } + if (params->nbuckets == 0) { + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); + } else { + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE); + if (ret < 0) { + printf("Can't check if the hash is immutable: %m\n"); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } + ret = asprintf(&futex_hash_mode, "Futex hashing: %d hash buckets %s", + params->nbuckets, + ret == 1 ? "(immutable)" : ""); + } + } else { + if (ret <= 0) { + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); + } else { + ret = asprintf(&futex_hash_mode, "Futex hashing: auto resized to %d buckets", + ret); + } + } + if (ret < 0) + err(EXIT_FAILURE, "ENOMEM, futex_hash_mode"); + printf("%s\n", futex_hash_mode); + free(futex_hash_mode); +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index 31b53cc7d5bc..9c9a73f9d865 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -13,28 +13,53 @@ #include <sys/types.h> #include <linux/futex.h> +struct bench_futex_parameters { + bool silent; + bool fshared; + bool mlockall; + bool multi; /* lock-pi */ + bool pi; /* requeue-pi */ + bool broadcast; /* requeue */ + unsigned int runtime; /* seconds*/ + unsigned int nthreads; + unsigned int nfutexes; + unsigned int nwakes; + unsigned int nrequeue; + int nbuckets; + bool buckets_immutable; +}; + /** - * futex() - SYS_futex syscall wrapper + * futex_syscall() - SYS_futex syscall wrapper * @uaddr: address of first futex * @op: futex op code * @val: typically expected value of uaddr, but varies by op * @timeout: typically an absolute struct timespec (except where noted * otherwise). Overloaded by some ops - * @uaddr2: address of second futex for some ops\ + * @uaddr2: address of second futex for some ops * @val3: varies by op * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG * - * futex() is used by all the following futex op wrappers. It can also be + * futex_syscall() is used by all the following futex op wrappers. It can also be * used for misuse and abuse testing. Generally, the specific op wrappers - * should be used instead. It is a macro instead of an static inline function as - * some of the types over overloaded (timeout is used for nr_requeue for - * example). + * should be used instead. * * These argument descriptions are the defaults for all * like-named arguments in the following wrappers except where noted below. */ -#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ - syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) +static inline int +futex_syscall(volatile u_int32_t *uaddr, int op, u_int32_t val, struct timespec *timeout, + volatile u_int32_t *uaddr2, int val3, int opflags) +{ + return syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3); +} + +static inline int +futex_syscall_nr_requeue(volatile u_int32_t *uaddr, int op, u_int32_t val, int nr_requeue, + volatile u_int32_t *uaddr2, int val3, int opflags) +{ + return syscall(SYS_futex, uaddr, op | opflags, val, nr_requeue, uaddr2, val3); +} /** * futex_wait() - block on uaddr with optional timeout @@ -43,7 +68,7 @@ static inline int futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) { - return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); + return futex_syscall(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); } /** @@ -53,7 +78,7 @@ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflag static inline int futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) { - return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); + return futex_syscall(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); } /** @@ -62,7 +87,7 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) static inline int futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) { - return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); + return futex_syscall(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); } /** @@ -71,19 +96,56 @@ futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) static inline int futex_unlock_pi(u_int32_t *uaddr, int opflags) { - return futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags); + return futex_syscall(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags); } /** * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 * @nr_wake: wake up to this many tasks -* @nr_requeue: requeue up to this many tasks +* @nr_requeue: requeue up to this many tasks */ static inline int futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake, int nr_requeue, int opflags) { - return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, - val, opflags); + return futex_syscall_nr_requeue(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, + val, opflags); +} + +/** + * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2 + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * + * This is the first half of the requeue_pi mechanism. It shall always be + * paired with futex_cmp_requeue_pi(). + */ +static inline int +futex_wait_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, + struct timespec *timeout, int opflags) +{ + return futex_syscall(uaddr, FUTEX_WAIT_REQUEUE_PI, val, timeout, uaddr2, 0, + opflags); +} + +/** + * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2 + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * @nr_requeue: requeue up to this many tasks + * + * This is the second half of the requeue_pi mechanism. It shall always be + * paired with futex_wait_requeue_pi(). The first waker is always awoken. + */ +static inline int +futex_cmp_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, + int nr_requeue, int opflags) +{ + return futex_syscall_nr_requeue(uaddr, FUTEX_CMP_REQUEUE_PI, 1, nr_requeue, uaddr2, + val, opflags); } + +void futex_set_nbuckets_param(struct bench_futex_parameters *params); +void futex_print_nbuckets(struct bench_futex_parameters *params); + #endif /* _FUTEX_H */ diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c new file mode 100644 index 000000000000..f55c07e4be94 --- /dev/null +++ b/tools/perf/bench/inject-buildid.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdlib.h> +#include <stddef.h> +#include <ftw.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <linux/list.h> +#include <linux/err.h> +#include <linux/zalloc.h> +#include <internal/lib.h> +#include <subcmd/parse-options.h> + +#include "bench.h" +#include "util/data.h" +#include "util/stat.h" +#include "util/debug.h" +#include "util/symbol.h" +#include "util/session.h" +#include "util/build-id.h" +#include "util/sample.h" +#include "util/synthetic-events.h" + +#define MMAP_DEV_MAJOR 8 +#define DSO_MMAP_RATIO 4 + +static unsigned int iterations = 100; +static unsigned int nr_mmaps = 100; +static unsigned int nr_samples = 100; /* samples per mmap */ + +static u64 bench_sample_type; +static u16 bench_id_hdr_size; + +struct bench_data { + int pid; + int input_pipe[2]; + int output_pipe[2]; + pthread_t th; +}; + +struct bench_dso { + struct list_head list; + char *name; + int ino; +}; + +static int nr_dsos; +static struct bench_dso *dsos; + +extern int main(int argc, const char **argv); + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average (default: 100)"), + OPT_UINTEGER('m', "nr-mmaps", &nr_mmaps, + "Number of mmap events for each iteration (default: 100)"), + OPT_UINTEGER('n', "nr-samples", &nr_samples, + "Number of sample events per mmap event (default: 100)"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show iteration count, DSO name, etc)"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals inject-build-id <options>", + NULL +}; + +/* + * Helper for collect_dso that adds the given file as a dso to dso_list + * if it contains a build-id. Stops after collecting 4 times more than + * we need (for MMAP2 events). + */ +static int add_dso(const char *fpath, const struct stat *sb __maybe_unused, + int typeflag, struct FTW *ftwbuf __maybe_unused) +{ + struct bench_dso *dso = &dsos[nr_dsos]; + struct build_id bid; + + if (typeflag == FTW_D || typeflag == FTW_SL) + return 0; + + if (filename__read_build_id(fpath, &bid) < 0) + return 0; + + dso->name = realpath(fpath, NULL); + if (dso->name == NULL) + return -1; + + dso->ino = nr_dsos++; + pr_debug2(" Adding DSO: %s\n", fpath); + + /* stop if we collected enough DSOs */ + if ((unsigned int)nr_dsos == DSO_MMAP_RATIO * nr_mmaps) + return 1; + + return 0; +} + +static void collect_dso(void) +{ + dsos = calloc(nr_mmaps * DSO_MMAP_RATIO, sizeof(*dsos)); + if (dsos == NULL) { + printf(" Memory allocation failed\n"); + exit(1); + } + + if (nftw("/usr/lib/", add_dso, 10, FTW_PHYS) < 0) + return; + + pr_debug(" Collected %d DSOs\n", nr_dsos); +} + +static void release_dso(void) +{ + int i; + + for (i = 0; i < nr_dsos; i++) { + struct bench_dso *dso = &dsos[i]; + + zfree(&dso->name); + } + free(dsos); +} + +/* Fake address used by mmap and sample events */ +static u64 dso_map_addr(struct bench_dso *dso) +{ + return 0x400000ULL + dso->ino * 8192ULL; +} + +static ssize_t synthesize_attr(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.attr) + sizeof(u64)); + + event.header.type = PERF_RECORD_HEADER_ATTR; + event.header.size = sizeof(event.attr) + sizeof(u64); + + event.attr.attr.type = PERF_TYPE_SOFTWARE; + event.attr.attr.config = PERF_COUNT_SW_TASK_CLOCK; + event.attr.attr.exclude_kernel = 1; + event.attr.attr.sample_id_all = 1; + event.attr.attr.sample_type = bench_sample_type; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_fork(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.fork) + bench_id_hdr_size); + + event.header.type = PERF_RECORD_FORK; + event.header.misc = PERF_RECORD_MISC_FORK_EXEC; + event.header.size = sizeof(event.fork) + bench_id_hdr_size; + + event.fork.ppid = 1; + event.fork.ptid = 1; + event.fork.pid = data->pid; + event.fork.tid = data->pid; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_mmap(struct bench_data *data, struct bench_dso *dso, u64 timestamp) +{ + union perf_event event; + size_t len = offsetof(struct perf_record_mmap2, filename); + u64 *id_hdr_ptr = (void *)&event; + int ts_idx; + + len += roundup(strlen(dso->name) + 1, 8) + bench_id_hdr_size; + + memset(&event, 0, min(len, sizeof(event.mmap2))); + + event.header.type = PERF_RECORD_MMAP2; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = len; + + event.mmap2.pid = data->pid; + event.mmap2.tid = data->pid; + event.mmap2.maj = MMAP_DEV_MAJOR; + event.mmap2.ino = dso->ino; + + strcpy(event.mmap2.filename, dso->name); + + event.mmap2.start = dso_map_addr(dso); + event.mmap2.len = 4096; + event.mmap2.prot = PROT_EXEC; + + if (len > sizeof(event.mmap2)) { + /* write mmap2 event first */ + if (writen(data->input_pipe[1], &event, len - bench_id_hdr_size) < 0) + return -1; + /* zero-fill sample id header */ + memset(id_hdr_ptr, 0, bench_id_hdr_size); + /* put timestamp in the right position */ + ts_idx = (bench_id_hdr_size / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + if (writen(data->input_pipe[1], id_hdr_ptr, bench_id_hdr_size) < 0) + return -1; + + return len; + } + + ts_idx = (len / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + return writen(data->input_pipe[1], &event, len); +} + +static ssize_t synthesize_sample(struct bench_data *data, struct bench_dso *dso, u64 timestamp) +{ + union perf_event event; + struct perf_sample sample = { + .tid = data->pid, + .pid = data->pid, + .ip = dso_map_addr(dso), + .time = timestamp, + }; + + event.header.type = PERF_RECORD_SAMPLE; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = perf_event__sample_event_size(&sample, bench_sample_type, 0); + + perf_event__synthesize_sample(&event, bench_sample_type, 0, &sample); + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_flush(struct bench_data *data) +{ + struct perf_event_header header = { + .size = sizeof(header), + .type = PERF_RECORD_FINISHED_ROUND, + }; + + return writen(data->input_pipe[1], &header, header.size); +} + +static void *data_reader(void *arg) +{ + struct bench_data *data = arg; + char buf[8192]; + int flag; + int n; + + flag = fcntl(data->output_pipe[0], F_GETFL); + fcntl(data->output_pipe[0], F_SETFL, flag | O_NONBLOCK); + + /* read out data from child */ + while (true) { + n = read(data->output_pipe[0], buf, sizeof(buf)); + if (n > 0) + continue; + if (n == 0) + break; + + if (errno != EINTR && errno != EAGAIN) + break; + + usleep(100); + } + + close(data->output_pipe[0]); + return NULL; +} + +static int setup_injection(struct bench_data *data, bool build_id_all) +{ + int ready_pipe[2]; + int dev_null_fd; + char buf; + + if (pipe(ready_pipe) < 0) + return -1; + + if (pipe(data->input_pipe) < 0) + return -1; + + if (pipe(data->output_pipe) < 0) + return -1; + + data->pid = fork(); + if (data->pid < 0) + return -1; + + if (data->pid == 0) { + const char **inject_argv; + int inject_argc = 3; + + close(data->input_pipe[1]); + close(data->output_pipe[0]); + close(ready_pipe[0]); + + dup2(data->input_pipe[0], STDIN_FILENO); + close(data->input_pipe[0]); + dup2(data->output_pipe[1], STDOUT_FILENO); + close(data->output_pipe[1]); + + dev_null_fd = open("/dev/null", O_WRONLY); + if (dev_null_fd < 0) + exit(1); + + dup2(dev_null_fd, STDERR_FILENO); + + if (build_id_all) + inject_argc++; + + inject_argv = calloc(inject_argc + 1, sizeof(*inject_argv)); + if (inject_argv == NULL) + exit(1); + + inject_argv[0] = strdup("perf"); + inject_argv[1] = strdup("inject"); + inject_argv[2] = strdup("-b"); + if (build_id_all) + inject_argv[3] = strdup("--buildid-all"); + + /* signal that we're ready to go */ + close(ready_pipe[1]); + + main(inject_argc, inject_argv); + + exit(0); + } + + pthread_create(&data->th, NULL, data_reader, data); + + close(ready_pipe[1]); + close(data->input_pipe[0]); + close(data->output_pipe[1]); + + /* wait for child ready */ + if (read(ready_pipe[0], &buf, 1) < 0) + return -1; + close(ready_pipe[0]); + + return 0; +} + +static int inject_build_id(struct bench_data *data, u64 *max_rss) +{ + int status; + unsigned int i, k; + struct rusage rusage; + + /* this makes the child to run */ + if (perf_header__write_pipe(data->input_pipe[1]) < 0) + return -1; + + if (synthesize_attr(data) < 0) + return -1; + + if (synthesize_fork(data) < 0) + return -1; + + for (i = 0; i < nr_mmaps; i++) { + int idx = rand() % nr_dsos; + struct bench_dso *dso = &dsos[idx]; + u64 timestamp = rand() % 1000000; + + pr_debug2(" [%d] injecting: %s\n", i+1, dso->name); + if (synthesize_mmap(data, dso, timestamp) < 0) + return -1; + + for (k = 0; k < nr_samples; k++) { + if (synthesize_sample(data, dso, timestamp + k * 1000) < 0) + return -1; + } + + if ((i + 1) % 10 == 0) { + if (synthesize_flush(data) < 0) + return -1; + } + } + + /* this makes the child to finish */ + close(data->input_pipe[1]); + + wait4(data->pid, &status, 0, &rusage); + *max_rss = rusage.ru_maxrss; + + pr_debug(" Child %d exited with %d\n", data->pid, status); + + return 0; +} + +static void do_inject_loop(struct bench_data *data, bool build_id_all) +{ + unsigned int i; + struct stats time_stats, mem_stats; + double time_average, time_stddev; + double mem_average, mem_stddev; + + init_stats(&time_stats); + init_stats(&mem_stats); + + pr_debug(" Build-id%s injection benchmark\n", build_id_all ? "-all" : ""); + + for (i = 0; i < iterations; i++) { + struct timeval start, end, diff; + u64 runtime_us, max_rss; + + pr_debug(" Iteration #%d\n", i+1); + + if (setup_injection(data, build_id_all) < 0) { + printf(" Build-id injection setup failed\n"); + break; + } + + gettimeofday(&start, NULL); + if (inject_build_id(data, &max_rss) < 0) { + printf(" Build-id injection failed\n"); + break; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&mem_stats, max_rss); + + pthread_join(data->th, NULL); + } + + time_average = avg_stats(&time_stats) / USEC_PER_MSEC; + time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC; + printf(" Average build-id%s injection took: %.3f msec (+- %.3f msec)\n", + build_id_all ? "-all" : "", time_average, time_stddev); + + /* each iteration, it processes MMAP2 + BUILD_ID + nr_samples * SAMPLE */ + time_average = avg_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + time_stddev = stddev_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + printf(" Average time per event: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + mem_average = avg_stats(&mem_stats); + mem_stddev = stddev_stats(&mem_stats); + printf(" Average memory usage: %.0f KB (+- %.0f KB)\n", + mem_average, mem_stddev); +} + +static int do_inject_loops(struct bench_data *data) +{ + + srand(time(NULL)); + symbol__init(NULL); + + bench_sample_type = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP; + bench_sample_type |= PERF_SAMPLE_TID | PERF_SAMPLE_TIME; + bench_id_hdr_size = 32; + + collect_dso(); + if (nr_dsos == 0) { + printf(" Cannot collect DSOs for injection\n"); + return -1; + } + + do_inject_loop(data, false); + do_inject_loop(data, true); + + release_dso(); + return 0; +} + +int bench_inject_build_id(int argc, const char **argv) +{ + struct bench_data data; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + return do_inject_loops(&data); +} + diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 9235b76501be..19d45c377ac1 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -223,12 +223,8 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * return 0; } -static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) { - u64 cycle_start = 0ULL, cycle_end = 0ULL; - memcpy_t fn = r->fn.memcpy; - int i; - /* Make sure to always prefault zero pages even if MMAP_THRESH is crossed: */ memset(src, 0, size); @@ -237,6 +233,15 @@ static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, vo * to not measure page fault overhead: */ fn(dst, src, size); +} + +static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +{ + u64 cycle_start = 0ULL, cycle_end = 0ULL; + memcpy_t fn = r->fn.memcpy; + int i; + + memcpy_prefault(fn, size, src, dst); cycle_start = get_cycles(); for (i = 0; i < nr_loops; ++i) @@ -252,11 +257,7 @@ static double do_memcpy_gettimeofday(const struct function *r, size_t size, void memcpy_t fn = r->fn.memcpy; int i; - /* - * We prefault the freshly allocated memory range here, - * to not measure page fault overhead: - */ - fn(dst, src, size); + memcpy_prefault(fn, size, src, dst); BUG_ON(gettimeofday(&tv_start, NULL)); for (i = 0; i < nr_loops; ++i) diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index 50ae8bd58296..6188e19d3129 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -7,7 +7,3 @@ MEMCPY_FN(memcpy_orig, MEMCPY_FN(__memcpy, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") - -MEMCPY_FN(memcpy_erms, - "x86-64-movsb", - "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index 9ad015a1e202..1b9fef7efcdc 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -2,6 +2,9 @@ /* Various wrappers to make the kernel .S file build in user-space: */ +// memcpy_orig is being defined as SYM_L_LOCAL but we need it +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl diff --git a/tools/perf/bench/mem-memcpy-x86-64-lib.c b/tools/perf/bench/mem-memcpy-x86-64-lib.c deleted file mode 100644 index 4130734dde84..000000000000 --- a/tools/perf/bench/mem-memcpy-x86-64-lib.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * From code in arch/x86/lib/usercopy_64.c, copied to keep tools/ copy - * of the kernel's arch/x86/lib/memcpy_64.s used in 'perf bench mem memcpy' - * happy. - */ -#include <linux/types.h> - -unsigned long __memcpy_mcsafe(void *dst, const void *src, size_t cnt); -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); - -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++, from++) { - /* - * Call the assembly routine back directly since - * memcpy_mcsafe() may silently fallback to memcpy. - */ - unsigned long rem = __memcpy_mcsafe(to, from, 1); - - if (rem) - break; - } - return len; -} diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index dac6d2b7c39b..247c72fdfb9d 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -7,7 +7,3 @@ MEMSET_FN(memset_orig, MEMSET_FN(__memset, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") - -MEMSET_FN(memset_erms, - "x86-64-stosb", - "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index d550bd526162..abd26c95f1aa 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +// memset_orig is being defined as SYM_L_LOCAL but we need it +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #define memset MEMSET /* don't hide glibc's memset() */ #define altinstr_replacement text #define globl p2align 4; .globl diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 5797253b9700..19be2aaf4dc0 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -6,8 +6,6 @@ */ #include <inttypes.h> -/* For the CLR_() macros */ -#include <pthread.h> #include <subcmd/parse-options.h> #include "../util/cloexec.h" @@ -18,6 +16,7 @@ #include <sched.h> #include <stdio.h> #include <assert.h> +#include <debug.h> #include <malloc.h> #include <signal.h> #include <stdlib.h> @@ -28,12 +27,16 @@ #include <sys/resource.h> #include <sys/wait.h> #include <sys/prctl.h> +#include <sys/stat.h> #include <sys/types.h> #include <linux/kernel.h> #include <linux/time64.h> #include <linux/numa.h> #include <linux/zalloc.h> +#include "../util/header.h" +#include "../util/mutex.h" +#include <api/fs/fs.h> #include <numa.h> #include <numaif.h> @@ -42,7 +45,7 @@ #endif /* - * Regular printout to the terminal, supressed if -q is specified: + * Regular printout to the terminal, suppressed if -q is specified: */ #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) @@ -54,7 +57,7 @@ struct thread_data { int curr_cpu; - cpu_set_t bind_cpumask; + cpu_set_t *bind_cpumask; int bind_node; u8 *process_data; int process_nr; @@ -66,7 +69,7 @@ struct thread_data { u64 system_time_ns; u64 user_time_ns; double speed_gbs; - pthread_mutex_t *process_lock; + struct mutex *process_lock; }; /* Parameters set by options: */ @@ -116,7 +119,6 @@ struct params { long bytes_thread; int nr_tasks; - bool show_quiet; bool show_convergence; bool measure_convergence; @@ -136,15 +138,16 @@ struct params { struct global_info { u8 *data; - pthread_mutex_t startup_mutex; + struct mutex startup_mutex; + struct cond startup_cond; int nr_tasks_started; - pthread_mutex_t startup_done_mutex; - - pthread_mutex_t start_work_mutex; + struct mutex start_work_mutex; + struct cond start_work_cond; int nr_tasks_working; + bool start_work; - pthread_mutex_t stop_work_mutex; + struct mutex stop_work_mutex; u64 bytes_done; struct thread_data *threads; @@ -196,7 +199,8 @@ static const struct option options[] = { OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, " "convergence is reached when each process (all its threads) is running on a single NUMA node."), OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), - OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"), + OPT_BOOLEAN('q', "quiet" , &quiet, + "quiet mode (do not show any warnings or messages)"), OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), /* Special option string parsing callbacks: */ @@ -247,85 +251,135 @@ static int is_node_present(int node) */ static bool node_has_cpus(int node) { - struct bitmask *cpu = numa_allocate_cpumask(); - unsigned int i; + struct bitmask *cpumask = numa_allocate_cpumask(); + bool ret = false; /* fall back to nocpus */ + int cpu; - if (cpu && !numa_node_to_cpus(node, cpu)) { - for (i = 0; i < cpu->size; i++) { - if (numa_bitmask_isbitset(cpu, i)) - return true; + BUG_ON(!cpumask); + if (!numa_node_to_cpus(node, cpumask)) { + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) { + ret = true; + break; + } } } + numa_free_cpumask(cpumask); - return false; /* lets fall back to nocpus safely */ + return ret; } -static cpu_set_t bind_to_cpu(int target_cpu) +static cpu_set_t *bind_to_cpu(int target_cpu) { - cpu_set_t orig_mask, mask; - int ret; + int nrcpus = numa_num_possible_cpus(); + cpu_set_t *orig_mask, *mask; + size_t size; - ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); - BUG_ON(ret); + orig_mask = CPU_ALLOC(nrcpus); + BUG_ON(!orig_mask); + size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(size, orig_mask); + + if (sched_getaffinity(0, size, orig_mask)) + goto err_out; + + mask = CPU_ALLOC(nrcpus); + if (!mask) + goto err_out; - CPU_ZERO(&mask); + CPU_ZERO_S(size, mask); if (target_cpu == -1) { int cpu; for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &mask); + CPU_SET_S(cpu, size, mask); } else { - BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); - CPU_SET(target_cpu, &mask); + if (target_cpu < 0 || target_cpu >= g->p.nr_cpus) + goto err; + + CPU_SET_S(target_cpu, size, mask); } - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + if (sched_setaffinity(0, size, mask)) + goto err; return orig_mask; + +err: + CPU_FREE(mask); +err_out: + CPU_FREE(orig_mask); + + /* BUG_ON due to failure in allocation of orig_mask/mask */ + BUG_ON(-1); + return NULL; } -static cpu_set_t bind_to_node(int target_node) +static cpu_set_t *bind_to_node(int target_node) { - int cpus_per_node = g->p.nr_cpus / nr_numa_nodes(); - cpu_set_t orig_mask, mask; + int nrcpus = numa_num_possible_cpus(); + size_t size; + cpu_set_t *orig_mask, *mask; int cpu; - int ret; - BUG_ON(cpus_per_node * nr_numa_nodes() != g->p.nr_cpus); - BUG_ON(!cpus_per_node); + orig_mask = CPU_ALLOC(nrcpus); + BUG_ON(!orig_mask); + size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(size, orig_mask); - ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); - BUG_ON(ret); + if (sched_getaffinity(0, size, orig_mask)) + goto err_out; + + mask = CPU_ALLOC(nrcpus); + if (!mask) + goto err_out; - CPU_ZERO(&mask); + CPU_ZERO_S(size, mask); if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &mask); + CPU_SET_S(cpu, size, mask); } else { - int cpu_start = (target_node + 0) * cpus_per_node; - int cpu_stop = (target_node + 1) * cpus_per_node; + struct bitmask *cpumask = numa_allocate_cpumask(); - BUG_ON(cpu_stop > g->p.nr_cpus); + if (!cpumask) + goto err; - for (cpu = cpu_start; cpu < cpu_stop; cpu++) - CPU_SET(cpu, &mask); + if (!numa_node_to_cpus(target_node, cpumask)) { + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) + CPU_SET_S(cpu, size, mask); + } + } + numa_free_cpumask(cpumask); } - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + if (sched_setaffinity(0, size, mask)) + goto err; return orig_mask; + +err: + CPU_FREE(mask); +err_out: + CPU_FREE(orig_mask); + + /* BUG_ON due to failure in allocation of orig_mask/mask */ + BUG_ON(-1); + return NULL; } -static void bind_to_cpumask(cpu_set_t mask) +static void bind_to_cpumask(cpu_set_t *mask) { int ret; + size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus()); - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + ret = sched_setaffinity(0, size, mask); + if (ret) { + CPU_FREE(mask); + BUG_ON(ret); + } } static void mempol_restore(void) @@ -339,18 +393,22 @@ static void mempol_restore(void) static void bind_to_memnode(int node) { - unsigned long nodemask; + struct bitmask *node_mask; int ret; if (node == NUMA_NO_NODE) return; - BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); - nodemask = 1L << node; + node_mask = numa_allocate_nodemask(); + BUG_ON(!node_mask); + + numa_bitmask_clearall(node_mask); + numa_bitmask_setbit(node_mask, node); - ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); - dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); + dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret); + numa_bitmask_free(node_mask); BUG_ON(ret); } @@ -367,7 +425,7 @@ do { \ static u8 *alloc_data(ssize_t bytes0, int map_flags, int init_zero, int init_cpu0, int thp, int init_random) { - cpu_set_t orig_mask; + cpu_set_t *orig_mask = NULL; ssize_t bytes; u8 *buf; int ret; @@ -425,6 +483,7 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags, /* Restore affinity: */ if (init_cpu0) { bind_to_cpumask(orig_mask); + CPU_FREE(orig_mask); mempol_restore(); } @@ -467,18 +526,6 @@ static void * setup_private_data(ssize_t bytes) return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); } -/* - * Return a process-shared (global) mutex: - */ -static void init_global_mutex(pthread_mutex_t *mutex) -{ - pthread_mutexattr_t attr; - - pthread_mutexattr_init(&attr); - pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - pthread_mutex_init(mutex, &attr); -} - static int parse_cpu_list(const char *arg) { p0.cpu_list_str = strdup(arg); @@ -488,6 +535,57 @@ static int parse_cpu_list(const char *arg) return 0; } +/* + * Check whether a CPU is online + * + * Returns: + * 1 -> if CPU is online + * 0 -> if CPU is offline + * -1 -> error case + */ +static int is_cpu_online(unsigned int cpu) +{ + char *str; + size_t strlen; + char buf[256]; + int status = -1; + struct stat statbuf; + + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d", cpu); + if (stat(buf, &statbuf) != 0) + return 0; + + /* + * Check if /sys/devices/system/cpu/cpux/online file + * exists. Some cases cpu0 won't have online file since + * it is not expected to be turned off generally. + * In kernels without CONFIG_HOTPLUG_CPU, this + * file won't exist + */ + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/online", cpu); + if (stat(buf, &statbuf) != 0) + return 1; + + /* + * Read online file using sysfs__read_str. + * If read or open fails, return -1. + * If read succeeds, return value from file + * which gets stored in "str" + */ + snprintf(buf, sizeof(buf), + "devices/system/cpu/cpu%d/online", cpu); + + if (sysfs__read_str(buf, &str, &strlen) < 0) + return status; + + status = atoi(str); + + free(str); + return status; +} + static int parse_setup_cpu_list(void) { struct thread_data *td; @@ -564,10 +662,16 @@ static int parse_setup_cpu_list(void) return -1; } + if (is_cpu_online(bind_cpu_0) != 1 || is_cpu_online(bind_cpu_1) != 1) { + printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n"); + return -1; + } + BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0); BUG_ON(bind_cpu_0 > bind_cpu_1); for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { + size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus); int i; for (i = 0; i < mul; i++) { @@ -587,10 +691,15 @@ static int parse_setup_cpu_list(void) tprintf("%2d", bind_cpu); } - CPU_ZERO(&td->bind_cpumask); + td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); + BUG_ON(!td->bind_cpumask); + CPU_ZERO_S(size, td->bind_cpumask); for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { - BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); - CPU_SET(cpu, &td->bind_cpumask); + if (cpu < 0 || cpu >= g->p.nr_cpus) { + CPU_FREE(td->bind_cpumask); + BUG_ON(-1); + } + CPU_SET_S(cpu, size, td->bind_cpumask); } t++; } @@ -729,12 +838,8 @@ static int parse_nodes_opt(const struct option *opt __maybe_unused, return -1; return parse_node_list(arg); - - return 0; } -#define BIT(x) (1ul << x) - static inline uint32_t lfsr_32(uint32_t lfsr) { const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31); @@ -795,7 +900,7 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val if (g->p.data_rand_walk) { u32 lfsr = nr + loop + val; - int j; + long j; for (i = 0; i < words/1024; i++) { long start, end; @@ -813,12 +918,12 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val } } } else if (!g->p.data_backwards || (nr + loop) & 1) { + /* Process data forwards: */ d0 = data + off; d = data + off + 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d >= d1)) d = data; @@ -836,7 +941,6 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val d = data + off - 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d < data)) d = data + words-1; @@ -862,8 +966,6 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) prctl(0, bytes_worked); } -#define MAX_NR_NODES 64 - /* * Count the number of nodes a process's threads * are spread out on. @@ -874,10 +976,15 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) */ static int count_process_nodes(int process_nr) { - char node_present[MAX_NR_NODES] = { 0, }; + char *node_present; int nodes; int n, t; + node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); + BUG_ON(!node_present); + for (nodes = 0; nodes < g->p.nr_nodes; nodes++) + node_present[nodes] = 0; + for (t = 0; t < g->p.nr_threads; t++) { struct thread_data *td; int task_nr; @@ -887,17 +994,20 @@ static int count_process_nodes(int process_nr) td = g->threads + task_nr; node = numa_node_of_cpu(td->curr_cpu); - if (node < 0) /* curr_cpu was likely still -1 */ + if (node < 0) /* curr_cpu was likely still -1 */ { + free(node_present); return 0; + } node_present[node] = 1; } nodes = 0; - for (n = 0; n < MAX_NR_NODES; n++) + for (n = 0; n < g->p.nr_nodes; n++) nodes += node_present[n]; + free(node_present); return nodes; } @@ -966,7 +1076,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence) { unsigned int loops_done_min, loops_done_max; int process_groups; - int nodes[MAX_NR_NODES]; + int *nodes; int distance; int nr_min; int nr_max; @@ -980,6 +1090,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) if (!g->p.show_convergence && !g->p.measure_convergence) return; + nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); + BUG_ON(!nodes); for (node = 0; node < g->p.nr_nodes; node++) nodes[node] = 0; @@ -1021,8 +1133,10 @@ static void calc_convergence(double runtime_ns_max, double *convergence) BUG_ON(sum > g->p.nr_tasks); - if (0 && (sum < g->p.nr_tasks)) + if (0 && (sum < g->p.nr_tasks)) { + free(nodes); return; + } /* * Count the number of distinct process groups present @@ -1074,6 +1188,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) } tprintf("\n"); } + + free(nodes); } static void show_summary(double runtime_ns_max, int l, double *convergence) @@ -1133,19 +1249,22 @@ static void *worker_thread(void *__tdata) } if (g->p.serialize_startup) { - pthread_mutex_lock(&g->startup_mutex); + mutex_lock(&g->startup_mutex); g->nr_tasks_started++; - pthread_mutex_unlock(&g->startup_mutex); + /* The last thread wakes the main process. */ + if (g->nr_tasks_started == g->p.nr_tasks) + cond_signal(&g->startup_cond); + + mutex_unlock(&g->startup_mutex); /* Here we will wait for the main process to start us all at once: */ - pthread_mutex_lock(&g->start_work_mutex); + mutex_lock(&g->start_work_mutex); + g->start_work = false; g->nr_tasks_working++; + while (!g->start_work) + cond_wait(&g->start_work_cond, &g->start_work_mutex); - /* Last one wake the main process: */ - if (g->nr_tasks_working == g->p.nr_tasks) - pthread_mutex_unlock(&g->startup_done_mutex); - - pthread_mutex_unlock(&g->start_work_mutex); + mutex_unlock(&g->start_work_mutex); } gettimeofday(&start0, NULL); @@ -1164,17 +1283,17 @@ static void *worker_thread(void *__tdata) val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); if (g->p.sleep_usecs) { - pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); usleep(g->p.sleep_usecs); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } /* * Amount of work to be done under a process-global lock: */ if (g->p.bytes_process_locked) { - pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } work_done = g->p.bytes_global + g->p.bytes_process + @@ -1208,7 +1327,7 @@ static void *worker_thread(void *__tdata) * by migrating to CPU#0: */ if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { - cpu_set_t orig_mask; + cpu_set_t *orig_mask; int target_cpu; int this_cpu; @@ -1232,6 +1351,7 @@ static void *worker_thread(void *__tdata) printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); bind_to_cpumask(orig_mask); + CPU_FREE(orig_mask); } if (details >= 3) { @@ -1270,9 +1390,9 @@ static void *worker_thread(void *__tdata) free_data(thread_data, g->p.bytes_thread); - pthread_mutex_lock(&g->stop_work_mutex); + mutex_lock(&g->stop_work_mutex); g->bytes_done += bytes_done; - pthread_mutex_unlock(&g->stop_work_mutex); + mutex_unlock(&g->stop_work_mutex); return NULL; } @@ -1282,7 +1402,7 @@ static void *worker_thread(void *__tdata) */ static void worker_process(int process_nr) { - pthread_mutex_t process_lock; + struct mutex process_lock; struct thread_data *td; pthread_t *pthreads; u8 *process_data; @@ -1290,7 +1410,7 @@ static void worker_process(int process_nr) int ret; int t; - pthread_mutex_init(&process_lock, NULL); + mutex_init(&process_lock); set_taskname("process %d", process_nr); /* @@ -1365,21 +1485,31 @@ static void init_thread_data(void) for (t = 0; t < g->p.nr_tasks; t++) { struct thread_data *td = g->threads + t; + size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus); int cpu; /* Allow all nodes by default: */ td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ - CPU_ZERO(&td->bind_cpumask); + td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); + BUG_ON(!td->bind_cpumask); + CPU_ZERO_S(cpuset_size, td->bind_cpumask); for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &td->bind_cpumask); + CPU_SET_S(cpu, cpuset_size, td->bind_cpumask); } } static void deinit_thread_data(void) { ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + int t; + + /* Free the bind_cpumask allocated for thread_data */ + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + CPU_FREE(td->bind_cpumask); + } free_data(g->threads, size); } @@ -1396,9 +1526,9 @@ static int init(void) g->p.nr_nodes = numa_max_node() + 1; /* char array in count_process_nodes(): */ - BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + BUG_ON(g->p.nr_nodes < 0); - if (g->p.show_quiet && !g->p.show_details) + if (quiet && !g->p.show_details) g->p.show_details = -1; /* Some memory should be specified: */ @@ -1439,10 +1569,11 @@ static int init(void) g->data = setup_shared_data(g->p.bytes_global); /* Startup serialization: */ - init_global_mutex(&g->start_work_mutex); - init_global_mutex(&g->startup_mutex); - init_global_mutex(&g->startup_done_mutex); - init_global_mutex(&g->stop_work_mutex); + mutex_init_pshared(&g->start_work_mutex); + cond_init_pshared(&g->start_work_cond); + mutex_init_pshared(&g->startup_mutex); + cond_init_pshared(&g->startup_cond); + mutex_init_pshared(&g->stop_work_mutex); init_thread_data(); @@ -1476,7 +1607,7 @@ static void print_res(const char *name, double val, if (!name) name = "main,"; - if (!g->p.show_quiet) + if (!quiet) printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); else printf(" %14.3f %s\n", val, txt_long); @@ -1501,9 +1632,6 @@ static int __bench_numa(const char *name) pids = zalloc(g->p.nr_proc * sizeof(*pids)); pid = -1; - /* All threads try to acquire it, this way we can wait for them to start up: */ - pthread_mutex_lock(&g->start_work_mutex); - if (g->p.serialize_startup) { tprintf(" #\n"); tprintf(" # Startup synchronization: ..."); fflush(stdout); @@ -1525,22 +1653,29 @@ static int __bench_numa(const char *name) pids[i] = pid; } - /* Wait for all the threads to start up: */ - while (g->nr_tasks_started != g->p.nr_tasks) - usleep(USEC_PER_MSEC); - - BUG_ON(g->nr_tasks_started != g->p.nr_tasks); if (g->p.serialize_startup) { + bool threads_ready = false; double startup_sec; - pthread_mutex_lock(&g->startup_done_mutex); - - /* This will start all threads: */ - pthread_mutex_unlock(&g->start_work_mutex); - - /* This mutex is locked - the last started thread will wake us: */ - pthread_mutex_lock(&g->startup_done_mutex); + /* + * Wait for all the threads to start up. The last thread will + * signal this process. + */ + mutex_lock(&g->startup_mutex); + while (g->nr_tasks_started != g->p.nr_tasks) + cond_wait(&g->startup_cond, &g->startup_mutex); + + mutex_unlock(&g->startup_mutex); + + /* Wait for all threads to be at the start_work_cond. */ + while (!threads_ready) { + mutex_lock(&g->start_work_mutex); + threads_ready = (g->nr_tasks_working == g->p.nr_tasks); + mutex_unlock(&g->start_work_mutex); + if (!threads_ready) + usleep(1); + } gettimeofday(&stop, NULL); @@ -1554,7 +1689,11 @@ static int __bench_numa(const char *name) tprintf(" #\n"); start = stop; - pthread_mutex_unlock(&g->startup_done_mutex); + /* Start all threads running. */ + mutex_lock(&g->start_work_mutex); + g->start_work = true; + mutex_unlock(&g->start_work_mutex); + cond_broadcast(&g->start_work_cond); } else { gettimeofday(&start, NULL); } @@ -1630,7 +1769,7 @@ static int __bench_numa(const char *name) "GB/sec,", "total-speed", "GB/sec total speed"); if (g->p.show_details >= 2) { - char tname[14 + 2 * 10 + 1]; + char tname[14 + 2 * 11 + 1]; struct thread_data *td; for (p = 0; p < g->p.nr_proc; p++) { for (t = 0; t < g->p.nr_threads; t++) { @@ -1733,12 +1872,12 @@ err: */ static const char *tests[][MAX_ARGS] = { /* Basic single-stream NUMA bandwidth measurements: */ - { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM }, { "RAM-bw-local-NOTHP,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, - { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "1", OPT_BW_RAM }, /* 2-stream NUMA bandwidth measurements: */ @@ -1755,7 +1894,7 @@ static const char *tests[][MAX_ARGS] = { { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, - { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV }, { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV }, { " 4x4-convergence-NOTHP,", @@ -1780,24 +1919,24 @@ static const char *tests[][MAX_ARGS] = { "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, - { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, - { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, - { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, - { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + { " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, - { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, - { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, - { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, - { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, - { " 4x8-bw-thread-NOTHP,", + { " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, + { " 4x8-bw-process-NOTHP,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, - { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, - { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + { " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, - { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, - { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + { "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, - { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, { "numa01-bw-thread-NOTHP,", diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c new file mode 100644 index 000000000000..9e4d36486f62 --- /dev/null +++ b/tools/perf/bench/pmu-scan.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark scanning sysfs files for PMU information. + * + * Copyright 2023 Google LLC. + */ +#include <stdio.h> +#include "bench.h" +#include "util/debug.h" +#include "util/pmu.h" +#include "util/pmus.h" +#include "util/stat.h" +#include <linux/atomic.h> +#include <linux/err.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int iterations = 100; + +struct pmu_scan_result { + char *name; + int nr_aliases; + int nr_formats; + int nr_caps; + bool is_core; +}; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals pmu-scan <options>", + NULL +}; + +static int nr_pmus; +static struct pmu_scan_result *results; + +static int save_result(void) +{ + struct perf_pmu *pmu = NULL; + struct list_head *list; + struct pmu_scan_result *r; + + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + r = realloc(results, (nr_pmus + 1) * sizeof(*r)); + if (r == NULL) + return -ENOMEM; + + results = r; + r = results + nr_pmus; + + r->name = strdup(pmu->name); + r->is_core = pmu->is_core; + r->nr_caps = pmu->nr_caps; + + r->nr_aliases = perf_pmu__num_events(pmu); + + r->nr_formats = 0; + list_for_each(list, &pmu->format) + r->nr_formats++; + + pr_debug("pmu[%d] name=%s, nr_caps=%d, nr_aliases=%d, nr_formats=%d\n", + nr_pmus, r->name, r->nr_caps, r->nr_aliases, r->nr_formats); + nr_pmus++; + } + + perf_pmus__destroy(); + return 0; +} + +static int check_result(bool core_only) +{ + struct pmu_scan_result *r; + struct perf_pmu *pmu; + struct list_head *list; + int nr; + + for (int i = 0; i < nr_pmus; i++) { + r = &results[i]; + if (core_only && !r->is_core) + continue; + + pmu = perf_pmus__find(r->name); + if (pmu == NULL) { + pr_err("Cannot find PMU %s\n", r->name); + return -1; + } + + if (pmu->nr_caps != (u32)r->nr_caps) { + pr_err("Unmatched number of event caps in %s: expect %d vs got %d\n", + pmu->name, r->nr_caps, pmu->nr_caps); + return -1; + } + + nr = perf_pmu__num_events(pmu); + if (nr != r->nr_aliases) { + pr_err("Unmatched number of event aliases in %s: expect %d vs got %d\n", + pmu->name, r->nr_aliases, nr); + return -1; + } + + nr = 0; + list_for_each(list, &pmu->format) + nr++; + if (nr != r->nr_formats) { + pr_err("Unmatched number of event formats in %s: expect %d vs got %d\n", + pmu->name, r->nr_formats, nr); + return -1; + } + } + return 0; +} + +static void delete_result(void) +{ + for (int i = 0; i < nr_pmus; i++) + free(results[i].name); + free(results); + + results = NULL; + nr_pmus = 0; +} + +static int run_pmu_scan(void) +{ + struct stats stats; + struct timeval start, end, diff; + double time_average, time_stddev; + u64 runtime_us; + int ret; + + init_stats(&stats); + pr_info("Computing performance of sysfs PMU event scan for %u times\n", + iterations); + + if (save_result() < 0) { + pr_err("Failed to initialize PMU scan result\n"); + return -1; + } + + for (int j = 0; j < 2; j++) { + bool core_only = (j == 0); + + for (unsigned int i = 0; i < iterations; i++) { + gettimeofday(&start, NULL); + if (core_only) + perf_pmus__scan_core(NULL); + else + perf_pmus__scan(NULL); + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&stats, runtime_us); + + ret = check_result(core_only); + perf_pmus__destroy(); + if (ret < 0) + break; + } + time_average = avg_stats(&stats); + time_stddev = stddev_stats(&stats); + pr_info(" Average%s PMU scanning took: %.3f usec (+- %.3f usec)\n", + core_only ? " core" : "", time_average, time_stddev); + } + delete_result(); + return 0; +} + +int bench_pmu_scan(int argc, const char **argv) +{ + int err = 0; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + err = run_pmu_scan(); + + return err; +} diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index 71d830d7b923..93dcd9dba3d0 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -27,6 +27,7 @@ #include <poll.h> #include <limits.h> #include <err.h> +#include <linux/list.h> #include <linux/time64.h> #define DATASIZE 100 @@ -35,8 +36,12 @@ static bool use_pipes = false; static unsigned int nr_loops = 100; static bool thread_mode = false; static unsigned int num_groups = 10; +static unsigned int total_children = 0; +static struct list_head sender_contexts = LIST_HEAD_INIT(sender_contexts); +static struct list_head receiver_contexts = LIST_HEAD_INIT(receiver_contexts); struct sender_context { + struct list_head list; unsigned int num_fds; int ready_out; int wakefd; @@ -44,12 +49,20 @@ struct sender_context { }; struct receiver_context { + struct list_head list; unsigned int num_packets; int in_fds[2]; int ready_out; int wakefd; }; +union messaging_worker { + pthread_t thread; + pid_t pid; +}; + +static union messaging_worker *worker_tab; + static void fdpair(int fds[2]) { if (use_pipes) { @@ -66,11 +79,10 @@ static void fdpair(int fds[2]) /* Block until we're ready to go */ static void ready(int ready_out, int wakefd) { - char dummy; struct pollfd pollfd = { .fd = wakefd, .events = POLLIN }; /* Tell them we're ready. */ - if (write(ready_out, &dummy, 1) != 1) + if (write(ready_out, "R", 1) != 1) err(EXIT_FAILURE, "CLIENT: ready write"); /* Wait for "GO" signal */ @@ -85,6 +97,7 @@ static void *sender(struct sender_context *ctx) unsigned int i, j; ready(ctx->ready_out, ctx->wakefd); + memset(data, 'S', sizeof(data)); /* Now pump to every receiver. */ for (i = 0; i < nr_loops; i++) { @@ -93,7 +106,7 @@ static void *sender(struct sender_context *ctx) again: ret = write(ctx->out_fds[j], data + done, - sizeof(data)-done); + sizeof(data) - done); if (ret < 0) err(EXIT_FAILURE, "SENDER: write"); done += ret; @@ -134,30 +147,12 @@ again: return NULL; } -static pthread_t create_worker(void *ctx, void *(*func)(void *)) +static void create_thread_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) { pthread_attr_t attr; - pthread_t childid; int ret; - if (!thread_mode) { - /* process mode */ - /* Fork the receiver. */ - switch (fork()) { - case -1: - err(EXIT_FAILURE, "fork()"); - break; - case 0: - (*func) (ctx); - exit(0); - break; - default: - break; - } - - return (pthread_t)0; - } - if (pthread_attr_init(&attr) != 0) err(EXIT_FAILURE, "pthread_attr_init:"); @@ -166,14 +161,37 @@ static pthread_t create_worker(void *ctx, void *(*func)(void *)) err(EXIT_FAILURE, "pthread_attr_setstacksize"); #endif - ret = pthread_create(&childid, &attr, func, ctx); + ret = pthread_create(&worker->thread, &attr, func, ctx); if (ret != 0) err(EXIT_FAILURE, "pthread_create failed"); - return childid; + pthread_attr_destroy(&attr); +} + +static void create_process_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) +{ + /* Fork the receiver. */ + worker->pid = fork(); + + if (worker->pid == -1) { + err(EXIT_FAILURE, "fork()"); + } else if (worker->pid == 0) { + (*func) (ctx); + exit(0); + } } -static void reap_worker(pthread_t id) +static void create_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) +{ + if (!thread_mode) + return create_process_worker(worker, ctx, func); + else + return create_thread_worker(worker, ctx, func); +} + +static void reap_worker(union messaging_worker *worker) { int proc_status; void *thread_status; @@ -184,23 +202,24 @@ static void reap_worker(pthread_t id) if (!WIFEXITED(proc_status)) exit(1); } else { - pthread_join(id, &thread_status); + pthread_join(worker->thread, &thread_status); } } /* One group of senders and receivers */ -static unsigned int group(pthread_t *pth, +static unsigned int group(union messaging_worker *worker, unsigned int num_fds, int ready_out, int wakefd) { unsigned int i; - struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) - + num_fds * sizeof(int)); + struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) + + num_fds * sizeof(int)); if (!snd_ctx) err(EXIT_FAILURE, "malloc()"); + list_add(&snd_ctx->list, &sender_contexts); for (i = 0; i < num_fds; i++) { int fds[2]; struct receiver_context *ctx = malloc(sizeof(*ctx)); @@ -208,6 +227,7 @@ static unsigned int group(pthread_t *pth, if (!ctx) err(EXIT_FAILURE, "malloc()"); + list_add(&ctx->list, &receiver_contexts); /* Create the pipe between client and server */ fdpair(fds); @@ -218,7 +238,7 @@ static unsigned int group(pthread_t *pth, ctx->ready_out = ready_out; ctx->wakefd = wakefd; - pth[i] = create_worker(ctx, (void *)receiver); + create_worker(worker + i, ctx, (void *)receiver); snd_ctx->out_fds[i] = fds[1]; if (!thread_mode) @@ -231,7 +251,7 @@ static unsigned int group(pthread_t *pth, snd_ctx->wakefd = wakefd; snd_ctx->num_fds = num_fds; - pth[num_fds+i] = create_worker(snd_ctx, (void *)sender); + create_worker(worker + num_fds + i, snd_ctx, (void *)sender); } /* Close the fds we have left */ @@ -243,6 +263,17 @@ static unsigned int group(pthread_t *pth, return num_fds * 2; } +static void sig_handler(int sig __maybe_unused) +{ + unsigned int i; + + /* + * When exit abnormally, kill all forked child processes. + */ + for (i = 0; i < total_children; i++) + kill(worker_tab[i].pid, SIGKILL); +} + static const struct option options[] = { OPT_BOOLEAN('p', "pipe", &use_pipes, "Use pipe() instead of socketpair()"), @@ -260,26 +291,30 @@ static const char * const bench_sched_message_usage[] = { int bench_sched_messaging(int argc, const char **argv) { - unsigned int i, total_children; + unsigned int i; struct timeval start, stop, diff; unsigned int num_fds = 20; int readyfds[2], wakefds[2]; char dummy; - pthread_t *pth_tab; + struct sender_context *pos, *n; argc = parse_options(argc, argv, options, bench_sched_message_usage, 0); - pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t)); - if (!pth_tab) + worker_tab = malloc(num_fds * 2 * num_groups * sizeof(union messaging_worker)); + if (!worker_tab) err(EXIT_FAILURE, "main:malloc()"); fdpair(readyfds); fdpair(wakefds); - total_children = 0; + if (!thread_mode) { + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + } + for (i = 0; i < num_groups; i++) - total_children += group(pth_tab+total_children, num_fds, + total_children += group(worker_tab + total_children, num_fds, readyfds[1], wakefds[0]); /* Wait for everyone to be ready */ @@ -295,7 +330,7 @@ int bench_sched_messaging(int argc, const char **argv) /* Reap them all */ for (i = 0; i < total_children; i++) - reap_worker(pth_tab[i]); + reap_worker(worker_tab + i); gettimeofday(&stop, NULL); @@ -309,11 +344,11 @@ int bench_sched_messaging(int argc, const char **argv) num_groups, num_groups * 2 * num_fds, thread_mode ? "threads" : "processes"); printf(" %14s: %lu.%03lu [sec]\n", "Total time", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; case BENCH_FORMAT_SIMPLE: - printf("%lu.%03lu\n", diff.tv_sec, + printf("%lu.%03lu\n", (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; default: @@ -323,7 +358,14 @@ int bench_sched_messaging(int argc, const char **argv) break; } - free(pth_tab); - + free(worker_tab); + list_for_each_entry_safe(pos, n, &sender_contexts, list) { + list_del_init(&pos->list); + free(pos); + } + list_for_each_entry_safe(pos, n, &receiver_contexts, list) { + list_del_init(&pos->list); + free(pos); + } return 0; } diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 3c88d1f201f1..70139036d68f 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -10,7 +10,9 @@ * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> */ #include <subcmd/parse-options.h> +#include <api/fs/fs.h> #include "bench.h" +#include "util/cgroup.h" #include <unistd.h> #include <stdio.h> @@ -19,7 +21,9 @@ #include <sys/wait.h> #include <string.h> #include <errno.h> +#include <fcntl.h> #include <assert.h> +#include <sys/epoll.h> #include <sys/time.h> #include <sys/types.h> #include <sys/syscall.h> @@ -31,6 +35,9 @@ struct thread_data { int nr; int pipe_read; int pipe_write; + struct epoll_event epoll_ev; + int epoll_fd; + bool cgroup_failed; pthread_t pthread; }; @@ -40,9 +47,50 @@ static int loops = LOOPS_DEFAULT; /* Use processes by default: */ static bool threaded; +static bool nonblocking; +static char *cgrp_names[2]; +static struct cgroup *cgrps[2]; + +static int parse_two_cgroups(const struct option *opt __maybe_unused, + const char *str, int unset __maybe_unused) +{ + char *p = strdup(str); + char *q; + int ret = -1; + + if (p == NULL) { + fprintf(stderr, "memory allocation failure\n"); + return -1; + } + + q = strchr(p, ','); + if (q == NULL) { + fprintf(stderr, "it should have two cgroup names: %s\n", p); + goto out; + } + *q = '\0'; + + cgrp_names[0] = strdup(p); + cgrp_names[1] = strdup(q + 1); + + if (cgrp_names[0] == NULL || cgrp_names[1] == NULL) { + fprintf(stderr, "memory allocation failure\n"); + goto out; + } + ret = 0; + +out: + free(p); + return ret; +} + static const struct option options[] = { + OPT_BOOLEAN('n', "nonblocking", &nonblocking, "Use non-blocking operations"), OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"), + OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV", + "Put sender and receivers in given cgroups", + parse_two_cgroups), OPT_END() }; @@ -51,24 +99,115 @@ static const char * const bench_sched_pipe_usage[] = { NULL }; +static int enter_cgroup(int nr) +{ + char buf[32]; + int fd, len, ret; + int saved_errno; + struct cgroup *cgrp; + pid_t pid; + + if (cgrp_names[nr] == NULL) + return 0; + + if (cgrps[nr] == NULL) { + cgrps[nr] = cgroup__new(cgrp_names[nr], /*do_open=*/true); + if (cgrps[nr] == NULL) + goto err; + } + cgrp = cgrps[nr]; + + if (threaded) + pid = syscall(__NR_gettid); + else + pid = getpid(); + + snprintf(buf, sizeof(buf), "%d\n", pid); + len = strlen(buf); + + /* try cgroup v2 interface first */ + if (threaded) + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY); + else + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY); + + /* try cgroup v1 if failed */ + if (fd < 0 && errno == ENOENT) + fd = openat(cgrp->fd, "tasks", O_WRONLY); + + if (fd < 0) + goto err; + + ret = write(fd, buf, len); + close(fd); + + if (ret != len) { + printf("Cannot enter to cgroup: %s\n", cgrp->name); + return -1; + } + return 0; + +err: + saved_errno = errno; + printf("Failed to open cgroup file in %s\n", cgrp_names[nr]); + + if (saved_errno == ENOENT) { + char mnt[PATH_MAX]; + + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0) + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n", + mnt, cgrp_names[nr]); + } else if (saved_errno == EACCES && geteuid() > 0) { + printf(" Hint: try to run as root\n"); + } + + return -1; +} + +static void exit_cgroup(int nr) +{ + cgroup__put(cgrps[nr]); + free(cgrp_names[nr]); +} + +static inline int read_pipe(struct thread_data *td) +{ + int ret, m; +retry: + if (nonblocking) { + ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1); + if (ret < 0) + return ret; + } + ret = read(td->pipe_read, &m, sizeof(int)); + if (nonblocking && ret < 0 && errno == EWOULDBLOCK) + goto retry; + return ret; +} + static void *worker_thread(void *__tdata) { struct thread_data *td = __tdata; - int m = 0, i; - int ret; + int i, ret, m = 0; + + ret = enter_cgroup(td->nr); + if (ret < 0) { + td->cgroup_failed = true; + return NULL; + } + + if (nonblocking) { + td->epoll_ev.events = EPOLLIN; + td->epoll_fd = epoll_create(1); + BUG_ON(td->epoll_fd < 0); + BUG_ON(epoll_ctl(td->epoll_fd, EPOLL_CTL_ADD, td->pipe_read, &td->epoll_ev) < 0); + } for (i = 0; i < loops; i++) { - if (!td->nr) { - ret = read(td->pipe_read, &m, sizeof(int)); - BUG_ON(ret != sizeof(int)); - ret = write(td->pipe_write, &m, sizeof(int)); - BUG_ON(ret != sizeof(int)); - } else { - ret = write(td->pipe_write, &m, sizeof(int)); - BUG_ON(ret != sizeof(int)); - ret = read(td->pipe_read, &m, sizeof(int)); - BUG_ON(ret != sizeof(int)); - } + ret = write(td->pipe_write, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + ret = read_pipe(td); + BUG_ON(ret != sizeof(int)); } return NULL; @@ -76,7 +215,8 @@ static void *worker_thread(void *__tdata) int bench_sched_pipe(int argc, const char **argv) { - struct thread_data threads[2], *td; + struct thread_data threads[2] = {}; + struct thread_data *td; int pipe_1[2], pipe_2[2]; struct timeval start, stop, diff; unsigned long long result_usec = 0; @@ -88,13 +228,16 @@ int bench_sched_pipe(int argc, const char **argv) * discarding returned value of read(), write() * causes error in building environment for perf */ - int __maybe_unused ret, wait_stat; + int __maybe_unused ret, wait_stat, flags = 0; pid_t pid, retpid __maybe_unused; argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0); - BUG_ON(pipe(pipe_1)); - BUG_ON(pipe(pipe_2)); + if (nonblocking) + flags |= O_NONBLOCK; + + BUG_ON(pipe2(pipe_1, flags)); + BUG_ON(pipe2(pipe_2, flags)); gettimeofday(&start, NULL); @@ -112,9 +255,7 @@ int bench_sched_pipe(int argc, const char **argv) } } - if (threaded) { - for (t = 0; t < nr_threads; t++) { td = threads + t; @@ -128,7 +269,6 @@ int bench_sched_pipe(int argc, const char **argv) ret = pthread_join(td->pthread, NULL); BUG_ON(ret); } - } else { pid = fork(); assert(pid >= 0); @@ -147,6 +287,12 @@ int bench_sched_pipe(int argc, const char **argv) gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); + exit_cgroup(0); + exit_cgroup(1); + + if (threads[0].cgroup_failed || threads[1].cgroup_failed) + return 0; + switch (bench_format) { case BENCH_FORMAT_DEFAULT: printf("# Executed %d pipe operations between two %s\n\n", @@ -156,7 +302,7 @@ int bench_sched_pipe(int argc, const char **argv) result_usec += diff.tv_usec; printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); printf(" %14lf usecs/op\n", @@ -168,7 +314,7 @@ int bench_sched_pipe(int argc, const char **argv) case BENCH_FORMAT_SIMPLE: printf("%lu.%03lu\n", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c new file mode 100644 index 000000000000..269c1f4a6852 --- /dev/null +++ b/tools/perf/bench/sched-seccomp-notify.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <subcmd/parse-options.h> +#include "bench.h" + +#include <uapi/linux/filter.h> +#include <sys/types.h> +#include <sys/time.h> +#include <linux/unistd.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <linux/time64.h> +#include <uapi/linux/seccomp.h> +#include <sys/prctl.h> + +#include <unistd.h> +#include <limits.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sys/wait.h> +#include <string.h> +#include <errno.h> +#include <err.h> +#include <inttypes.h> + +#define LOOPS_DEFAULT 1000000UL +static uint64_t loops = LOOPS_DEFAULT; +static bool sync_mode; + +static const struct option options[] = { + OPT_U64('l', "loop", &loops, "Specify number of loops"), + OPT_BOOLEAN('s', "sync-mode", &sync_mode, + "Enable the synchronous mode for seccomp notifications"), + OPT_END() +}; + +static const char * const bench_seccomp_usage[] = { + "perf bench sched secccomp-notify <options>", + NULL +}; + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + return syscall(__NR_seccomp, op, flags, args); +} + +static int user_notif_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +#define USER_NOTIF_MAGIC INT_MAX +static void user_notification_sync_loop(int listener) +{ + struct seccomp_notif_resp resp; + struct seccomp_notif req; + uint64_t nr; + + for (nr = 0; nr < loops; nr++) { + memset(&req, 0, sizeof(req)); + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_RECV failed"); + + if (req.data.nr != __NR_gettid) + errx(EXIT_FAILURE, "unexpected syscall: %d", req.data.nr); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + resp.flags = 0; + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_SEND failed"); + } +} + +#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) +#endif +int bench_sched_seccomp_notify(int argc, const char **argv) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + int status, listener; + pid_t pid; + long ret; + + argc = parse_options(argc, argv, options, bench_seccomp_usage, 0); + + gettimeofday(&start, NULL); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + listener = user_notif_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + err(EXIT_FAILURE, "can't create a notification descriptor"); + + pid = fork(); + if (pid < 0) + err(EXIT_FAILURE, "fork"); + if (pid == 0) { + if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) + err(EXIT_FAILURE, "can't set the parent death signal"); + while (1) { + ret = syscall(__NR_gettid); + if (ret == USER_NOTIF_MAGIC) + continue; + break; + } + _exit(1); + } + + if (sync_mode) { + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, + SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0)) + err(EXIT_FAILURE, + "can't set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP"); + } + user_notification_sync_loop(listener); + + kill(pid, SIGKILL); + if (waitpid(pid, &status, 0) != pid) + err(EXIT_FAILURE, "waitpid(%d) failed", pid); + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) + errx(EXIT_FAILURE, "unexpected exit code: %d", status); + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %" PRIu64 " system calls\n\n", + loops); + + result_usec = diff.tv_sec * USEC_PER_SEC; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)USEC_PER_SEC))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index 8d624aea1c5e..9b333276cbdb 100644 --- a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -49,7 +49,7 @@ static const char *const bench_usage[] = { static atomic_t event_count; -static int process_synthesized_event(struct perf_tool *tool __maybe_unused, +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -80,7 +80,7 @@ static int do_run_single_threaded(struct perf_session *session, NULL, target, threads, process_synthesized_event, - data_mmap, + true, data_mmap, nr_threads_synthesize); if (err) return err; @@ -117,7 +117,7 @@ static int run_single_threaded(void) int err; perf_set_singlethreaded(); - session = perf_session__new(NULL, false, NULL); + session = perf_session__new(NULL, NULL); if (IS_ERR(session)) { pr_err("Session creation failed.\n"); return PTR_ERR(session); @@ -161,9 +161,9 @@ static int do_run_multi_threaded(struct target *target, init_stats(&time_stats); init_stats(&event_stats); for (i = 0; i < multi_iterations; i++) { - session = perf_session__new(NULL, false, NULL); - if (!session) - return -ENOMEM; + session = perf_session__new(NULL, NULL); + if (IS_ERR(session)) + return PTR_ERR(session); atomic_set(&event_count, 0); gettimeofday(&start, NULL); @@ -171,7 +171,7 @@ static int do_run_multi_threaded(struct target *target, NULL, target, NULL, process_synthesized_event, - false, + true, false, nr_threads_synthesize); if (err) { perf_session__delete(session); diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c new file mode 100644 index 000000000000..e7dc216f717f --- /dev/null +++ b/tools/perf/bench/syscall.c @@ -0,0 +1,188 @@ +/* + * + * syscall.c + * + * syscall: Benchmark for system call performance + */ +#include "../perf.h" +#include "../util/util.h" +#include <subcmd/parse-options.h> +#include "../builtin.h" +#include "bench.h" + +#include <stdio.h> +#include <sys/time.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <stdlib.h> + +#ifndef __NR_fork +#define __NR_fork -1 +#endif + +static int loops; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_syscall_usage[] = { + "perf bench syscall <options>", + NULL +}; + +static void test_fork(void) +{ + pid_t pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork failed\n"); + exit(1); + } else if (pid == 0) { + exit(0); + } else { + if (waitpid(pid, NULL, 0) < 0) { + fprintf(stderr, "waitpid failed\n"); + exit(1); + } + } +} + +static void test_execve(void) +{ + const char *pathname = "/bin/true"; + char *const argv[] = { (char *)pathname, NULL }; + pid_t pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork failed\n"); + exit(1); + } else if (pid == 0) { + execve(pathname, argv, NULL); + fprintf(stderr, "execve /bin/true failed\n"); + exit(1); + } else { + if (waitpid(pid, NULL, 0) < 0) { + fprintf(stderr, "waitpid failed\n"); + exit(1); + } + } +} + +static int bench_syscall_common(int argc, const char **argv, int syscall) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + const char *name = NULL; + int i; + + switch (syscall) { + case __NR_fork: + case __NR_execve: + /* Limit default loop to 10000 times to save time */ + loops = 10000; + break; + default: + loops = 10000000; + break; + } + + /* Options -l and --loops override default above */ + argc = parse_options(argc, argv, options, bench_syscall_usage, 0); + + gettimeofday(&start, NULL); + + for (i = 0; i < loops; i++) { + switch (syscall) { + case __NR_getppid: + getppid(); + break; + case __NR_getpgid: + getpgid(0); + break; + case __NR_fork: + test_fork(); + break; + case __NR_execve: + test_execve(); + default: + break; + } + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (syscall) { + case __NR_getppid: + name = "getppid()"; + break; + case __NR_getpgid: + name = "getpgid()"; + break; + case __NR_fork: + name = "fork()"; + break; + case __NR_execve: + name = "execve()"; + break; + default: + break; + } + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %'d %s calls\n", loops, name); + + result_usec = diff.tv_sec * 1000000; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %'14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)1000000))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / 1000)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} + +int bench_syscall_basic(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_getppid); +} + +int bench_syscall_getpgid(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_getpgid); +} + +int bench_syscall_fork(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_fork); +} + +int bench_syscall_execve(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_execve); +} diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c new file mode 100644 index 000000000000..0b90275862e1 --- /dev/null +++ b/tools/perf/bench/uprobe.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * uprobe.c + * + * uprobe benchmarks + * + * Copyright (C) 2023, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> + */ +#include "../perf.h" +#include "../util/util.h" +#include <subcmd/parse-options.h> +#include "../builtin.h" +#include "bench.h" +#include <linux/compiler.h> +#include <linux/time64.h> + +#include <inttypes.h> +#include <stdio.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#include <stdlib.h> + +#define LOOPS_DEFAULT 1000 +static int loops = LOOPS_DEFAULT; + +enum bench_uprobe { + BENCH_UPROBE__BASELINE, + BENCH_UPROBE__EMPTY, + BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__EMPTY_RET, + BENCH_UPROBE__TRACE_PRINTK_RET, +}; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_uprobe_usage[] = { + "perf bench uprobe <options>", + NULL +}; + +#ifdef HAVE_BPF_SKEL +#include "bpf_skel/bench_uprobe.skel.h" + +#define bench_uprobe__attach_uprobe(prog) \ + skel->links.prog = bpf_program__attach_uprobe_opts(/*prog=*/skel->progs.prog, \ + /*pid=*/-1, \ + /*binary_path=*/"libc.so.6", \ + /*func_offset=*/0, \ + /*opts=*/&uprobe_opts); \ + if (!skel->links.prog) { \ + err = -errno; \ + fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \ + goto cleanup; \ + } + +struct bench_uprobe_bpf *skel; + +static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + int err; + + /* Load and verify BPF application */ + skel = bench_uprobe_bpf__open(); + if (!skel) { + fprintf(stderr, "Failed to open and load uprobes bench BPF skeleton\n"); + return -1; + } + + err = bench_uprobe_bpf__load(skel); + if (err) { + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto cleanup; + } + + uprobe_opts.func_name = "usleep"; + switch (bench) { + case BENCH_UPROBE__BASELINE: break; + case BENCH_UPROBE__EMPTY: bench_uprobe__attach_uprobe(empty); break; + case BENCH_UPROBE__TRACE_PRINTK: bench_uprobe__attach_uprobe(trace_printk); break; + case BENCH_UPROBE__EMPTY_RET: bench_uprobe__attach_uprobe(empty_ret); break; + case BENCH_UPROBE__TRACE_PRINTK_RET: bench_uprobe__attach_uprobe(trace_printk_ret); break; + default: + fprintf(stderr, "Invalid bench: %d\n", bench); + goto cleanup; + } + + return err; +cleanup: + bench_uprobe_bpf__destroy(skel); + skel = NULL; + return err; +} + +static void bench_uprobe__teardown_bpf_skel(void) +{ + if (skel) { + bench_uprobe_bpf__destroy(skel); + skel = NULL; + } +} +#else +static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench __maybe_unused) { return 0; } +static void bench_uprobe__teardown_bpf_skel(void) {}; +#endif + +static int bench_uprobe_format__default_fprintf(const char *name, const char *unit, u64 diff, FILE *fp) +{ + static u64 baseline, previous; + s64 diff_to_baseline = diff - baseline, + diff_to_previous = diff - previous; + int printed = fprintf(fp, "# Executed %'d %s calls\n", loops, name); + + printed += fprintf(fp, " %14s: %'" PRIu64 " %ss", "Total time", diff, unit); + + if (baseline) { + printed += fprintf(fp, " %s%'" PRId64 " to baseline", diff_to_baseline > 0 ? "+" : "", diff_to_baseline); + + if (previous != baseline) + fprintf(stdout, " %s%'" PRId64 " to previous", diff_to_previous > 0 ? "+" : "", diff_to_previous); + } + + printed += fprintf(fp, "\n\n %'.3f %ss/op", (double)diff / (double)loops, unit); + + if (baseline) { + printed += fprintf(fp, " %'.3f %ss/op to baseline", (double)diff_to_baseline / (double)loops, unit); + + if (previous != baseline) + printed += fprintf(fp, " %'.3f %ss/op to previous", (double)diff_to_previous / (double)loops, unit); + } else { + baseline = diff; + } + + fputc('\n', fp); + + previous = diff; + + return printed + 1; +} + +static int bench_uprobe(int argc, const char **argv, enum bench_uprobe bench) +{ + const char *name = "usleep(1000)", *unit = "usec"; + struct timespec start, end; + u64 diff; + int i; + + argc = parse_options(argc, argv, options, bench_uprobe_usage, 0); + + if (bench != BENCH_UPROBE__BASELINE && bench_uprobe__setup_bpf_skel(bench) < 0) + return 0; + + clock_gettime(CLOCK_REALTIME, &start); + + for (i = 0; i < loops; i++) { + usleep(USEC_PER_MSEC); + } + + clock_gettime(CLOCK_REALTIME, &end); + + diff = end.tv_sec * NSEC_PER_SEC + end.tv_nsec - (start.tv_sec * NSEC_PER_SEC + start.tv_nsec); + diff /= NSEC_PER_USEC; + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + bench_uprobe_format__default_fprintf(name, unit, diff, stdout); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%" PRIu64 "\n", diff); + break; + + default: + /* reaching here is something of a disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + } + + if (bench != BENCH_UPROBE__BASELINE) + bench_uprobe__teardown_bpf_skel(); + + return 0; +} + +int bench_uprobe_baseline(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__BASELINE); +} + +int bench_uprobe_empty(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY); +} + +int bench_uprobe_trace_printk(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK); +} + +int bench_uprobe_empty_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY_RET); +} + +int bench_uprobe_trace_printk_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK_RET); +} |