diff options
Diffstat (limited to 'tools')
68 files changed, 6220 insertions, 1 deletions
diff --git a/tools/Makefile b/tools/Makefile index 276f5d0d53a4..278d24723b74 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -28,6 +28,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -91,6 +92,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -184,6 +188,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000000..d6264fe1c8cd --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,2 @@ +tools/ +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000000..bf7e108f5ae1 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ + grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') + +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') +endef + +BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ + -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ + -I$(INCLUDE_DIR) -I$(APIDIR) \ + -I../../include \ + $(call get_sys_includes,$(CLANG)) \ + -Wall -Wno-compare-distinct-pointer-types \ + -O2 -mcpu=v3 + +# sort removes libbpf duplicates when not cross-building +MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ + $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ + $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR)) + +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + $(APIDIR)/linux/bpf.h \ + | $(OBJ_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0 -fPIC' \ + DESTDIR=$(OUTPUT_DIR) prefix= all install_headers + +$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ + ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -O0' \ + OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ + LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ + LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \ + prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin + +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) +ifeq ($(VMLINUX_H),) + $(call msg,GEN,,$@) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + +c-sched-targets = scx_simple scx_qmap scx_central + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.bpf.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) + +$(c-sched-targets): %: $(BINDIR)/% + +install: all + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ + +clean: + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(c-sched-targets) + +help: + @echo 'Building targets' + @echo '================' + @echo '' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files' + +all_targets: $(c-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) clean help + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 index 000000000000..8efe70cc4363 --- /dev/null +++ b/tools/sched_ext/README.md @@ -0,0 +1,258 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + +# Introduction + +This directory contains a number of example sched_ext schedulers. These +schedulers are meant to provide examples of different types of schedulers +that can be built using sched_ext, and illustrate how various features of +sched_ext can be used. + +Some of the examples are performant, production-ready schedulers. That is, for +the correct workload and with the correct tuning, they may be deployed in a +production environment with acceptable or possibly even improved performance. +Others are just examples that in practice, would not provide acceptable +performance (though they could be improved to get there). + +This README will describe these example schedulers, including describing the +types of workloads or scenarios they're designed to accommodate, and whether or +not they're production ready. For more details on any of these schedulers, +please see the header comment in their .bpf.c file. + + +# Compiling the examples + +There are a few toolchain dependencies for compiling the example schedulers. + +## Toolchain dependencies + +1. clang >= 16.0.0 + +The schedulers are BPF programs, and therefore must be compiled with clang. gcc +is actively working on adding a BPF backend compiler as well, but are still +missing some features such as BTF type tags which are necessary for using +kptrs. + +2. pahole >= 1.25 + +You may need pahole in order to generate BTF from DWARF. + +3. rust >= 1.70.0 + +Rust schedulers uses features present in the rust toolchain >= 1.70.0. You +should be able to use the stable build from rustup, but if that doesn't +work, try using the rustup nightly build. + +There are other requirements as well, such as make, but these are the main / +non-trivial ones. + +## Compiling the kernel + +In order to run a sched_ext scheduler, you'll have to run a kernel compiled +with the patches in this repository, and with a minimum set of necessary +Kconfig options: + +``` +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +``` + +It's also recommended that you also include the following Kconfig options: + +``` +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +There is a `Kconfig` file in this directory whose contents you can append to +your local `.config` file, as long as there are no conflicts with any existing +options in the file. + +## Getting a vmlinux.h file + +You may notice that most of the example schedulers include a "vmlinux.h" file. +This is a large, auto-generated header file that contains all of the types +defined in some vmlinux binary that was compiled with +[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig +options specified above). + +The header file is created using `bpftool`, by passing it a vmlinux binary +compiled with BTF as follows: + +```bash +$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h +``` + +`bpftool` analyzes all of the BTF encodings in the binary, and produces a +header file that can be included by BPF programs to access those types. For +example, using vmlinux.h allows a scheduler to access fields defined directly +in vmlinux as follows: + +```c +#include "vmlinux.h" +// vmlinux.h is also implicitly included by scx_common.bpf.h. +#include "scx_common.bpf.h" + +/* + * vmlinux.h provides definitions for struct task_struct and + * struct scx_enable_args. + */ +void BPF_STRUCT_OPS(example_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + bpf_printk("Task %s enabled in example scheduler", p->comm); +} + +// vmlinux.h provides the definition for struct sched_ext_ops. +SEC(".struct_ops.link") +struct sched_ext_ops example_ops { + .enable = (void *)example_enable, + .name = "example", +} +``` + +The scheduler build system will generate this vmlinux.h file as part of the +scheduler build pipeline. It looks for a vmlinux file in the following +dependency order: + +1. If the O= environment variable is defined, at `$O/vmlinux` +2. If the KBUILD_OUTPUT= environment variable is defined, at + `$KBUILD_OUTPUT/vmlinux` +3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're + compiling the schedulers) +3. `/sys/kernel/btf/vmlinux` +4. `/boot/vmlinux-$(uname -r)` + +In other words, if you have compiled a kernel in your local repo, its vmlinux +file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of +the kernel you're currently running on. This means that if you're running on a +kernel with sched_ext support, you may not need to compile a local kernel at +all. + +### Aside on CO-RE + +One of the cooler features of BPF is that it supports +[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run +Everywhere). This feature allows you to reference fields inside of structs with +types defined internal to the kernel, and not have to recompile if you load the +BPF program on a different kernel with the field at a different offset. In our +example above, we print out a task name with `p->comm`. CO-RE would perform +relocations for that access when the program is loaded to ensure that it's +referencing the correct offset for the currently running kernel. + +## Compiling the schedulers + +Once you have your toolchain setup, and a vmlinux that can be used to generate +a full vmlinux.h file, you can compile the schedulers using `make`: + +```bash +$ make -j($nproc) +``` + +# Example schedulers + +This directory contains the following example schedulers. These schedulers are +for testing and demonstrating different aspects of sched_ext. While some may be +useful in limited scenarios, they are not intended to be practical. + +For more scheduler implementations, tools and documentation, visit +https://github.com/sched-ext/scx. + +## scx_simple + +A simple scheduler that provides an example of a minimal sched_ext scheduler. +scx_simple can be run in either global weighted vtime mode, or FIFO mode. + +Though very simple, in limited scenarios, this scheduler can perform reasonably +well on single-socket systems with a unified L3 cache. + +## scx_qmap + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +## scx_central + +A "central" scheduler where scheduling decisions are made from a single CPU. +This scheduler illustrates how scheduling decisions can be dispatched from a +single CPU, allowing other cores to run with infinite slices, without timer +ticks, and without having to incur the overhead of making scheduling decisions. + +The approach demonstrated by this scheduler may be useful for any workload that +benefits from minimizing scheduling overhead and timer ticks. An example of +where this could be particularly useful is running VMs, where running with +infinite slices and no timer ticks allows the VM to avoid unnecessary expensive +vmexits. + + +# Troubleshooting + +There are a number of common issues that you may run into when building the +schedulers. We'll go over some of the common ones here. + +## Build Failures + +### Old version of clang + +``` +error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + ^~~~~~~~~~~~~~~~~~~~ +1 error generated. +``` + +This means you built the kernel or the schedulers with an older version of +clang than what's supported (i.e. older than 16.0.0). To remediate this: + +1. `which clang` to make sure you're using a sufficiently new version of clang. + +2. `make fullclean` in the root path of the repository, and rebuild the kernel + and schedulers. + +3. Rebuild the kernel, and then your example schedulers. + +The schedulers are also cleaned if you invoke `make mrproper` in the root +directory of the tree. + +### Stale kernel build / incomplete vmlinux.h file + +As described above, you'll need a `vmlinux.h` file that was generated from a +vmlinux built with BTF, and with sched_ext support enabled. If you don't, +you'll see errors such as the following which indicate that a type being +referenced in a scheduler is unknown: + +``` +/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' + +const struct scx_exit_info *ei) + +^ +``` + +In order to resolve this, please follow the steps above in +[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your +schedulers are using a vmlinux.h file that includes the requisite types. + +## Misc + +### llvm: [OFF] + +You may see the following output when building the schedulers: + +``` +Auto-detecting system features: +... clang-bpf-co-re: [ on ] +... llvm: [ OFF ] +... libcap: [ on ] +... libbfd: [ on ] +``` + +Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 000000000000..ad7d139ce907 --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have 32bit glibc devel package installed leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 000000000000..20280df62857 --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <asm-generic/errno.h> +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; +struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; +void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; +void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; +u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; +u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; +void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; +u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; +const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; +void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} + +/* + * Helper macro for initializing the fmt and variadic argument inputs to both + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to + * refer to the initialized list of inputs to the bstr kfunc. + */ +#define scx_bpf_bstr_preamble(fmt, args...) \ + static char ___fmt[] = fmt; \ + /* \ + * Note that __param[] must have at least one \ + * element to keep the verifier happy. \ + */ \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + +/* + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments + * instead of an array of u64. Using this macro will cause the scheduler to + * exit cleanly with the specified exit code being passed to user space. + */ +#define scx_bpf_exit(code, fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Invoking this macro will cause the scheduler to + * exit in an erroneous state, with diagnostic information being passed to the + * user. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments + * instead of an array of u64. To be used from ops.dump() and friends. + */ +#define scx_bpf_dump(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. + */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ +({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be be resized before loading the BPF program. + * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ +({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + + +/* + * BPF declarations and helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* css iteration */ +struct bpf_iter_css; +struct cgroup_subsys_state; +extern int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, + unsigned int flags) __weak __ksym; +extern struct cgroup_subsys_state * +bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; +extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + + +/* + * Other helpers + */ + +/* useful compiler attributes */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define __maybe_unused __attribute__((__unused__)) + +/* + * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They + * prevent compiler from caching, redoing or reordering reads or writes. + */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* + * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u32(u32 v) +{ + u32 r; + u32 shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +/* + * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u64(u64 v) +{ + u32 hi = v >> 32; + if (hi) + return log2_u32(hi) + 32 + 1; + else + return log2_u32(v) + 1; +} + +#include "compat.bpf.h" + +#endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file mode 100644 index 000000000000..5b0f90152152 --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <errno.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ + if (errno) \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + else \ + fprintf(stderr, "\n"); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @__skel: the skeleton containing the array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ + sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ + (__skel)->elfsec##_##arr = \ + bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#include "user_exit_info.h" +#include "compat.h" + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h new file mode 100644 index 000000000000..3d2fe1208900 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#ifndef __SCX_COMPAT_BPF_H +#define __SCX_COMPAT_BPF_H + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + __type __ret = 0; \ + if (bpf_core_enum_value_exists(__type, __ent)) \ + __ret = __ent; \ + __ret; \ +}) + +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + */ +#define SCX_OPS_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops __name = { \ + __VA_ARGS__, \ + }; + +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h new file mode 100644 index 000000000000..cc56ff9aa252 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#ifndef __SCX_COMPAT_H +#define __SCX_COMPAT_H + +#include <bpf/btf.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); + +static inline void __COMPAT_load_vmlinux_btf(void) +{ + if (!__COMPAT_vmlinux_btf) { + __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); + SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); + } +} + +static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) +{ + const struct btf_type *t; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + + tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + if (btf_is_enum(t)) { + struct btf_enum *e = btf_enum(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = e[i].val; + return true; + } + } + } else if (btf_is_enum64(t)) { + struct btf_enum64 *e = btf_enum64(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = btf_enum64_value(&e[i]); + return true; + } + } + } + + return false; +} + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + u64 __val = 0; \ + __COMPAT_read_enum(__type, __ent, &__val); \ + __val; \ +}) + +static inline bool __COMPAT_has_ksym(const char *ksym) +{ + __COMPAT_load_vmlinux_btf(); + return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; +} + +static inline bool __COMPAT_struct_has_field(const char *type, const char *field) +{ + const struct btf_type *t; + const struct btf_member *m; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + m = btf_members(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, field)) + return true; + } + + return false; +} + +#define SCX_OPS_SWITCH_PARTIAL \ + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") + +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + +/* + * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() + * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load + * and attach it, backward compatibility is automatically maintained where + * reasonable. + * + * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is + * the current minimum required kernel version. + */ +#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ + struct __scx_name *__skel; \ + \ + SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ + "sched_ext_ops.dump() missing, kernel too old?"); \ + \ + __skel = __scx_name##__open(); \ + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ + __skel; \ +}) + +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ + SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ +}) + +/* + * New versions of bpftool now emit additional link placeholders for BPF maps, + * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps + * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do + * nothing with those links and won't attempt to auto-attach maps. + * + * To maintain compatibility with older libbpf while avoiding trying to attach + * twice, disable the autoattach feature on newer libbpf. + */ +#if LIBBPF_MAJOR_VERSION > 1 || \ + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) +#else +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) +#endif + +#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ + struct bpf_link *__link; \ + __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ + SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ + __link; \ +}) + +#endif /* __SCX_COMPAT_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 000000000000..891693ee604e --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include <bpf/bpf_core_read.h> + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#else /* !__bpf__ */ + +#include <stdio.h> +#include <stdbool.h> + +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +}) + +#define UEI_EXITED(__skel, __uei_name) ({ \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ +}) + +#define UEI_REPORT(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ + fprintf(stderr, "EXIT: %s", __uei->reason); \ + if (__uei->msg[0] != '\0') \ + fprintf(stderr, " (%s)", __uei->msg); \ + fputs("\n", stderr); \ + __uei->exit_code; \ +}) + +/* + * We can't import vmlinux.h while compiling user C code. Let's duplicate + * scx_exit_code definition. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum uei_ecode_mask { + UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), + UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, + UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, +}; + +/* + * These macro interpret the ecode returned from UEI_REPORT(). + */ +#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) +#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) +#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) + +#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 index 000000000000..1d8fd570eaa7 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the followings: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward. e.g. It'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. + * + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, +}; + +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +bool timer_pinned = true; +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU. + */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. + */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + + if (bpf_map_push_elem(¢ral_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(¢ral_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); + bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as will fail the + * next dispatch operation. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme && !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. + */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (timer_pinned && (curr_cpu != central_cpu)) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at && *started_at && + vtime_before(now, *started_at + slice_ns)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + u32 key = 0; + struct bpf_timer *timer; + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(¢ral_timer, &key); + if (!timer) + return -ESRCH; + + if (bpf_get_smp_processor_id() != central_cpu) { + scx_bpf_error("init from non-central CPU"); + return -EINVAL; + } + + bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN. This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... + */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); + return ret; +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(central_ops, + /* + * We are offloading all scheduling decisions to the central CPU + * and thus being the last task on a given CPU doesn't mean + * anything special. Enqueue the last tasks like any other tasks. + */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c new file mode 100644 index 000000000000..21deea320bd7 --- /dev/null +++ b/tools/sched_ext/scx_central.c @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_central.bpf.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-c CPU]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -c CPU Override the central CPU (default: 0)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_central *skel; + struct bpf_link *link; + __u64 seq = 0, ecode; + __s32 opt; + cpu_set_t *cpuset; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(central_ops, scx_central); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); + + SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. + * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * Until BPF supports pinning the timer, it's not guaranteed that it + * will always be invoked on the central CPU. In practice, this + * suffices the majority of the time. + */ + cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + CPU_ZERO(cpuset); + CPU_SET(skel->rodata->central_cpu, cpuset); + SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + + link = SCX_OPS_ATTACH(skel, central_ops, scx_central); + + if (!skel->data->timer_pinned) + printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_timers, + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10" PRIu64 "\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_central__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 index 000000000000..892278f12dce --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -0,0 +1,706 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. + * - Core-sched support. + * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> + +enum consts { + ONE_SEC_IN_NS = 1000000000, + SHARED_DSQ = 0, +}; + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; +const volatile u32 dsp_batch; +const volatile bool print_shared_dsq; +const volatile s32 disallow_tgid; +const volatile bool suppress_dump; + +u32 test_error_cnt; + +UEI_DEFINE(uei); + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* + * If enabled, CPU performance target is set according to the queue index + * according to the following table. + */ +static const u32 qidx_to_cpuperf_target[] = { + [0] = SCX_CPUPERF_ONE * 0 / 4, + [1] = SCX_CPUPERF_ONE * 1 / 4, + [2] = SCX_CPUPERF_ONE * 2 / 4, + [3] = SCX_CPUPERF_ONE * 3 / 4, + [4] = SCX_CPUPERF_ONE * 4 / 4, +}; + +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between the a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +struct cpu_ctx { + u64 dsp_idx; /* dispatch index */ + u64 dsp_cnt; /* remaining count */ + u32 avg_weight; + u32 cpuperf_target; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct cpu_ctx); +} cpu_ctx_stor SEC(".maps"); + +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; +u64 nr_core_sched_execed; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + +static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) +{ + s32 cpu; + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return -1; +} + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = pick_direct_dispatch_cpu(p, prev_cpu); + + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } else { + return prev_cpu; + } +} + +static int weight_to_idx(u32 weight) +{ + /* Coarsely map the compound weight to a FIFO. */ + if (weight <= 25) + return 0; + else if (weight <= 50) + return 1; + else if (weight < 200) + return 2; + else if (weight < 400) + return 3; + else + return 4; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + static u32 user_cnt, kernel_cnt; + struct task_ctx *tctx; + u32 pid = p->pid; + int idx = weight_to_idx(p->scx.weight); + void *ring; + s32 cpu; + + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_WAKEUP) && + (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { + __sync_fetch_and_add(&nr_ddsp_from_enq, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + return; + } + + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + return; + } + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +/* + * The BPF queue map doesn't support removal and sched_ext can handle spurious + * dispatches. qmap_dequeue() is only used to collect statistics. + */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + struct task_struct *p; + struct cpu_ctx *cpuc; + u32 zero = 0, batch = dsp_batch ?: 1; + void *fifo; + s32 i, pid; + + if (scx_bpf_consume(SHARED_DSQ)) + return; + + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. + */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!cpuc->dsp_cnt) { + cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; + cpuc->dsp_cnt = 1 << cpuc->dsp_idx; + } + + fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); + return; + } + + /* Dispatch or advance. */ + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + + p = bpf_task_from_pid(pid); + if (!p) + continue; + + update_core_sched_head_seq(p); + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + bpf_task_release(p); + batch--; + cpuc->dsp_cnt--; + if (!batch || !scx_bpf_dispatch_nr_slots()) { + scx_bpf_consume(SHARED_DSQ); + return; + } + if (!cpuc->dsp_cnt) + break; + } + + cpuc->dsp_cnt = 0; + } +} + +void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) +{ + struct cpu_ctx *cpuc; + u32 zero = 0; + int idx; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + /* + * Use the running avg of weights to select the target cpuperf level. + * This is a demonstration of the cpuperf feature rather than a + * practical strategy to regulate CPU frequency. + */ + cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; + idx = weight_to_idx(cpuc->avg_weight); + cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; + + scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); +} + +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. + */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + s32 i, pid; + + if (suppress_dump) + return; + + bpf_for(i, 0, 5) { + void *fifo; + + if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) + return; + + scx_bpf_dump("QMAP FIFO[%d]:", i); + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + scx_bpf_dump(" %d", pid); + } + scx_bpf_dump("\n"); + } +} + +void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +{ + u32 zero = 0; + struct cpu_ctx *cpuc; + + if (suppress_dump || idle) + return; + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) + return; + + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", + cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, + cpuc->cpuperf_target); +} + +void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) +{ + struct task_ctx *taskc; + + if (suppress_dump) + return; + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + return; + + scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", + taskc->force_local, taskc->core_sched_seq); +} + +/* + * Print out the online and possible CPU map using bpf_printk() as a + * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). + */ +static void print_cpus(void) +{ + const struct cpumask *possible, *online; + s32 cpu; + char buf[128] = "", *p; + int idx; + + possible = scx_bpf_get_possible_cpumask(); + online = scx_bpf_get_online_cpumask(); + + idx = 0; + bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + if (bpf_cpumask_test_cpu(cpu, online)) + *p++ = 'O'; + else if (bpf_cpumask_test_cpu(cpu, possible)) + *p++ = 'X'; + else + *p++ = ' '; + + if ((cpu & 7) == 7) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + *p++ = '|'; + } + } + buf[sizeof(buf) - 1] = '\0'; + + scx_bpf_put_cpumask(online); + scx_bpf_put_cpumask(possible); + + bpf_printk("CPUS: |%s", buf); +} + +void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +{ + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); +} + +void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) +{ + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); +} + +struct monitor_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct monitor_timer); +} monitor_timer SEC(".maps"); + +/* + * Print out the min, avg and max performance levels of CPUs every second to + * demonstrate the cpuperf interface. + */ +static void monitor_cpuperf(void) +{ + u32 zero = 0, nr_cpu_ids; + u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; + u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; + const struct cpumask *online; + int i, nr_online_cpus = 0; + + nr_cpu_ids = scx_bpf_nr_cpu_ids(); + online = scx_bpf_get_online_cpumask(); + + bpf_for(i, 0, nr_cpu_ids) { + struct cpu_ctx *cpuc; + u32 cap, cur; + + if (!bpf_cpumask_test_cpu(i, online)) + continue; + nr_online_cpus++; + + /* collect the capacity and current cpuperf */ + cap = scx_bpf_cpuperf_cap(i); + cur = scx_bpf_cpuperf_cur(i); + + cur_min = cur < cur_min ? cur : cur_min; + cur_max = cur > cur_max ? cur : cur_max; + + /* + * $cur is relative to $cap. Scale it down accordingly so that + * it's in the same scale as other CPUs and $cur_sum/$cap_sum + * makes sense. + */ + cur_sum += cur * cap / SCX_CPUPERF_ONE; + cap_sum += cap; + + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { + scx_bpf_error("failed to look up cpu_ctx"); + goto out; + } + + /* collect target */ + cur = cpuc->cpuperf_target; + target_sum += cur; + target_min = cur < target_min ? cur : target_min; + target_max = cur > target_max ? cur : target_max; + } + + cpuperf_min = cur_min; + cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + cpuperf_max = cur_max; + + cpuperf_target_min = target_min; + cpuperf_target_avg = target_sum / nr_online_cpus; + cpuperf_target_max = target_max; +out: + scx_bpf_put_cpumask(online); +} + +/* + * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of + * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to + * see meaningful dumps in the trace pipe. + */ +static void dump_shared_dsq(void) +{ + struct task_struct *p; + s32 nr; + + if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) + return; + + bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); + + bpf_rcu_read_lock(); + bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) + bpf_printk("%s[%d]", p->comm, p->pid); + bpf_rcu_read_unlock(); +} + +static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + monitor_cpuperf(); + + if (print_shared_dsq) + dump_shared_dsq(); + + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + return 0; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) +{ + u32 key = 0; + struct bpf_timer *timer; + s32 ret; + + print_cpus(); + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&monitor_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, monitor_timerfn); + + return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(qmap_ops, + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .tick = (void *)qmap_tick, + .core_sched_before = (void *)qmap_core_sched_before, + .cpu_release = (void *)qmap_cpu_release, + .init_task = (void *)qmap_init_task, + .dump = (void *)qmap_dump, + .dump_cpu = (void *)qmap_dump_cpu, + .dump_task = (void *)qmap_dump_task, + .cpu_online = (void *)qmap_cpu_online, + .cpu_offline = (void *)qmap_cpu_offline, + .init = (void *)qmap_init, + .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 5000U, + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 index 000000000000..c9ca30d62b2b --- /dev/null +++ b/tools/sched_ext/scx_qmap.c @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_qmap.bpf.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" +" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" +" -b COUNT Dispatch upto COUNT tasks together\n" +" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" +" -p Switch only tasks on SCHED_EXT policy instead of all\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_qmap *skel; + struct bpf_link *link; + int opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); + + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; + case 'b': + skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); + break; + case 'P': + skel->rodata->print_shared_dsq = true; + break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; + case 'D': + skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); + break; + case 'S': + skel->rodata->suppress_dump = true; + break; + case 'p': + skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); + link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed, + skel->bss->nr_ddsp_from_enq); + if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, + skel->bss->cpuperf_avg, + skel->bss->cpuperf_max, + skel->bss->cpuperf_target_min, + skel->bss->cpuperf_target_avg, + skel->bss->cpuperf_target_max); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_qmap__destroy(skel); + /* + * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart + * on CPU hotplug events. + */ + return 0; +} diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 000000000000..d457d2a74e1e --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo <tj@kernel.org> +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. +""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 000000000000..ed7e8d535fc5 --- /dev/null +++ b/tools/sched_ext/scx_simple.bpf.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched; + +static u64 vtime_now; +UEI_DEFINE(uei); + +/* + * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues + * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We + * therefore create a separate DSQ with ID 0 that we dispatch to and consume + * from. If scx_simple only supported global FIFO scheduling, then we could + * just use SCX_DSQ_GLOBAL. + */ +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. + */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple"); diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c new file mode 100644 index 000000000000..76d83199545c --- /dev/null +++ b/tools/sched_ext/scx_simple.c @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_simple.bpf.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-f] [-v]\n" +"\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(simple_ops, scx_simple); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore new file mode 100644 index 000000000000..ae5491a114c0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/.gitignore @@ -0,0 +1,6 @@ +* +!*.c +!*.h +!Makefile +!.gitignore +!config diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile new file mode 100644 index 000000000000..0754a2c110a1 --- /dev/null +++ b/tools/testing/selftests/sched_ext/Makefile @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) +TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread -lzstd + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ + grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') + +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') +endef + +BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ + -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ + -I$(INCLUDE_DIR) -I$(APIDIR) -I$(SCXTOOLSINCDIR) \ + -I$(REPOROOT)/include \ + $(call get_sys_includes,$(CLANG)) \ + -Wall -Wno-compare-distinct-pointer-types \ + -Wno-incompatible-function-pointer-types \ + -O2 -mcpu=v3 + +# sort removes libbpf duplicates when not cross-building +MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(OBJ_DIR)/libbpf \ + $(OBJ_DIR)/bpftool $(OBJ_DIR)/resolve_btfids \ + $(INCLUDE_DIR) $(SCXOBJ_DIR)) + +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + $(APIDIR)/linux/bpf.h \ + | $(OBJ_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0 -fPIC' \ + DESTDIR=$(OUTPUT_DIR) prefix= all install_headers + +$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(LIBBPF_OUTPUT) | $(OBJ_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ + ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -O0' \ + OUTPUT=$(OBJ_DIR)/bpftool/ \ + LIBBPF_OUTPUT=$(OBJ_DIR)/libbpf/ \ + LIBBPF_DESTDIR=$(OUTPUT_DIR)/ \ + prefix= DESTDIR=$(OUTPUT_DIR)/ install-bin + +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) +ifeq ($(VMLINUX_H),) + $(call msg,GEN,,$@) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +################ +# C schedulers # +################ + +override define CLEAN + rm -rf $(OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(TEST_GEN_PROGS) + rm -f runner +endef + +# Every testcase takes all of the BPF progs are dependencies by default. This +# allows testcases to load any BPF scheduler, which is useful for testcases +# that don't need their own prog to run their test. +all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) + +auto-test-targets := \ + create_dsq \ + enq_last_no_enq_fails \ + enq_select_cpu_fails \ + ddsp_bogus_dsq_fail \ + ddsp_vtimelocal_fail \ + dsp_local_on \ + exit \ + hotplug \ + init_enable_count \ + maximal \ + maybe_null \ + minimal \ + prog_run \ + reload_loop \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_bad_dsq \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime \ + test_example \ + +testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) + +$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +# Create all of the test targets object files, whose testcase objects will be +# registered into the runner in ELF constructors. +# +# Note that we must do double expansion here in order to support conditionally +# compiling BPF object files only if one is present, as the wildcard Make +# function doesn't support using implicit rules otherwise. +$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) + $(eval test=$(patsubst %.o,%.c,$(notdir $@))) + $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o + +$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) + @echo "$(testcase-targets)" + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +TEST_GEN_PROGS := runner + +all: runner + +.PHONY: all clean help + +.DEFAULT_GOAL := all + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config new file mode 100644 index 000000000000..0de9b4ee249d --- /dev/null +++ b/tools/testing/selftests/sched_ext/config @@ -0,0 +1,9 @@ +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_EXT_GROUP_SCHED=y +CONFIG_BPF=y +CONFIG_BPF_SYSCALL=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_BTF=y diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c new file mode 100644 index 000000000000..23f79ed343f0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Create and destroy DSQs in a loop. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + scx_bpf_destroy_dsq(p->pid); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + s32 err; + + err = scx_bpf_create_dsq(p->pid, -1); + if (err) + scx_bpf_error("Failed to create DSQ for %s[%d]", + p->comm, p->pid); + + return err; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) +{ + u32 i; + s32 err; + + bpf_for(i, 0, 1024) { + err = scx_bpf_create_dsq(i, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d", i); + return 0; + } + } + + bpf_for(i, 0, 1024) { + scx_bpf_destroy_dsq(i); + } + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops create_dsq_ops = { + .init_task = create_dsq_init_task, + .exit_task = create_dsq_exit_task, + .init = create_dsq_init, + .name = "create_dsq", +}; diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c new file mode 100644 index 000000000000..fa946d9146d4 --- /dev/null +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "create_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct create_dsq *skel; + + skel = create_dsq__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct create_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct create_dsq *skel = ctx; + + create_dsq__destroy(skel); +} + +struct scx_test create_dsq = { + .name = "create_dsq", + .description = "Create and destroy a dsq in a loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&create_dsq) diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c new file mode 100644 index 000000000000..e97ad41d354a --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. + */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c new file mode 100644 index 000000000000..e65d22f23f3b --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 000000000000..dde7e7dafbfb --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .exit = ddsp_vtimelocal_fail_exit, + .name = "ddsp_vtimelocal_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c new file mode 100644 index 000000000000..abafee587cd6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <unistd.h> +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c new file mode 100644 index 000000000000..efb4672decb4 --- /dev/null +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; +const volatile s32 nr_cpus; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 8192); + __type(value, s32); +} queue SEC(".maps"); + +s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p, + u64 enq_flags) +{ + s32 pid = p->pid; + + if (bpf_map_push_elem(&queue, &pid, 0)) + scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid); +} + +void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) +{ + s32 pid, target; + struct task_struct *p; + + if (bpf_map_pop_elem(&queue, &pid)) + return; + + p = bpf_task_from_pid(pid); + if (!p) + return; + + target = bpf_get_prandom_u32() % nr_cpus; + + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + bpf_task_release(p); +} + +void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops dsp_local_on_ops = { + .select_cpu = dsp_local_on_select_cpu, + .enqueue = dsp_local_on_enqueue, + .dispatch = dsp_local_on_dispatch, + .exit = dsp_local_on_exit, + .name = "dsp_local_on", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c new file mode 100644 index 000000000000..472851b56854 --- /dev/null +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <unistd.h> +#include "dsp_local_on.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct dsp_local_on *skel; + + skel = dsp_local_on__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct dsp_local_on *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + /* Just sleeping is fine, plenty of scheduling events happening */ + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct dsp_local_on *skel = ctx; + + dsp_local_on__destroy(skel); +} + +struct scx_test dsp_local_on = { + .name = "dsp_local_on", + .description = "Verify we can directly dispatch tasks to a local DSQs " + "from osp.dispatch()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&dsp_local_on) diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c new file mode 100644 index 000000000000..b0b99531d5d5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c new file mode 100644 index 000000000000..2a3eda5e2c0b --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_last_no_enq_fails *skel; + + skel = enq_last_no_enq_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + if (link) { + SCX_ERR("Incorrectly succeeded in to attaching scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); +} + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + "ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_last_no_enq_fails) diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c new file mode 100644 index 000000000000..b3dfc1033cd6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + u64 enq_flags) +{ + /* + * Need to initialize the variable or the verifier will fail to load. + * Improving these semantics is actively being worked on. + */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c new file mode 100644 index 000000000000..dd1350e5f002 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c new file mode 100644 index 000000000000..ae12ddaac921 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +#include "exit_test.h" + +const volatile int exit_point; +UEI_DEFINE(uei); + +#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) + +s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + bool found; + + if (exit_point == EXIT_SELECT_CPU) + EXIT_CLEANLY(); + + return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); +} + +void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +{ + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +{ + if (exit_point == EXIT_ENABLE) + EXIT_CLEANLY(); +} + +s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (exit_point == EXIT_INIT_TASK) + EXIT_CLEANLY(); + + return 0; +} + +void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) +{ + if (exit_point == EXIT_INIT) + EXIT_CLEANLY(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops exit_ops = { + .select_cpu = exit_select_cpu, + .enqueue = exit_enqueue, + .dispatch = exit_dispatch, + .init_task = exit_init_task, + .enable = exit_enable, + .exit = exit_exit, + .init = exit_init, + .name = "exit", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c new file mode 100644 index 000000000000..31bcd06e21cd --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <sched.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "exit.bpf.skel.h" +#include "scx_test.h" + +#include "exit_test.h" + +static enum scx_test_status run(void *ctx) +{ + enum exit_test_case tc; + + for (tc = 0; tc < NUM_EXITS; tc++) { + struct exit *skel; + struct bpf_link *link; + char buf[16]; + + skel = exit__open(); + skel->rodata->exit_point = tc; + exit__load(skel); + link = bpf_map__attach_struct_ops(skel->maps.exit_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + exit__destroy(skel); + return SCX_TEST_FAIL; + } + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, tc); + sprintf(buf, "%d", tc); + SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); + bpf_link__destroy(link); + exit__destroy(skel); + } + + return SCX_TEST_PASS; +} + +struct scx_test exit_test = { + .name = "exit", + .description = "Verify we can cleanly exit a scheduler in multiple places", + .run = run, +}; +REGISTER_SCX_TEST(&exit_test) diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h new file mode 100644 index 000000000000..94f0268b9cb8 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit_test.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#ifndef __EXIT_TEST_H__ +#define __EXIT_TEST_H__ + +enum exit_test_case { + EXIT_SELECT_CPU, + EXIT_ENQUEUE, + EXIT_DISPATCH, + EXIT_ENABLE, + EXIT_INIT_TASK, + EXIT_INIT, + NUM_EXITS, +}; + +#endif // # __EXIT_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c new file mode 100644 index 000000000000..8f2601db39f3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +#include "hotplug_test.h" + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +static void exit_from_hotplug(s32 cpu, bool onlining) +{ + /* + * Ignored, just used to verify that we can invoke blocking kfuncs + * from the hotplug path. + */ + scx_bpf_create_dsq(0, -1); + + s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; + + if (onlining) + code |= HOTPLUG_ONLINING; + + scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, + onlining ? "online" : "offline"); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) +{ + exit_from_hotplug(cpu, true); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) +{ + exit_from_hotplug(cpu, false); +} + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_cb_ops = { + .cpu_online = hotplug_cpu_online, + .cpu_offline = hotplug_cpu_offline, + .exit = hotplug_exit, + .name = "hotplug_cbs", + .timeout_ms = 1000U, +}; + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_nocb_ops = { + .exit = hotplug_exit, + .name = "hotplug_nocbs", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c new file mode 100644 index 000000000000..87bf220b1bce --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <sched.h> +#include <scx/common.h> +#include <sched.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "hotplug_test.h" +#include "hotplug.bpf.skel.h" +#include "scx_test.h" +#include "util.h" + +const char *online_path = "/sys/devices/system/cpu/cpu1/online"; + +static bool is_cpu_online(void) +{ + return file_read_long(online_path) > 0; +} + +static void toggle_online_status(bool online) +{ + long val = online ? 1 : 0; + int ret; + + ret = file_write_long(online_path, val); + if (ret != 0) + fprintf(stderr, "Failed to bring CPU %s (%s)", + online ? "online" : "offline", strerror(errno)); +} + +static enum scx_test_status setup(void **ctx) +{ + if (!is_cpu_online()) + return SCX_TEST_SKIP; + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) +{ + struct hotplug *skel; + struct bpf_link *link; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + + skel = hotplug__open_and_load(); + SCX_ASSERT(skel); + + /* Testing the offline -> online path, so go offline before starting */ + if (onlining) + toggle_online_status(0); + + if (cbs_defined) { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; + if (onlining) + code |= HOTPLUG_ONLINING; + } else { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + } + + if (cbs_defined) + link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); + else + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + if (!link) { + SCX_ERR("Failed to attach scheduler"); + hotplug__destroy(skel); + return SCX_TEST_FAIL; + } + + toggle_online_status(onlining ? 1 : 0); + + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + if (!onlining) + toggle_online_status(1); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug_attach(void) +{ + struct hotplug *skel; + struct bpf_link *link; + enum scx_test_status status = SCX_TEST_PASS; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + SCX_ASSERT(scx_hotplug_seq() > 0); + + skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); + SCX_ASSERT(skel); + + SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); + + /* + * Take the CPU offline to increment the global hotplug seq, which + * should cause attach to fail due to us setting the hotplug seq above + */ + toggle_online_status(0); + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + toggle_online_status(1); + + SCX_ASSERT(link); + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return status; +} + +static enum scx_test_status run(void *ctx) +{ + +#define HP_TEST(__onlining, __cbs_defined) ({ \ + if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ + return SCX_TEST_FAIL; \ +}) + + HP_TEST(true, true); + HP_TEST(false, true); + HP_TEST(true, false); + HP_TEST(false, false); + +#undef HP_TEST + + return test_hotplug_attach(); +} + +static void cleanup(void *ctx) +{ + toggle_online_status(1); +} + +struct scx_test hotplug_test = { + .name = "hotplug", + .description = "Verify hotplug behavior", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&hotplug_test) diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h new file mode 100644 index 000000000000..73d236f90787 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug_test.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#ifndef __HOTPLUG_TEST_H__ +#define __HOTPLUG_TEST_H__ + +enum hotplug_test_flags { + HOTPLUG_EXIT_RSN = 1LLU << 0, + HOTPLUG_ONLINING = 1LLU << 1, +}; + +#endif // # __HOTPLUG_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c new file mode 100644 index 000000000000..47ea89a626c3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +u64 init_fork_cnt, init_transition_cnt; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + if (args->fork) + __sync_fetch_and_add(&init_fork_cnt, 1); + else + __sync_fetch_and_add(&init_transition_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c new file mode 100644 index 000000000000..97d45f1e5597 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <stdio.h> +#include <unistd.h> +#include <sched.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static enum scx_test_status run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5, num_pre_forks = 1024; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_pre_forks]; + + skel = open_load_prog(global); + + /* + * Fork a bunch of children before we attach the scheduler so that we + * ensure (at least in practical terms) that there are more tasks that + * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that + * take the fork() path either below or in other processes. + */ + for (i = 0; i < num_pre_forks; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + for (i = 0; i < num_pre_forks; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for pre-forked child\n"); + + SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + SCX_GE(skel->bss->init_task_cnt, num_pre_forks); + SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. + */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + /* + * We forked a ton of tasks before we attached the scheduler above, so + * this should be fine. Technically it could be flaky if a ton of forks + * are happening at the same time in other processes, but that should + * be exceedingly unlikely. + */ + SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); + SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); + + init_enable_count__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + enum scx_test_status status; + + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; + + return run_test(false); +} + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c new file mode 100644 index 000000000000..44612fdaf399 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. + * + * This scheduler defines every callback. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) +{ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +{} + +void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +{} + +void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) +{} + +void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) +{} + +bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, + struct task_struct *to) +{ + return false; +} + +bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, + struct task_struct *b) +{ + return false; +} + +void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) +{} + +void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{} + +void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) +{} + +void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, + struct scx_cpu_acquire_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, + struct scx_cpu_release_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) +{} + +void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) +{} + +s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{} + +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops maximal_ops = { + .select_cpu = maximal_select_cpu, + .enqueue = maximal_enqueue, + .dequeue = maximal_dequeue, + .dispatch = maximal_dispatch, + .runnable = maximal_runnable, + .running = maximal_running, + .stopping = maximal_stopping, + .quiescent = maximal_quiescent, + .yield = maximal_yield, + .core_sched_before = maximal_core_sched_before, + .set_weight = maximal_set_weight, + .set_cpumask = maximal_set_cpumask, + .update_idle = maximal_update_idle, + .cpu_acquire = maximal_cpu_acquire, + .cpu_release = maximal_cpu_release, + .cpu_online = maximal_cpu_online, + .cpu_offline = maximal_cpu_offline, + .init_task = maximal_init_task, + .enable = maximal_enable, + .exit_task = maximal_exit_task, + .disable = maximal_disable, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", +}; diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c new file mode 100644 index 000000000000..f38fc973c380 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct maximal *skel; + + skel = maximal__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct maximal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct maximal *skel = ctx; + + maximal__destroy(skel); +} + +struct scx_test maximal = { + .name = "maximal", + .description = "Verify we can load a scheduler with every callback defined", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&maximal) diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c new file mode 100644 index 000000000000..27d0f386acfb --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from, + struct task_struct *to) +{ + if (to) + bpf_printk("Yielding to %s[%d]", to->comm, to->pid); + + return false; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .yield = maybe_null_success_yield, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c new file mode 100644 index 000000000000..31cfafb0cf65 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "maybe_null.bpf.skel.h" +#include "maybe_null_fail_dsp.bpf.skel.h" +#include "maybe_null_fail_yld.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status run(void *ctx) +{ + struct maybe_null *skel; + struct maybe_null_fail_dsp *fail_dsp; + struct maybe_null_fail_yld *fail_yld; + + skel = maybe_null__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load maybe_null skel"); + return SCX_TEST_FAIL; + } + maybe_null__destroy(skel); + + fail_dsp = maybe_null_fail_dsp__open_and_load(); + if (fail_dsp) { + maybe_null_fail_dsp__destroy(fail_dsp); + SCX_ERR("Should failed to open and load maybe_null_fail_dsp skel"); + return SCX_TEST_FAIL; + } + + fail_yld = maybe_null_fail_yld__open_and_load(); + if (fail_yld) { + maybe_null_fail_yld__destroy(fail_yld); + SCX_ERR("Should failed to open and load maybe_null_fail_yld skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +struct scx_test maybe_null = { + .name = "maybe_null", + .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .run = run, +}; +REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c new file mode 100644 index 000000000000..c0641050271d --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) +{ + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .dispatch = maybe_null_fail_dispatch, + .enable = maybe_null_running, + .name = "maybe_null_fail_dispatch", +}; diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c new file mode 100644 index 000000000000..3c1740028e3b --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from, + struct task_struct *to) +{ + bpf_printk("Yielding to %s[%d]", to->comm, to->pid); + + return false; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .yield = maybe_null_fail_yield, + .enable = maybe_null_running, + .name = "maybe_null_fail_yield", +}; diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c new file mode 100644 index 000000000000..6a7eccef0104 --- /dev/null +++ b/tools/testing/selftests/sched_ext/minimal.bpf.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A completely minimal scheduler. + * + * This scheduler defines the absolute minimal set of struct sched_ext_ops + * fields: its name. It should _not_ fail to be loaded, and can be used to + * exercise the default scheduling paths in ext.c. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops minimal_ops = { + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c new file mode 100644 index 000000000000..6c5db8ebbf8a --- /dev/null +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c new file mode 100644 index 000000000000..6a4d7c48e3f2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that we can invoke sched_ext kfuncs in + * BPF_PROG_TYPE_SYSCALL programs. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ + +#include <scx/common.bpf.h> + +UEI_DEFINE(uei); + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +int BPF_PROG(prog_run_syscall) +{ + scx_bpf_create_dsq(0, -1); + scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); + return 0; +} + +void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops prog_run_ops = { + .exit = prog_run_exit, + .name = "prog_run", +}; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c new file mode 100644 index 000000000000..3cd57ef8daaa --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <sched.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "prog_run.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct prog_run *skel; + + skel = prog_run__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct prog_run *skel = ctx; + struct bpf_link *link; + int prog_fd, err = 0; + + prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); + if (prog_fd < 0) { + SCX_ERR("Failed to get BPF_PROG_RUN prog"); + return SCX_TEST_FAIL; + } + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + close(prog_fd); + return SCX_TEST_FAIL; + } + + err = bpf_prog_test_run_opts(prog_fd, &topts); + SCX_EQ(err, 0); + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); + close(prog_fd); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct prog_run *skel = ctx; + + prog_run__destroy(skel); +} + +struct scx_test prog_run = { + .name = "prog_run", + .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&prog_run) diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c new file mode 100644 index 000000000000..5cfba2d6e056 --- /dev/null +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <pthread.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static struct maximal *skel; +static pthread_t threads[2]; + +bool force_exit = false; + +static enum scx_test_status setup(void **ctx) +{ + skel = maximal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +static void *do_reload_loop(void *arg) +{ + u32 i; + + for (i = 0; i < 1024 && !force_exit; i++) { + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + if (link) + bpf_link__destroy(link); + } + + return NULL; +} + +static enum scx_test_status run(void *ctx) +{ + int err; + void *ret; + + err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 0"); + + err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 1"); + + SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); + SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + force_exit = true; + maximal__destroy(skel); +} + +struct scx_test reload_loop = { + .name = "reload_loop", + .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&reload_loop) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c new file mode 100644 index 000000000000..eab48c7ff309 --- /dev/null +++ b/tools/testing/selftests/sched_ext/runner.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." +"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -s Include print output for skipped tests\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet, print_skipped; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + default: + return "<UNKNOWN>"; + } +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? "SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qst:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 's': + print_skipped = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + if (filter && should_skip_test(test, filter)) { + /* + * Printing the skipped tests and their preambles can + * add a lot of noise to the runner output. Printing + * this is only really useful for CI, so let's skip it + * by default. + */ + if (print_skipped) { + print_test_preamble(test, quiet); + print_test_result(test, SCX_TEST_SKIP, ++testnum); + } + continue; + } + + print_test_preamble(test, quiet); + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h new file mode 100644 index 000000000000..90b8d6915bb7 --- /dev/null +++ b/tools/testing/selftests/sched_ext/scx_test.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include <errno.h> +#include <scx/common.h> +#include <scx/compat.h> + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) + +struct scx_test { + /** + * name - The name of the testcase. + */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#define SCX_ECODE_VAL(__ecode) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ + SCX_ASSERT(__found); \ + (s64)__val; \ +}) + +#define SCX_KIND_VAL(__kind) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ + SCX_ASSERT(__found); \ + __val; \ +}) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c new file mode 100644 index 000000000000..2ed2991afafe --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +static bool task_is_test(const struct task_struct *p) +{ + return !bpf_strncmp(p->comm, 9, "select_cpu"); +} + +void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + u64 enq_flags) +{ + const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); + + if (task_is_test(p) && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { + saw_local = true; + } + scx_bpf_put_idle_cpumask(idle_mask); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_ops = { + .enqueue = select_cpu_dfl_enqueue, + .name = "select_cpu_dfl", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c new file mode 100644 index 000000000000..a53a40c2d2f0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dfl.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl *skel; + + skel = select_cpu_dfl__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(!skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + + select_cpu_dfl__destroy(skel); +} + +struct scx_test select_cpu_dfl = { + .name = "select_cpu_dfl", + .description = "Verify the default ops.select_cpu() dispatches tasks " + "when idles cores are found, and skips ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c new file mode 100644 index 000000000000..4bb5abb2d369 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag + * specified. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c new file mode 100644 index 000000000000..1d85bf4bf3a3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c new file mode 100644 index 000000000000..f0b96a4a04b2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c new file mode 100644 index 000000000000..0309ca8785b3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 000000000000..7b42ddce0f56 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 000000000000..47eb6ed7627d --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 000000000000..653e3dc0b4dc --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 000000000000..48ff028a3c46 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c new file mode 100644 index 000000000000..7f3ebf4fc2ea --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c new file mode 100644 index 000000000000..b4629c2364f5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_vtime *skel; + + skel = select_cpu_vtime__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); +} + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c new file mode 100644 index 000000000000..ce36cdf03cdc --- /dev/null +++ b/tools/testing/selftests/sched_ext/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_test.h" + +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +static int context = 10; + +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +static void cleanup (void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example) diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c new file mode 100644 index 000000000000..e47769c91918 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <dvernet@meta.com> + */ +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* Returns read len on success, or -errno on failure. */ +static ssize_t read_text(const char *path, char *buf, size_t max_len) +{ + ssize_t len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + len = read(fd, buf, max_len - 1); + + if (len >= 0) + buf[len] = 0; + + close(fd); + return len < 0 ? -errno : len; +} + +/* Returns written len on success, or -errno on failure. */ +static ssize_t write_text(const char *path, char *buf, ssize_t len) +{ + int fd; + ssize_t written; + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return -errno; + + written = write(fd, buf, len); + close(fd); + return written < 0 ? -errno : written; +} + +long file_read_long(const char *path) +{ + char buf[128]; + + + if (read_text(path, buf, sizeof(buf)) <= 0) + return -1; + + return atol(buf); +} + +int file_write_long(const char *path, long val) +{ + char buf[64]; + int ret; + + ret = sprintf(buf, "%lu", val); + if (ret < 0) + return ret; + + if (write_text(path, buf, sizeof(buf)) <= 0) + return -1; + + return 0; +} diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h new file mode 100644 index 000000000000..bc13dfec1267 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet <void@manifault.com> + */ + +#ifndef __SCX_TEST_UTIL_H__ +#define __SCX_TEST_UTIL_H__ + +long file_read_long(const char *path); +int file_write_long(const char *path, long val); + +#endif // __SCX_TEST_H__ |