diff options
Diffstat (limited to 'samples')
222 files changed, 10124 insertions, 9653 deletions
diff --git a/samples/Kconfig b/samples/Kconfig index 9d236c346de5..0d81c00289ee 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -6,11 +6,21 @@ menuconfig SAMPLES if SAMPLES +config SAMPLE_AUXDISPLAY + bool "auxdisplay sample" + depends on CC_CAN_LINK + config SAMPLE_TRACE_EVENTS tristate "Build trace_events examples -- loadable modules only" depends on EVENT_TRACING && m help - This build trace event example modules. + This builds the trace event example module. + +config SAMPLE_TRACE_CUSTOM_EVENTS + tristate "Build custom trace event example -- loadable modules only" + depends on EVENT_TRACING && m + help + This builds the custom trace event example module. config SAMPLE_TRACE_PRINTK tristate "Build trace_printk module - tests various trace_printk formats" @@ -22,11 +32,20 @@ config SAMPLE_TRACE_PRINTK config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m - depends on X86_64 # has x86_64 inlined asm + depends on HAVE_SAMPLE_FTRACE_DIRECT help This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. +config SAMPLE_FTRACE_DIRECT_MULTI + tristate "Build register_ftrace_direct_multi() example" + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m + depends on HAVE_SAMPLE_FTRACE_DIRECT_MULTI + help + This builds an ftrace direct function example + that hooks to wake_up_process and schedule, and prints + the function addresses. + config SAMPLE_TRACE_ARRAY tristate "Build sample module for kernel access to Ftrace instancess" depends on EVENT_TRACING && m @@ -60,6 +79,13 @@ config SAMPLE_HW_BREAKPOINT help This builds kernel hardware breakpoint example modules. +config SAMPLE_FPROBE + tristate "Build fprobe examples -- loadable modules only" + depends on FPROBE && m + help + This builds a fprobe example module. This module has an option 'symbol'. + You can specify a probed symbol or symbols separated with ','. + config SAMPLE_KFIFO tristate "Build kfifo examples -- loadable modules only" depends on m @@ -116,31 +142,57 @@ config SAMPLE_CONNECTOR with it. See also Documentation/driver-api/connector.rst +config SAMPLE_FANOTIFY_ERROR + bool "Build fanotify error monitoring sample" + depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL + help + When enabled, this builds an example code that uses the + FAN_FS_ERROR fanotify mechanism to monitor filesystem + errors. + See also Documentation/admin-guide/filesystem-monitoring.rst. + config SAMPLE_HIDRAW bool "hidraw sample" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_LANDLOCK + bool "Landlock example" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build a simple Landlock sandbox manager able to start a process + restricted by a user-defined filesystem access control policy. config SAMPLE_PIDFD bool "pidfd sample" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL config SAMPLE_SECCOMP bool "Build seccomp sample code" - depends on SECCOMP_FILTER && HEADERS_INSTALL + depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL help Build samples of seccomp filters using various methods of BPF filter construction. +config SAMPLE_TIMER + bool "Timer sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_UHID + bool "UHID sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build UHID sample program. + config SAMPLE_VFIO_MDEV_MTTY tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual tty sample driver for use as a VFIO mediated device config SAMPLE_VFIO_MDEV_MDPY tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual display sample driver for use as a VFIO mediated device. It is a simple framebuffer and supports @@ -157,7 +209,7 @@ config SAMPLE_VFIO_MDEV_MDPY_FB config SAMPLE_VFIO_MDEV_MBOCHS tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m select DMA_SHARED_BUFFER help Build a virtual display sample driver for use as a VFIO @@ -171,14 +223,14 @@ config SAMPLE_VFIO_MDEV_MBOCHS config SAMPLE_ANDROID_BINDERFS bool "Build Android binderfs example" - depends on CONFIG_ANDROID_BINDERFS + depends on CC_CAN_LINK && HEADERS_INSTALL help Builds a sample program to illustrate the use of the Android binderfs filesystem. config SAMPLE_VFS bool "Build example programs that use new VFS system calls" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL help Build example userspace programs that use new VFS system calls such as mount API and statx(). Note that this is restricted to the x86 @@ -187,8 +239,36 @@ config SAMPLE_VFS config SAMPLE_INTEL_MEI bool "Build example program working with intel mei driver" depends on INTEL_MEI + depends on CC_CAN_LINK && HEADERS_INSTALL help Build a sample program to work with mei device. +config SAMPLE_WATCHDOG + bool "watchdog sample" + depends on CC_CAN_LINK + +config SAMPLE_WATCH_QUEUE + bool "Build example watch_queue notification API consumer" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build example userspace program to use the new mount_notify(), + sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. + +config SAMPLE_CORESIGHT_SYSCFG + tristate "Build example loadable module for CoreSight config" + depends on CORESIGHT && m + help + Build an example loadable module that adds new CoreSight features + and configuration using the CoreSight system configuration API. + This demonstrates how a user may create their own CoreSight + configurations and easily load them into the system at runtime. + +source "samples/rust/Kconfig" endif # SAMPLES + +config HAVE_SAMPLE_FTRACE_DIRECT + bool + +config HAVE_SAMPLE_FTRACE_DIRECT_MULTI + bool diff --git a/samples/Makefile b/samples/Makefile index f8f847b4f61f..9832ef3f8fcb 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,26 +1,38 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Linux samples code -OBJECT_FILES_NON_STANDARD := y -obj-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs/ +subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay +subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ +obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ obj-$(CONFIG_SAMPLE_KDB) += kdb/ obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/ obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/ obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/ +subdir-$(CONFIG_SAMPLE_LANDLOCK) += landlock obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/ subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/ obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/ subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp +subdir-$(CONFIG_SAMPLE_TIMER) += timers obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ +subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ obj-y += vfio-mdev/ subdir-$(CONFIG_SAMPLE_VFS) += vfs obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ +subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog +subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/ +obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/ +obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/ +obj-$(CONFIG_SAMPLES_RUST) += rust/ diff --git a/samples/acrn/Makefile b/samples/acrn/Makefile new file mode 100644 index 000000000000..c8e3ed9785e9 --- /dev/null +++ b/samples/acrn/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +.PHONY: vm-sample + +vm-sample: vm-sample.o payload.o + $(CC) $^ -o $@ + +payload.o: payload.ld guest16.o + $(LD) -T $< -o $@ + +clean: + rm *.o vm-sample diff --git a/samples/acrn/guest.ld b/samples/acrn/guest.ld new file mode 100644 index 000000000000..5127c682bd22 --- /dev/null +++ b/samples/acrn/guest.ld @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +OUTPUT_FORMAT(binary) +SECTIONS +{ + .start : { *(.start) } + .text : { *(.text*) } + .rodata : { *(.rodata) } + .data : { *(.data) } +} diff --git a/samples/acrn/payload.ld b/samples/acrn/payload.ld new file mode 100644 index 000000000000..e8d9a498ad62 --- /dev/null +++ b/samples/acrn/payload.ld @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +SECTIONS +{ + .payload16 0 : { + guest16 = .; + guest16.o(.text) + guest16_end = .; + } +} diff --git a/samples/acrn/vm-sample.c b/samples/acrn/vm-sample.c new file mode 100644 index 000000000000..b2dad47a77a0 --- /dev/null +++ b/samples/acrn/vm-sample.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A sample program to run a User VM on the ACRN hypervisor + * + * This sample runs in a Service VM, which is a privileged VM of ACRN. + * CONFIG_ACRN_HSM need to be enabled in the Service VM. + * + * Guest VM code in guest16.s will be executed after the VM launched. + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + */ +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <malloc.h> +#include <fcntl.h> +#include <unistd.h> +#include <signal.h> +#include <sys/ioctl.h> +#include <linux/acrn.h> + +#define GUEST_MEMORY_SIZE (1024*1024) +void *guest_memory; + +extern const unsigned char guest16[], guest16_end[]; +static char io_request_page[4096] __attribute__((aligned(4096))); +static struct acrn_io_request *io_req_buf = (struct acrn_io_request *)io_request_page; + +__u16 vcpu_num; +__u16 vmid; +/* POST_STANDARD_VM_UUID1, refer to https://github.com/projectacrn/acrn-hypervisor/blob/master/hypervisor/include/common/vm_uuids.h */ +guid_t vm_uuid = GUID_INIT(0x385479d2, 0xd625, 0xe811, 0x86, 0x4e, 0xcb, 0x7a, 0x18, 0xb3, 0x46, 0x43); + +int hsm_fd; +int is_running = 1; + +void vm_exit(int sig) +{ + sig = sig; + + is_running = 0; + ioctl(hsm_fd, ACRN_IOCTL_PAUSE_VM, vmid); + ioctl(hsm_fd, ACRN_IOCTL_DESTROY_IOREQ_CLIENT, 0); +} + +int main(int argc, char **argv) +{ + int vcpu_id, ret; + struct acrn_vm_creation create_vm = {0}; + struct acrn_vm_memmap ram_map = {0}; + struct acrn_vcpu_regs regs; + struct acrn_io_request *io_req; + struct acrn_ioreq_notify __attribute__((aligned(8))) notify; + + argc = argc; + argv = argv; + + guest_memory = memalign(4096, GUEST_MEMORY_SIZE); + if (!guest_memory) { + printf("No enough memory!\n"); + return -1; + } + hsm_fd = open("/dev/acrn_hsm", O_RDWR|O_CLOEXEC); + + memcpy(&create_vm.uuid, &vm_uuid, 16); + create_vm.ioreq_buf = (__u64)io_req_buf; + ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_VM, &create_vm); + printf("Created VM! [%d]\n", ret); + vcpu_num = create_vm.vcpu_num; + vmid = create_vm.vmid; + + /* setup guest memory */ + ram_map.type = ACRN_MEMMAP_RAM; + ram_map.vma_base = (__u64)guest_memory; + ram_map.len = GUEST_MEMORY_SIZE; + ram_map.user_vm_pa = 0; + ram_map.attr = ACRN_MEM_ACCESS_RWX; + ret = ioctl(hsm_fd, ACRN_IOCTL_SET_MEMSEG, &ram_map); + printf("Set up VM memory! [%d]\n", ret); + + memcpy(guest_memory, guest16, guest16_end-guest16); + + /* setup vcpu registers */ + memset(®s, 0, sizeof(regs)); + regs.vcpu_id = 0; + regs.vcpu_regs.rip = 0; + + /* CR0_ET | CR0_NE */ + regs.vcpu_regs.cr0 = 0x30U; + regs.vcpu_regs.cs_ar = 0x009FU; + regs.vcpu_regs.cs_sel = 0xF000U; + regs.vcpu_regs.cs_limit = 0xFFFFU; + regs.vcpu_regs.cs_base = 0 & 0xFFFF0000UL; + regs.vcpu_regs.rip = 0 & 0xFFFFUL; + + ret = ioctl(hsm_fd, ACRN_IOCTL_SET_VCPU_REGS, ®s); + printf("Set up VM BSP registers! [%d]\n", ret); + + /* create an ioreq client for this VM */ + ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_IOREQ_CLIENT, 0); + printf("Created IO request client! [%d]\n", ret); + + /* run vm */ + ret = ioctl(hsm_fd, ACRN_IOCTL_START_VM, vmid); + printf("Start VM! [%d]\n", ret); + + signal(SIGINT, vm_exit); + while (is_running) { + ret = ioctl(hsm_fd, ACRN_IOCTL_ATTACH_IOREQ_CLIENT, 0); + + for (vcpu_id = 0; vcpu_id < vcpu_num; vcpu_id++) { + io_req = &io_req_buf[vcpu_id]; + if ((__sync_add_and_fetch(&io_req->processed, 0) == ACRN_IOREQ_STATE_PROCESSING) + && (!io_req->kernel_handled)) + if (io_req->type == ACRN_IOREQ_TYPE_PORTIO) { + int bytes, port, in; + + port = io_req->reqs.pio_request.address; + bytes = io_req->reqs.pio_request.size; + in = (io_req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ); + printf("Guest VM %s PIO[%x] with size[%x]\n", in ? "read" : "write", port, bytes); + + notify.vmid = vmid; + notify.vcpu = vcpu_id; + ioctl(hsm_fd, ACRN_IOCTL_NOTIFY_REQUEST_FINISH, ¬ify); + } + } + } + + ret = ioctl(hsm_fd, ACRN_IOCTL_DESTROY_VM, NULL); + printf("Destroy VM! [%d]\n", ret); + close(hsm_fd); + free(guest_memory); + return 0; +} diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore index 7af222860a96..d023816849bd 100644 --- a/samples/auxdisplay/.gitignore +++ b/samples/auxdisplay/.gitignore @@ -1 +1,2 @@ -cfag12864b-example +# SPDX-License-Identifier: GPL-2.0-only +/cfag12864b-example diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile index 0273bab27233..19d5568938c3 100644 --- a/samples/auxdisplay/Makefile +++ b/samples/auxdisplay/Makefile @@ -1,10 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 -CC := $(CROSS_COMPILE)gcc -CFLAGS := -I../../usr/include - -PROGS := cfag12864b-example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) +userprogs-always-y += cfag12864b-example diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index bfeab44f81d0..2e3bb7375c99 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: cfag12864b LCD userspace example program * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org> * Date: 2006-10-31 */ diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore new file mode 100644 index 000000000000..8fa415a3640b --- /dev/null +++ b/samples/binderfs/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +/binderfs_example diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile index ea4c93d36256..629e43b9b129 100644 --- a/samples/binderfs/Makefile +++ b/samples/binderfs/Makefile @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs_example.o +userprogs-always-y += binderfs_example + +userccflags += -I usr/include diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c index 5bbd2ebc0aea..0fd92cdda460 100644 --- a/samples/binderfs/binderfs_example.c +++ b/samples/binderfs/binderfs_example.c @@ -18,7 +18,6 @@ int main(int argc, char *argv[]) { int fd, ret, saved_errno; - size_t len; struct binderfs_device device = { 0 }; ret = unshare(CLONE_NEWNS); diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 74d31fd3c99c..0e7bfdbff80a 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only cpustat fds_example hbm @@ -44,8 +45,19 @@ xdp_monitor xdp_redirect xdp_redirect_cpu xdp_redirect_map +xdp_redirect_map_multi xdp_router_ipv4 xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +xdpsock_ctrl_proc +xsk_fwd +testfile.img +hbm_out.log +iperf.* +*.out +*.skel.h +/vmlinux.h +/bpftool/ +/libbpf/ diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 79b0fee6943b..727da3c5879b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -3,6 +3,8 @@ BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src)) TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools +pound := \# + # List of programs to build tprogs-y := test_lru_dist tprogs-y += sock_example @@ -39,76 +41,83 @@ tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map tprogs-y += per_socket_stats_example -tprogs-y += xdp_redirect -tprogs-y += xdp_redirect_map -tprogs-y += xdp_redirect_cpu -tprogs-y += xdp_monitor tprogs-y += xdp_rxq_info tprogs-y += syscall_tp tprogs-y += cpustat tprogs-y += xdp_adjust_tail -tprogs-y += xdpsock tprogs-y += xdp_fwd tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += xdp_redirect_cpu +tprogs-y += xdp_redirect_map_multi +tprogs-y += xdp_redirect_map +tprogs-y += xdp_redirect +tprogs-y += xdp_monitor + # Libbpf dependencies -LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a +LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf +LIBBPF_OUTPUT = $(abspath $(BPF_SAMPLES_PATH))/libbpf +LIBBPF_DESTDIR = $(LIBBPF_OUTPUT) +LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include +LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o +XDP_SAMPLE := xdp_sample_user.o fds_example-objs := fds_example.o sockex1-objs := sockex1_user.o sockex2-objs := sockex2_user.o -sockex3-objs := bpf_load.o sockex3_user.o -tracex1-objs := bpf_load.o tracex1_user.o -tracex2-objs := bpf_load.o tracex2_user.o -tracex3-objs := bpf_load.o tracex3_user.o -tracex4-objs := bpf_load.o tracex4_user.o -tracex5-objs := bpf_load.o tracex5_user.o -tracex6-objs := bpf_load.o tracex6_user.o -tracex7-objs := bpf_load.o tracex7_user.o -test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o -trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) -lathist-objs := bpf_load.o lathist_user.o -offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS) -spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS) -map_perf_test-objs := bpf_load.o map_perf_test_user.o -test_overhead-objs := bpf_load.o test_overhead_user.o +sockex3-objs := sockex3_user.o +tracex1-objs := tracex1_user.o $(TRACE_HELPERS) +tracex2-objs := tracex2_user.o +tracex3-objs := tracex3_user.o +tracex4-objs := tracex4_user.o +tracex5-objs := tracex5_user.o $(TRACE_HELPERS) +tracex6-objs := tracex6_user.o +tracex7-objs := tracex7_user.o +test_probe_write_user-objs := test_probe_write_user_user.o +trace_output-objs := trace_output_user.o +lathist-objs := lathist_user.o +offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) +spintest-objs := spintest_user.o $(TRACE_HELPERS) +map_perf_test-objs := map_perf_test_user.o +test_overhead-objs := test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o test_cgrp2_sock-objs := test_cgrp2_sock.o -test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o +test_cgrp2_sock2-objs := test_cgrp2_sock2.o xdp1-objs := xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := xdp1_user.o -xdp_router_ipv4-objs := xdp_router_ipv4_user.o -test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \ +test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o -trace_event-objs := bpf_load.o trace_event_user.o $(TRACE_HELPERS) -sampleip-objs := bpf_load.o sampleip_user.o $(TRACE_HELPERS) -tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o -lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o +trace_event-objs := trace_event_user.o $(TRACE_HELPERS) +sampleip-objs := sampleip_user.o $(TRACE_HELPERS) +tc_l2_redirect-objs := tc_l2_redirect_user.o +lwt_len_hist-objs := lwt_len_hist_user.o xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o -test_map_in_map-objs := bpf_load.o test_map_in_map_user.o +test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o -xdp_redirect-objs := xdp_redirect_user.o -xdp_redirect_map-objs := xdp_redirect_map_user.o -xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o -xdp_monitor-objs := bpf_load.o xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o -syscall_tp-objs := bpf_load.o syscall_tp_user.o -cpustat-objs := bpf_load.o cpustat_user.o +syscall_tp-objs := syscall_tp_user.o +cpustat-objs := cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o -xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o -task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) -xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) -ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) -hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) +xdp_sample_pkts-objs := xdp_sample_pkts_user.o +ibumad-objs := ibumad_user.o +hbm-objs := hbm.o $(CGROUP_HELPERS) + +xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o $(XDP_SAMPLE) +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) +xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE) +xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) +xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) +xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE) # Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -138,7 +147,6 @@ always-y += parse_varlen.o parse_simple.o parse_ldabs.o always-y += test_cgrp2_tc_kern.o always-y += xdp1_kern.o always-y += xdp2_kern.o -always-y += xdp_router_ipv4_kern.o always-y += test_current_task_under_cgroup_kern.o always-y += trace_event_kern.o always-y += sampleip_kern.o @@ -154,10 +162,6 @@ always-y += tcp_clamp_kern.o always-y += tcp_basertt_kern.o always-y += tcp_tos_reflect_kern.o always-y += tcp_dumpstats_kern.o -always-y += xdp_redirect_kern.o -always-y += xdp_redirect_map_kern.o -always-y += xdp_redirect_cpu_kern.o -always-y += xdp_monitor_kern.o always-y += xdp_rxq_info_kern.o always-y += xdp2skb_meta_kern.o always-y += syscall_tp_kern.o @@ -169,7 +173,6 @@ always-y += xdp_sample_pkts_kern.o always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o -always-y += xdpsock_kern.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux @@ -179,13 +182,21 @@ BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) endif +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +ifdef CONFIG_MACH_LOONGSON64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic +endif +endif + TPROGS_CFLAGS += -Wall -O2 TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes TPROGS_CFLAGS += -I$(objtree)/usr/include TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ -TPROGS_CFLAGS += -I$(srctree)/tools/lib/ +TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) TPROGS_CFLAGS += -I$(srctree)/tools/include TPROGS_CFLAGS += -I$(srctree)/tools/perf TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0 @@ -195,20 +206,26 @@ TPROGS_CFLAGS += --sysroot=$(SYSROOT) TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib endif -TPROGCFLAGS_bpf_load.o += -Wno-unused-variable - TPROGS_LDLIBS += $(LIBBPF) -lelf -lz +TPROGLDLIBS_xdp_monitor += -lm +TPROGLDLIBS_xdp_redirect += -lm +TPROGLDLIBS_xdp_redirect_cpu += -lm +TPROGLDLIBS_xdp_redirect_map += -lm +TPROGLDLIBS_xdp_redirect_map_multi += -lm +TPROGLDLIBS_xdp_router_ipv4 += -lm -pthread TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt TPROGLDLIBS_test_overhead += -lrt -TPROGLDLIBS_xdpsock += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: -# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis LLVM_OBJCOPY ?= llvm-objcopy +LLVM_READELF ?= llvm-readelf BTF_PAHOLE ?= pahole # Detect that we're cross compiling and use the cross compiler @@ -218,7 +235,7 @@ endif # Don't evaluate probes and warnings if we need to run make recursively ifneq ($(src),) -HDR_PROBE := $(shell printf "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ +HDR_PROBE := $(shell printf "$(pound)include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \ -o /dev/null 2>/dev/null && echo okay) @@ -232,7 +249,7 @@ BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \ - readelf -S ./llvm_btf_verify.o | grep BTF; \ + $(LLVM_READELF) -S ./llvm_btf_verify.o | grep BTF; \ /bin/rm -f ./llvm_btf_verify.o) BPF_EXTRA_CFLAGS += -fno-stack-protector @@ -254,11 +271,25 @@ all: clean: $(MAKE) -C ../../ M=$(CURDIR) clean @find $(CURDIR) -type f -name '*~' -delete + @$(RM) -r $(CURDIR)/libbpf $(CURDIR)/bpftool -$(LIBBPF): FORCE +$(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) # Fix up variables inherited from Kbuild that tools/ build system won't like - $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ - LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O= + $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ + LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers + +BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool +BPFTOOL_OUTPUT := $(abspath $(BPF_SAMPLES_PATH))/bpftool +BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool +$(BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) | $(BPFTOOL_OUTPUT) + $(MAKE) -C $(BPFTOOLDIR) srctree=$(BPF_SAMPLES_PATH)/../../ \ + OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap + +$(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ $(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE $(call filechk,offsets,__SYSCALL_NRS_H__) @@ -290,28 +321,130 @@ verify_target_bpf: verify_cmds $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) +libbpf_hdrs: $(LIBBPF) +$(obj)/$(TRACE_HELPERS) $(obj)/$(CGROUP_HELPERS) $(obj)/$(XDP_SAMPLE): | libbpf_hdrs + +.PHONY: libbpf_hdrs + +$(obj)/xdp_redirect_cpu_user.o: $(obj)/xdp_redirect_cpu.skel.h +$(obj)/xdp_redirect_map_multi_user.o: $(obj)/xdp_redirect_map_multi.skel.h +$(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h +$(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h +$(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h +$(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h + $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +# Override includes for xdp_sample_user.o because $(srctree)/usr/include in +# TPROGS_CFLAGS causes conflicts +XDP_SAMPLE_CFLAGS += -Wall -O2 \ + -I$(src)/../../tools/include \ + -I$(src)/../../tools/include/uapi \ + -I$(LIBBPF_INCLUDE) \ + -I$(src)/../../tools/testing/selftests/bpf + +$(obj)/$(XDP_SAMPLE): TPROGS_CFLAGS = $(XDP_SAMPLE_CFLAGS) +$(obj)/$(XDP_SAMPLE): $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h + -include $(BPF_SAMPLES_PATH)/Makefile.target +VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ + $(abspath ./vmlinux) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +ifeq ($(VMLINUX_H),) +ifeq ($(VMLINUX_BTF),) + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ + build the kernel or set VMLINUX_BTF or VMLINUX_H variable) +endif + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +clean-files += vmlinux.h + +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/') +endef + +CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) + +$(obj)/xdp_redirect_cpu.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_redirect_map_multi.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_router_ipv4.bpf.o: $(obj)/xdp_sample.bpf.o + +$(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h + @echo " CLANG-BPF " $@ + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(SRCARCH) \ + -Wno-compare-distinct-pointer-types -I$(srctree)/include \ + -I$(srctree)/samples/bpf -I$(srctree)/tools/include \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $(filter %.bpf.c,$^) -o $@ + +LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map_multi.skel.h \ + xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h \ + xdp_router_ipv4.skel.h +clean-files += $(LINKED_SKELS) + +xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o +xdp_redirect_map_multi.skel.h-deps := xdp_redirect_map_multi.bpf.o xdp_sample.bpf.o +xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o +xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o +xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o +xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o + +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) + +BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c)) +BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED)) +BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS)) + +$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL) + @echo " BPF GEN-OBJ " $(@:.skel.h=) + $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps)) + @echo " BPF GEN-SKEL" $(@:.skel.h=) + $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@ + # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ - -I$(srctree)/tools/lib/ \ + -I$(LIBBPF_INCLUDE) \ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -fno-asynchronous-unwind-tables \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index dd34b2d26f1c..60c6494adb1b 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -62,20 +62,26 @@ To generate a smaller llc binary one can use:: -DLLVM_TARGETS_TO_BUILD="BPF" +We recommend that developers who want the fastest incremental builds +use the Ninja build system, you can find it in your system's package +manager, usually the package is ninja or ninja-build. + Quick sniplet for manually compiling LLVM and clang -(build dependencies are cmake and gcc-c++):: +(build dependencies are ninja, cmake and gcc-c++):: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" - $ make -j $(getconf _NPROCESSORS_ONLN) + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build + $ cd llvm-project/llvm/build + $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_BUILD_RUNTIME=OFF + $ ninja It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: - make M=samples/bpf LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang Cross compiling samples ----------------------- diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index 544237980582..29c3bb6ad1cd 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* eBPF instruction mini library */ #ifndef __BPF_INSN_H #define __BPF_INSN_H @@ -134,15 +134,31 @@ struct bpf_insn; .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ - -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ - ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c deleted file mode 100644 index 4574b1939e49..000000000000 --- a/samples/bpf/bpf_load.c +++ /dev/null @@ -1,687 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <stdio.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <libelf.h> -#include <gelf.h> -#include <errno.h> -#include <unistd.h> -#include <string.h> -#include <stdbool.h> -#include <stdlib.h> -#include <linux/bpf.h> -#include <linux/filter.h> -#include <linux/perf_event.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> -#include <linux/types.h> -#include <sys/socket.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <poll.h> -#include <ctype.h> -#include <assert.h> -#include <bpf/bpf.h> -#include "bpf_load.h" -#include "perf-sys.h" - -#define DEBUGFS "/sys/kernel/debug/tracing/" - -static char license[128]; -static int kern_version; -static bool processed_sec[128]; -char bpf_log_buf[BPF_LOG_BUF_SIZE]; -int map_fd[MAX_MAPS]; -int prog_fd[MAX_PROGS]; -int event_fd[MAX_PROGS]; -int prog_cnt; -int prog_array_fd = -1; - -struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count; - -static int populate_prog_array(const char *event, int prog_fd) -{ - int ind = atoi(event), err; - - err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); - if (err < 0) { - printf("failed to store prog_fd in prog_array\n"); - return -1; - } - return 0; -} - -static int write_kprobe_events(const char *val) -{ - int fd, ret, flags; - - if (val == NULL) - return -1; - else if (val[0] == '\0') - flags = O_WRONLY | O_TRUNC; - else - flags = O_WRONLY | O_APPEND; - - fd = open(DEBUGFS "kprobe_events", flags); - - ret = write(fd, val, strlen(val)); - close(fd); - - return ret; -} - -static int load_and_attach(const char *event, struct bpf_insn *prog, int size) -{ - bool is_socket = strncmp(event, "socket", 6) == 0; - bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; - bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; - bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; - bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; - bool is_xdp = strncmp(event, "xdp", 3) == 0; - bool is_perf_event = strncmp(event, "perf_event", 10) == 0; - bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; - bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; - bool is_sockops = strncmp(event, "sockops", 7) == 0; - bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; - bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; - size_t insns_cnt = size / sizeof(struct bpf_insn); - enum bpf_prog_type prog_type; - char buf[256]; - int fd, efd, err, id; - struct perf_event_attr attr = {}; - - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - if (is_socket) { - prog_type = BPF_PROG_TYPE_SOCKET_FILTER; - } else if (is_kprobe || is_kretprobe) { - prog_type = BPF_PROG_TYPE_KPROBE; - } else if (is_tracepoint) { - prog_type = BPF_PROG_TYPE_TRACEPOINT; - } else if (is_raw_tracepoint) { - prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; - } else if (is_xdp) { - prog_type = BPF_PROG_TYPE_XDP; - } else if (is_perf_event) { - prog_type = BPF_PROG_TYPE_PERF_EVENT; - } else if (is_cgroup_skb) { - prog_type = BPF_PROG_TYPE_CGROUP_SKB; - } else if (is_cgroup_sk) { - prog_type = BPF_PROG_TYPE_CGROUP_SOCK; - } else if (is_sockops) { - prog_type = BPF_PROG_TYPE_SOCK_OPS; - } else if (is_sk_skb) { - prog_type = BPF_PROG_TYPE_SK_SKB; - } else if (is_sk_msg) { - prog_type = BPF_PROG_TYPE_SK_MSG; - } else { - printf("Unknown event '%s'\n", event); - return -1; - } - - if (prog_cnt == MAX_PROGS) - return -1; - - fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (fd < 0) { - printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); - return -1; - } - - prog_fd[prog_cnt++] = fd; - - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) - return 0; - - if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { - if (is_socket) - event += 6; - else - event += 7; - if (*event != '/') - return 0; - event++; - if (!isdigit(*event)) { - printf("invalid prog number\n"); - return -1; - } - return populate_prog_array(event, fd); - } - - if (is_raw_tracepoint) { - efd = bpf_raw_tracepoint_open(event + 15, fd); - if (efd < 0) { - printf("tracepoint %s %s\n", event + 15, strerror(errno)); - return -1; - } - event_fd[prog_cnt - 1] = efd; - return 0; - } - - if (is_kprobe || is_kretprobe) { - bool need_normal_check = true; - const char *event_prefix = ""; - - if (is_kprobe) - event += 7; - else - event += 10; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - - if (isdigit(*event)) - return populate_prog_array(event, fd); - -#ifdef __x86_64__ - if (strncmp(event, "sys_", 4) == 0) { - snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", - is_kprobe ? 'p' : 'r', event, event); - err = write_kprobe_events(buf); - if (err >= 0) { - need_normal_check = false; - event_prefix = "__x64_"; - } - } -#endif - if (need_normal_check) { - snprintf(buf, sizeof(buf), "%c:%s %s", - is_kprobe ? 'p' : 'r', event, event); - err = write_kprobe_events(buf); - if (err < 0) { - printf("failed to create kprobe '%s' error '%s'\n", - event, strerror(errno)); - return -1; - } - } - - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event_prefix); - strcat(buf, event); - strcat(buf, "/id"); - } else if (is_tracepoint) { - event += 11; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/"); - strcat(buf, event); - strcat(buf, "/id"); - } - - efd = open(buf, O_RDONLY, 0); - if (efd < 0) { - printf("failed to open event %s\n", event); - return -1; - } - - err = read(efd, buf, sizeof(buf)); - if (err < 0 || err >= sizeof(buf)) { - printf("read from '%s' failed '%s'\n", event, strerror(errno)); - return -1; - } - - close(efd); - - buf[err] = 0; - id = atoi(buf); - attr.config = id; - - efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); - if (efd < 0) { - printf("event %d fd %d err %s\n", id, efd, strerror(errno)); - return -1; - } - event_fd[prog_cnt - 1] = efd; - err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); - if (err < 0) { - printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", - strerror(errno)); - return -1; - } - err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); - if (err < 0) { - printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", - strerror(errno)); - return -1; - } - - return 0; -} - -static int load_maps(struct bpf_map_data *maps, int nr_maps, - fixup_map_cb fixup_map) -{ - int i, numa_node; - - for (i = 0; i < nr_maps; i++) { - if (fixup_map) { - fixup_map(&maps[i], i); - /* Allow userspace to assign map FD prior to creation */ - if (maps[i].fd != -1) { - map_fd[i] = maps[i].fd; - continue; - } - } - - numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? - maps[i].def.numa_node : -1; - - if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - - map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, - maps[i].name, - maps[i].def.key_size, - inner_map_fd, - maps[i].def.max_entries, - maps[i].def.map_flags, - numa_node); - } else { - map_fd[i] = bpf_create_map_node(maps[i].def.type, - maps[i].name, - maps[i].def.key_size, - maps[i].def.value_size, - maps[i].def.max_entries, - maps[i].def.map_flags, - numa_node); - } - if (map_fd[i] < 0) { - printf("failed to create map %d (%s): %d %s\n", - i, maps[i].name, errno, strerror(errno)); - return 1; - } - maps[i].fd = map_fd[i]; - - if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) - prog_array_fd = map_fd[i]; - } - return 0; -} - -static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, - GElf_Shdr *shdr, Elf_Data **data) -{ - Elf_Scn *scn; - - scn = elf_getscn(elf, i); - if (!scn) - return 1; - - if (gelf_getshdr(scn, shdr) != shdr) - return 2; - - *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); - if (!*shname || !shdr->sh_size) - return 3; - - *data = elf_getdata(scn, 0); - if (!*data || elf_getdata(scn, *data) != NULL) - return 4; - - return 0; -} - -static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn, - struct bpf_map_data *maps, int nr_maps) -{ - int i, nrels; - - nrels = shdr->sh_size / shdr->sh_entsize; - - for (i = 0; i < nrels; i++) { - GElf_Sym sym; - GElf_Rel rel; - unsigned int insn_idx; - bool match = false; - int j, map_idx; - - gelf_getrel(data, i, &rel); - - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - - gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); - - if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - printf("invalid relo for insn[%d].code 0x%x\n", - insn_idx, insn[insn_idx].code); - return 1; - } - insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - - /* Match FD relocation against recorded map_data[] offset */ - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].elf_offset == sym.st_value) { - match = true; - break; - } - } - if (match) { - insn[insn_idx].imm = maps[map_idx].fd; - } else { - printf("invalid relo for insn[%d] no map_data match\n", - insn_idx); - return 1; - } - } - - return 0; -} - -static int cmp_symbols(const void *l, const void *r) -{ - const GElf_Sym *lsym = (const GElf_Sym *)l; - const GElf_Sym *rsym = (const GElf_Sym *)r; - - if (lsym->st_value < rsym->st_value) - return -1; - else if (lsym->st_value > rsym->st_value) - return 1; - else - return 0; -} - -static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, - Elf *elf, Elf_Data *symbols, int strtabidx) -{ - int map_sz_elf, map_sz_copy; - bool validate_zero = false; - Elf_Data *data_maps; - int i, nr_maps; - GElf_Sym *sym; - Elf_Scn *scn; - int copy_sz; - - if (maps_shndx < 0) - return -EINVAL; - if (!symbols) - return -EINVAL; - - /* Get data for maps section via elf index */ - scn = elf_getscn(elf, maps_shndx); - if (scn) - data_maps = elf_getdata(scn, NULL); - if (!scn || !data_maps) { - printf("Failed to get Elf_Data from maps section %d\n", - maps_shndx); - return -EINVAL; - } - - /* For each map get corrosponding symbol table entry */ - sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); - for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - assert(nr_maps < MAX_MAPS+1); - if (!gelf_getsym(symbols, i, &sym[nr_maps])) - continue; - if (sym[nr_maps].st_shndx != maps_shndx) - continue; - /* Only increment iif maps section */ - nr_maps++; - } - - /* Align to map_fd[] order, via sort on offset in sym.st_value */ - qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); - - /* Keeping compatible with ELF maps section changes - * ------------------------------------------------ - * The program size of struct bpf_load_map_def is known by loader - * code, but struct stored in ELF file can be different. - * - * Unfortunately sym[i].st_size is zero. To calculate the - * struct size stored in the ELF file, assume all struct have - * the same size, and simply divide with number of map - * symbols. - */ - map_sz_elf = data_maps->d_size / nr_maps; - map_sz_copy = sizeof(struct bpf_load_map_def); - if (map_sz_elf < map_sz_copy) { - /* - * Backward compat, loading older ELF file with - * smaller struct, keeping remaining bytes zero. - */ - map_sz_copy = map_sz_elf; - } else if (map_sz_elf > map_sz_copy) { - /* - * Forward compat, loading newer ELF file with larger - * struct with unknown features. Assume zero means - * feature not used. Thus, validate rest of struct - * data is zero. - */ - validate_zero = true; - } - - /* Memcpy relevant part of ELF maps data to loader maps */ - for (i = 0; i < nr_maps; i++) { - struct bpf_load_map_def *def; - unsigned char *addr, *end; - const char *map_name; - size_t offset; - - map_name = elf_strptr(elf, strtabidx, sym[i].st_name); - maps[i].name = strdup(map_name); - if (!maps[i].name) { - printf("strdup(%s): %s(%d)\n", map_name, - strerror(errno), errno); - free(sym); - return -errno; - } - - /* Symbol value is offset into ELF maps section data area */ - offset = sym[i].st_value; - def = (struct bpf_load_map_def *)(data_maps->d_buf + offset); - maps[i].elf_offset = offset; - memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def)); - memcpy(&maps[i].def, def, map_sz_copy); - - /* Verify no newer features were requested */ - if (validate_zero) { - addr = (unsigned char *) def + map_sz_copy; - end = (unsigned char *) def + map_sz_elf; - for (; addr < end; addr++) { - if (*addr != 0) { - free(sym); - return -EFBIG; - } - } - } - } - - free(sym); - return nr_maps; -} - -static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) -{ - int fd, i, ret, maps_shndx = -1, strtabidx = -1; - Elf *elf; - GElf_Ehdr ehdr; - GElf_Shdr shdr, shdr_prog; - Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; - char *shname, *shname_prog; - int nr_maps = 0; - - /* reset global variables */ - kern_version = 0; - memset(license, 0, sizeof(license)); - memset(processed_sec, 0, sizeof(processed_sec)); - - if (elf_version(EV_CURRENT) == EV_NONE) - return 1; - - fd = open(path, O_RDONLY, 0); - if (fd < 0) - return 1; - - elf = elf_begin(fd, ELF_C_READ, NULL); - - if (!elf) - return 1; - - if (gelf_getehdr(elf, &ehdr) != &ehdr) - return 1; - - /* clear all kprobes */ - i = write_kprobe_events(""); - - /* scan over all elf sections to get license and map info */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (0) /* helpful for llvm debugging */ - printf("section %d:%s data %p size %zd link %d flags %d\n", - i, shname, data->d_buf, data->d_size, - shdr.sh_link, (int) shdr.sh_flags); - - if (strcmp(shname, "license") == 0) { - processed_sec[i] = true; - memcpy(license, data->d_buf, data->d_size); - } else if (strcmp(shname, "version") == 0) { - processed_sec[i] = true; - if (data->d_size != sizeof(int)) { - printf("invalid size of version section %zd\n", - data->d_size); - return 1; - } - memcpy(&kern_version, data->d_buf, sizeof(int)); - } else if (strcmp(shname, "maps") == 0) { - int j; - - maps_shndx = i; - data_maps = data; - for (j = 0; j < MAX_MAPS; j++) - map_data[j].fd = -1; - } else if (shdr.sh_type == SHT_SYMTAB) { - strtabidx = shdr.sh_link; - symbols = data; - } - } - - ret = 1; - - if (!symbols) { - printf("missing SHT_SYMTAB section\n"); - goto done; - } - - if (data_maps) { - nr_maps = load_elf_maps_section(map_data, maps_shndx, - elf, symbols, strtabidx); - if (nr_maps < 0) { - printf("Error: Failed loading ELF maps (errno:%d):%s\n", - nr_maps, strerror(-nr_maps)); - goto done; - } - if (load_maps(map_data, nr_maps, fixup_map)) - goto done; - map_data_count = nr_maps; - - processed_sec[maps_shndx] = true; - } - - /* process all relo sections, and rewrite bpf insns for maps */ - for (i = 1; i < ehdr.e_shnum; i++) { - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (shdr.sh_type == SHT_REL) { - struct bpf_insn *insns; - - /* locate prog sec that need map fixup (relocations) */ - if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, - &shdr_prog, &data_prog)) - continue; - - if (shdr_prog.sh_type != SHT_PROGBITS || - !(shdr_prog.sh_flags & SHF_EXECINSTR)) - continue; - - insns = (struct bpf_insn *) data_prog->d_buf; - processed_sec[i] = true; /* relo section */ - - if (parse_relo_and_apply(data, symbols, &shdr, insns, - map_data, nr_maps)) - continue; - } - } - - /* load programs */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (memcmp(shname, "kprobe/", 7) == 0 || - memcmp(shname, "kretprobe/", 10) == 0 || - memcmp(shname, "tracepoint/", 11) == 0 || - memcmp(shname, "raw_tracepoint/", 15) == 0 || - memcmp(shname, "xdp", 3) == 0 || - memcmp(shname, "perf_event", 10) == 0 || - memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0 || - memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0 || - memcmp(shname, "sk_msg", 6) == 0) { - ret = load_and_attach(shname, data->d_buf, - data->d_size); - if (ret != 0) - goto done; - } - } - -done: - close(fd); - return ret; -} - -int load_bpf_file(char *path) -{ - return do_load_bpf_file(path, NULL); -} - -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) -{ - return do_load_bpf_file(path, fixup_map); -} - -void read_trace_pipe(void) -{ - int trace_fd; - - trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h deleted file mode 100644 index 814894a12974..000000000000 --- a/samples/bpf/bpf_load.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_LOAD_H -#define __BPF_LOAD_H - -#include <bpf/bpf.h> - -#define MAX_MAPS 32 -#define MAX_PROGS 32 - -struct bpf_load_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; - unsigned int numa_node; -}; - -struct bpf_map_data { - int fd; - char *name; - size_t elf_offset; - struct bpf_load_map_def def; -}; - -typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); - -extern int prog_fd[MAX_PROGS]; -extern int event_fd[MAX_PROGS]; -extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; -extern int prog_cnt; - -/* There is a one-to-one mapping between map_fd[] and map_data[]. - * The map_data[] just contains more rich info on the given map. - */ -extern int map_fd[MAX_MAPS]; -extern struct bpf_map_data map_data[MAX_MAPS]; -extern int map_data_count; - -/* parses elf file compiled by llvm .c->.o - * . parses 'maps' section and creates maps via BPF syscall - * . parses 'license' section and passes it to syscall - * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by - * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD - * . loads eBPF programs via BPF syscall - * - * One ELF file can contain multiple BPF programs which will be loaded - * and their FDs stored stored in prog_fd array - * - * returns zero on success - */ -int load_bpf_file(char *path); -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); - -void read_trace_pipe(void); -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); -#endif diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index deb0e3e0324d..f0df3dda4b1f 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -67,8 +67,8 @@ static bool test_finish; static void maps_create(void) { - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), - sizeof(struct stats), 100, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), + sizeof(struct stats), 100, NULL); if (map_fd < 0) error(1, errno, "map create failed!\n"); } @@ -147,19 +147,23 @@ static void prog_load(void) */ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, packets)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, packets)), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, bytes)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, bytes)), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), }; - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, - ARRAY_SIZE(prog), "GPL", 0, - log_buf, sizeof(log_buf)); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = sizeof(log_buf), + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, ARRAY_SIZE(prog), &opts); if (prog_fd < 0) error(1, errno, "failed to load prog\n%s\n", log_buf); } @@ -167,7 +171,7 @@ static void prog_load(void) static void prog_attach_iptables(char *file) { int ret; - char rules[100]; + char rules[256]; if (bpf_obj_pin(prog_fd, file)) error(1, errno, "bpf_obj_pin"); @@ -175,8 +179,13 @@ static void prog_attach_iptables(char *file) printf("file path too long: %s\n", file); exit(1); } - sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", - file); + ret = snprintf(rules, sizeof(rules), + "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", + file); + if (ret < 0 || ret >= sizeof(rules)) { + printf("error constructing iptables command\n"); + exit(1); + } ret = system(rules); if (ret < 0) { printf("iptables rule update failed: %d/n", WEXITSTATUS(ret)); @@ -313,7 +322,7 @@ int main(int argc, char *argv[]) print_table(); printf("\n"); sleep(1); - }; + } } else if (cfg_test_cookie) { udp_client(); } diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c index a86a19d5f033..5aefd19cdfa1 100644 --- a/samples/bpf/cpustat_kern.c +++ b/samples/bpf/cpustat_kern.c @@ -51,28 +51,28 @@ static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; #define MAP_OFF_PSTATE_IDX 3 #define MAP_OFF_NUM 4 -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAP_OFF_NUM, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAP_OFF_NUM); +} my_map SEC(".maps"); /* cstate_duration records duration time for every idle state per CPU */ -struct bpf_map_def SEC("maps") cstate_duration = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); +} cstate_duration SEC(".maps"); /* pstate_duration records duration time for every operating point per CPU */ -struct bpf_map_def SEC("maps") pstate_duration = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); +} pstate_duration SEC(".maps"); /* * The trace events for cpu_idle and cpu_frequency are taken from: diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c index 869a99406dbf..ab90bb08a2b4 100644 --- a/samples/bpf/cpustat_user.c +++ b/samples/bpf/cpustat_user.c @@ -9,16 +9,16 @@ #include <string.h> #include <unistd.h> #include <fcntl.h> -#include <linux/bpf.h> #include <locale.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/time.h> -#include <sys/resource.h> #include <sys/wait.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> + +static int cstate_map_fd, pstate_map_fd; #define MAX_CPU 8 #define MAX_PSTATE_ENTRIES 5 @@ -181,21 +181,50 @@ static void int_exit(int sig) { cpu_stat_inject_cpu_idle_event(); cpu_stat_inject_cpu_frequency_event(); - cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_update(cstate_map_fd, pstate_map_fd); cpu_stat_print(); exit(0); } int main(int argc, char **argv) { + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; int ret; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration"); + pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration"); + if (cstate_map_fd < 0 || pstate_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } ret = cpu_stat_inject_cpu_idle_event(); @@ -210,10 +239,13 @@ int main(int argc, char **argv) signal(SIGTERM, int_exit); while (1) { - cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_update(cstate_map_fd, pstate_map_fd); cpu_stat_print(); sleep(5); } +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index ffe4c0607341..38e4599350db 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -10,7 +10,7 @@ Usage() { echo "Script for testing HBM (Host Bandwidth Manager) framework." echo "It creates a cgroup to use for testing and load a BPF program to limit" - echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" + echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create" echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" @@ -91,6 +91,16 @@ qdisc="" flags="" do_stats=0 +BPFFS=/sys/fs/bpf +function config_bpffs () { + if mount | grep $BPFFS > /dev/null; then + echo "bpffs already mounted" + else + echo "bpffs not mounted. Mounting..." + mount -t bpf none $BPFFS + fi +} + function start_hbm () { rm -f hbm.out echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out @@ -192,6 +202,7 @@ processArgs () { } processArgs +config_bpffs if [ $debug_flag -eq 1 ] ; then rm -f hbm_out.log @@ -201,7 +212,7 @@ hbm_pid=$(start_hbm) usleep 100000 host=`hostname` -cg_base_dir=/sys/fs/cgroup +cg_base_dir=/sys/fs/cgroup/unified cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" echo $$ >> $cg_dir/cgroup.procs @@ -411,23 +422,8 @@ fi sleep 1 -# Detach any BPF programs that may have lingered -ttx=`bpftool cgroup tree | grep hbm` -v=2 -for x in $ttx ; do - if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then - cg=$x ; v=0 - else - if [ $v -eq 0 ] ; then - id=$x ; v=1 - else - if [ $v -eq 1 ] ; then - type=$x ; bpftool cgroup detach $cg $type id $id - v=0 - fi - fi - fi -done +# Detach any pinned BPF programs that may have lingered +rm -rf $BPFFS/hbm* if [ $use_netperf -ne 0 ] ; then if [ "$server" == "" ] ; then diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index d5992f787232..88a26f3ce201 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -17,6 +17,7 @@ #include <bpf/libbpf.h> #include "bpf_insn.h" #include "sock_example.h" +#include "bpf_util.h" #define BPF_F_PIN (1 << 0) #define BPF_F_GET (1 << 1) @@ -30,6 +31,8 @@ #define BPF_M_MAP 1 #define BPF_M_PROG 2 +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + static void usage(void) { printf("Usage: fds_example [...]\n"); @@ -44,31 +47,30 @@ static void usage(void) printf(" -h Display this help.\n"); } -static int bpf_map_create(void) -{ - return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), - sizeof(uint32_t), 1024, 0); -} - static int bpf_prog_create(const char *object) { static struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); - char bpf_log_buf[BPF_LOG_BUF_SIZE]; + size_t insns_cnt = ARRAY_SIZE(insns); struct bpf_object *obj; - int prog_fd; + int err; if (object) { - assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, - &obj, &prog_fd)); - return prog_fd; + obj = bpf_object__open_file(object, NULL); + assert(!libbpf_get_error(obj)); + err = bpf_object__load(obj); + assert(!err); + return bpf_program__fd(bpf_object__next_program(obj, NULL)); } else { - return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, - insns, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); + + return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, insns_cnt, &opts); } } @@ -78,7 +80,8 @@ static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, int fd, ret; if (flags & BPF_F_PIN) { - fd = bpf_map_create(); + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(uint32_t), + sizeof(uint32_t), 1024, NULL); printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); assert(fd > 0); diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 7d7153777678..516fbac28b71 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -34,23 +34,20 @@ #include <stdio.h> #include <stdlib.h> #include <assert.h> -#include <sys/resource.h> #include <sys/time.h> #include <unistd.h> #include <errno.h> #include <fcntl.h> #include <linux/unistd.h> +#include <linux/compiler.h> #include <linux/bpf.h> #include <bpf/bpf.h> #include <getopt.h> -#include "bpf_load.h" -#include "bpf_rlimit.h" #include "cgroup_helpers.h" #include "hbm.h" #include "bpf_util.h" -#include <bpf/bpf.h> #include <bpf/libbpf.h> bool outFlag = true; @@ -70,9 +67,9 @@ static void do_error(char *msg, bool errno_flag); #define DEBUGFS "/sys/kernel/debug/tracing/" -struct bpf_object *obj; -int bpfprog_fd; -int cgroup_storage_fd; +static struct bpf_program *bpf_prog; +static struct bpf_object *obj; +static int queue_stats_fd; static void read_trace_pipe2(void) { @@ -121,56 +118,59 @@ static void do_error(char *msg, bool errno_flag) static int prog_load(char *prog) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_CGROUP_SKB, - .file = prog, - .expected_attach_type = BPF_CGROUP_INET_EGRESS, - }; - int map_fd; - struct bpf_map *map; + struct bpf_program *pos; + const char *sec_name; - int ret = 0; - - if (access(prog, O_RDONLY) < 0) { - printf("Error accessing file %s: %s\n", prog, strerror(errno)); + obj = bpf_object__open_file(prog, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); return 1; } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) - ret = 1; - if (!ret) { - map = bpf_object__find_map_by_name(obj, "queue_stats"); - map_fd = bpf_map__fd(map); - if (map_fd < 0) { - printf("Map not found: %s\n", strerror(map_fd)); - ret = 1; + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto err; + } + + bpf_object__for_each_program(pos, obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) { + bpf_prog = pos; + break; } } + if (!bpf_prog) { + printf("ERROR: finding a prog in obj file failed\n"); + goto err; + } - if (ret) { - printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); - printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); - ret = -1; - } else { - ret = map_fd; + queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); + if (queue_stats_fd < 0) { + printf("ERROR: finding a map in obj file failed\n"); + goto err; } - return ret; + return 0; + +err: + bpf_object__close(obj); + return 1; } static int run_bpf_prog(char *prog, int cg_id) { - int map_fd; - int rc = 0; + struct hbm_queue_stats qstats = {0}; + char cg_dir[100], cg_pin_path[100]; + struct bpf_link *link = NULL; int key = 0; int cg1 = 0; - int type = BPF_CGROUP_INET_EGRESS; - char cg_dir[100]; - struct hbm_queue_stats qstats = {0}; + int rc = 0; sprintf(cg_dir, "/hbm%d", cg_id); - map_fd = prog_load(prog); - if (map_fd == -1) - return 1; + rc = prog_load(prog); + if (rc != 0) + return rc; if (setup_cgroup_environment()) { printf("ERROR: setting cgroup environment\n"); @@ -190,16 +190,24 @@ static int run_bpf_prog(char *prog, int cg_id) qstats.stats = stats_flag ? 1 : 0; qstats.loopback = loopback_flag ? 1 : 0; qstats.no_cn = no_cn_flag ? 1 : 0; - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { printf("ERROR: Could not update map element\n"); goto err; } if (!outFlag) - type = BPF_CGROUP_INET_INGRESS; - if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { - printf("ERROR: bpf_prog_attach fails!\n"); - log_err("Attaching prog"); + bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); + + link = bpf_program__attach_cgroup(bpf_prog, cg1); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); + goto err; + } + + sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); + rc = bpf_link__pin(link, cg_pin_path); + if (rc < 0) { + printf("ERROR: bpf_link__pin failed: %d\n", rc); goto err; } @@ -213,7 +221,7 @@ static int run_bpf_prog(char *prog, int cg_id) #define DELTA_RATE_CHECK 10000 /* in us */ #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ - bpf_map_lookup_elem(map_fd, &key, &qstats); + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); if (gettimeofday(&t0, NULL) < 0) do_error("gettimeofday failed", true); t_last = t0; @@ -242,7 +250,7 @@ static int run_bpf_prog(char *prog, int cg_id) fclose(fin); printf(" new_eth_tx_bytes:%llu\n", new_eth_tx_bytes); - bpf_map_lookup_elem(map_fd, &key, &qstats); + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); new_cg_tx_bytes = qstats.bytes_total; delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; last_eth_tx_bytes = new_eth_tx_bytes; @@ -289,14 +297,14 @@ static int run_bpf_prog(char *prog, int cg_id) rate = minRate; qstats.rate = rate; } - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) do_error("update map element fails", false); } } else { sleep(dur); } // Get stats! - if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { + if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { char fname[100]; FILE *fout; @@ -394,14 +402,20 @@ static int run_bpf_prog(char *prog, int cg_id) if (debugFlag) read_trace_pipe2(); - return rc; + goto cleanup; + err: rc = 1; - if (cg1) +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + + if (cg1 != -1) close(cg1); - cleanup_cgroup_environment(); + if (rc != 0) + cleanup_cgroup_environment(); return rc; } @@ -483,7 +497,7 @@ int main(int argc, char **argv) "Option -%c requires an argument.\n\n", optopt); case 'h': - // fallthrough + __fallthrough; default: Usage(); return 0; @@ -494,5 +508,8 @@ int main(int argc, char **argv) prog = argv[optind]; printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); + /* Use libbpf 1.0 API mode */ + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + return run_bpf_prog(prog, cg_id); } diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index e00f26f6afba..1752a46a2b05 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -9,8 +9,6 @@ * Include file for sample Host Bandwidth Manager (HBM) BPF programs */ #define KBUILD_MODNAME "foo" -#include <stddef.h> -#include <stdbool.h> #include <uapi/linux/bpf.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/if_packet.h> @@ -69,7 +67,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); __type(key, u32); - __type(value, struct hvm_queue_stats); + __type(value, struct hbm_queue_stats); } queue_stats SEC(".maps"); struct hbm_pkt_info { diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c index 3a91b4c1989a..9b193231024a 100644 --- a/samples/bpf/ibumad_kern.c +++ b/samples/bpf/ibumad_kern.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB -/** +/* * ibumad BPF sample kernel side * * This program is free software; you can redistribute it and/or @@ -16,19 +16,19 @@ #include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") read_count = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), /* class; u32 required */ - .value_size = sizeof(u64), /* count of mads read */ - .max_entries = 256, /* Room for all Classes */ -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads read */ + __uint(max_entries, 256); /* Room for all Classes */ +} read_count SEC(".maps"); -struct bpf_map_def SEC("maps") write_count = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), /* class; u32 required */ - .value_size = sizeof(u64), /* count of mads written */ - .max_entries = 256, /* Room for all Classes */ -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads written */ + __uint(max_entries, 256); /* Room for all Classes */ +} write_count SEC(".maps"); #undef DEBUG #ifndef DEBUG diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c index fa06eef31a84..d074c978aac7 100644 --- a/samples/bpf/ibumad_user.c +++ b/samples/bpf/ibumad_user.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB -/** +/* * ibumad BPF sample user side * * This program is free software; you can redistribute it and/or @@ -19,14 +19,18 @@ #include <sys/types.h> #include <limits.h> -#include <sys/resource.h> #include <getopt.h> #include <net/if.h> -#include "bpf_load.h" +#include <bpf/bpf.h> #include "bpf_util.h" #include <bpf/libbpf.h> +static struct bpf_link *tp_links[3]; +static struct bpf_object *obj; +static int map_fd[2]; +static int tp_cnt; + static void dump_counts(int fd) { __u32 key; @@ -53,6 +57,11 @@ static void dump_all_counts(void) static void dump_exit(int sig) { dump_all_counts(); + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); exit(0); } @@ -73,19 +82,11 @@ static void usage(char *cmd) int main(int argc, char **argv) { + struct bpf_program *prog; unsigned long delay = 5; + char filename[256]; int longindex = 0; - int opt; - char bpf_file[256]; - - /* Create the eBPF kernel code path name. - * This follows the pattern of all of the other bpf samples - */ - snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]); - - /* Do one final dump when exiting */ - signal(SIGINT, dump_exit); - signal(SIGTERM, dump_exit); + int opt, err = -1; while ((opt = getopt_long(argc, argv, "hd:rSw", long_options, &longindex)) != -1) { @@ -107,16 +108,51 @@ int main(int argc, char **argv) } } - if (load_bpf_file(bpf_file)) { - fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n", - bpf_file); - return 1; + /* Do one final dump when exiting */ + signal(SIGINT, dump_exit); + signal(SIGTERM, dump_exit); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; } while (1) { sleep(delay); dump_all_counts(); } + err = 0; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); - return 0; + bpf_object__close(obj); + return err; } diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c index ca9c2e4e69aa..4adfcbbe6ef4 100644 --- a/samples/bpf/lathist_kern.c +++ b/samples/bpf/lathist_kern.c @@ -18,12 +18,12 @@ * trace_preempt_[on|off] tracepoints hooks is not supported. */ -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = MAX_CPU, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, u64); + __uint(max_entries, MAX_CPU); +} my_map SEC(".maps"); SEC("kprobe/trace_preempt_off") int bpf_prog1(struct pt_regs *ctx) @@ -61,12 +61,12 @@ static unsigned int log2l(unsigned long v) return log2(v); } -struct bpf_map_def SEC("maps") my_lat = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(long), - .max_entries = MAX_CPU * MAX_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, long); + __uint(max_entries, MAX_CPU * MAX_ENTRIES); +} my_lat SEC(".maps"); SEC("kprobe/trace_preempt_on") int bpf_prog2(struct pt_regs *ctx) diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c index 2ff2839a52d5..7d8ff2418303 100644 --- a/samples/bpf/lathist_user.c +++ b/samples/bpf/lathist_user.c @@ -6,9 +6,8 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> +#include <bpf/libbpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" #define MAX_ENTRIES 20 #define MAX_CPU 4 @@ -81,20 +80,51 @@ static void get_data(int fd) int main(int argc, char **argv) { + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int map_fd, i = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } while (1) { - get_data(map_fd[1]); + get_data(map_fd); print_hist(); sleep(5); } +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh index 090b96eaf7f7..0eda9754f50b 100644..100755 --- a/samples/bpf/lwt_len_hist.sh +++ b/samples/bpf/lwt_len_hist.sh @@ -8,6 +8,8 @@ VETH1=tst_lwt1b TRACE_ROOT=/sys/kernel/debug/tracing function cleanup { + # To reset saved histogram, remove pinned map + rm /sys/fs/bpf/tc/globals/lwt_len_hist_map ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null ip link del $VETH0 2> /dev/null ip link del $VETH1 2> /dev/null diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c index 9ed63e10e170..1fa14c54963a 100644 --- a/samples/bpf/lwt_len_hist_kern.c +++ b/samples/bpf/lwt_len_hist_kern.c @@ -16,13 +16,6 @@ #include <uapi/linux/in.h> #include <bpf/bpf_helpers.h> -# define printk(fmt, ...) \ - ({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ - }) - struct bpf_elf_map { __u32 type; __u32 size_key; diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c index 587b68b1f8dd..430a4b7e353e 100644 --- a/samples/bpf/lwt_len_hist_user.c +++ b/samples/bpf/lwt_len_hist_user.c @@ -15,8 +15,6 @@ #define MAX_INDEX 64 #define MAX_STARS 38 -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - static void stars(char *str, long val, long max, int width) { int i; diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 12e91ae64d4d..7342c5b2f278 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -9,154 +9,172 @@ #include <linux/version.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_legacy.h" #include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" #define MAX_ENTRIES 1000 #define MAX_NR_CPUS 1024 -struct bpf_map_def_legacy SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); +} lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_COMMON_LRU); +} nocommon_lru_hash_map SEC(".maps"); + +struct inner_lru { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NUMA_NODE); + __uint(numa_node, 0); +} inner_lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_CPUS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_lru); /* use inner_lru as inner map */ +} array_of_lru_hashs SEC(".maps") = { + /* statically initialize the first element */ + .values = { &inner_lru_hash_map }, }; -struct bpf_map_def_legacy SEC("maps") lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 10000, -}; - -struct bpf_map_def_legacy SEC("maps") nocommon_lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 10000, - .map_flags = BPF_F_NO_COMMON_LRU, -}; - -struct bpf_map_def_legacy SEC("maps") inner_lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, - .map_flags = BPF_F_NUMA_NODE, - .numa_node = 0, -}; - -struct bpf_map_def_legacy SEC("maps") array_of_lru_hashs = { - .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, - .key_size = sizeof(u32), - .max_entries = MAX_NR_CPUS, -}; - -struct bpf_map_def_legacy SEC("maps") percpu_hash_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -struct bpf_map_def_legacy SEC("maps") hash_map_alloc = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def_legacy SEC("maps") percpu_hash_map_alloc = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def_legacy SEC("maps") lpm_trie_map_alloc = { - .type = BPF_MAP_TYPE_LPM_TRIE, - .key_size = 8, - .value_size = sizeof(long), - .max_entries = 10000, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def_legacy SEC("maps") array_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -struct bpf_map_def_legacy SEC("maps") lru_hash_lookup_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -SEC("kprobe/sys_getuid") +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); +} percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} percpu_hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(long)); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_trie_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} array_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} lru_hash_lookup_map SEC(".maps"); + +SEC("kprobe/" SYSCALL(sys_getuid)) int stress_hmap(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map, &key); - if (value) - bpf_map_delete_elem(&hash_map, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) + bpf_map_delete_elem(&hash_map, &key); + } return 0; } -SEC("kprobe/sys_geteuid") +SEC("kprobe/" SYSCALL(sys_geteuid)) int stress_percpu_hmap(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map, &key); + } return 0; } -SEC("kprobe/sys_getgid") +SEC("kprobe/" SYSCALL(sys_getgid)) int stress_hmap_alloc(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&hash_map_alloc, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&hash_map_alloc, &key); + } return 0; } -SEC("kprobe/sys_getegid") +SEC("kprobe/" SYSCALL(sys_getegid)) int stress_percpu_hmap_alloc(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + } return 0; } -SEC("kprobe/sys_connect") +SEC("kprobe/" SYSCALL(sys_connect)) int stress_lru_hmap_alloc(struct pt_regs *ctx) { + struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; union { u16 dst6[8]; @@ -175,8 +193,8 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) long val = 1; u32 key = 0; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); - addrlen = (int)PT_REGS_PARM3(ctx); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); + addrlen = (int)PT_REGS_PARM3_CORE(real_regs); if (addrlen != sizeof(*in6)) return 0; @@ -233,7 +251,7 @@ done: return 0; } -SEC("kprobe/sys_gettid") +SEC("kprobe/" SYSCALL(sys_gettid)) int stress_lpm_trie_map_alloc(struct pt_regs *ctx) { union { @@ -255,7 +273,7 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpgid") +SEC("kprobe/" SYSCALL(sys_getpgid)) int stress_hash_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; @@ -268,7 +286,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getppid") +SEC("kprobe/" SYSCALL(sys_getppid)) int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index fe5564bff39b..1bb53f4b29e1 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -11,15 +11,13 @@ #include <sys/wait.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> #include <string.h> #include <time.h> -#include <sys/resource.h> #include <arpa/inet.h> #include <errno.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #define TEST_BIT(t) (1U << (t)) #define MAX_NR_CPUS 1024 @@ -61,14 +59,20 @@ const char *test_map_names[NR_TESTS] = { [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", }; +enum map_idx { + array_of_lru_hashs_idx, + hash_map_alloc_idx, + lru_hash_lookup_idx, + NR_IDXES, +}; + +static int map_fd[NR_IDXES]; + static int test_flags = ~0; static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; -static int inner_lru_hash_idx = -1; -static int array_of_lru_hashs_idx = -1; -static int lru_hash_lookup_idx = -1; static int lru_hash_lookup_test_entries = 32; -static uint32_t max_cnt = 1000000; +static uint32_t max_cnt = 10000; static int check_test_flags(enum test_type t) { @@ -122,30 +126,33 @@ static void do_test_lru(enum test_type test, int cpu) __u64 start_time; int i, ret; - if (test == INNER_LRU_HASH_PREALLOC) { + if (test == INNER_LRU_HASH_PREALLOC && cpu) { + /* If CPU is not 0, create inner_lru hash map and insert the fd + * value into the array_of_lru_hash map. In case of CPU 0, + * 'inner_lru_hash_map' was statically inserted on the map init + */ int outer_fd = map_fd[array_of_lru_hashs_idx]; unsigned int mycpu, mynode; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NUMA_NODE, + ); assert(cpu < MAX_NR_CPUS); - if (cpu) { - ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); - assert(!ret); - - inner_lru_map_fds[cpu] = - bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, - test_map_names[INNER_LRU_HASH_PREALLOC], - sizeof(uint32_t), - sizeof(long), - inner_lru_hash_size, 0, - mynode); - if (inner_lru_map_fds[cpu] == -1) { - printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", - strerror(errno), errno); - exit(1); - } - } else { - inner_lru_map_fds[cpu] = map_fd[inner_lru_hash_idx]; + ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); + assert(!ret); + + opts.numa_node = mynode; + inner_lru_map_fds[cpu] = + bpf_map_create(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, &opts); + if (inner_lru_map_fds[cpu] == -1) { + printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", + strerror(errno), errno); + exit(1); } ret = bpf_map_update_elem(outer_fd, &cpu, @@ -377,7 +384,8 @@ static void fill_lpm_trie(void) key->data[1] = rand() & 0xff; key->data[2] = rand() & 0xff; key->data[3] = rand() & 0xff; - r = bpf_map_update_elem(map_fd[6], key, &value, 0); + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], + key, &value, 0); assert(!r); } @@ -388,59 +396,46 @@ static void fill_lpm_trie(void) key->data[3] = 1; value = 128; - r = bpf_map_update_elem(map_fd[6], key, &value, 0); + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], key, &value, 0); assert(!r); } -static void fixup_map(struct bpf_map_data *map, int idx) +static void fixup_map(struct bpf_object *obj) { + struct bpf_map *map; int i; - if (!strcmp("inner_lru_hash_map", map->name)) { - inner_lru_hash_idx = idx; - inner_lru_hash_size = map->def.max_entries; - } + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); - if (!strcmp("array_of_lru_hashs", map->name)) { - if (inner_lru_hash_idx == -1) { - printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n"); - exit(1); + /* Only change the max_entries for the enabled test(s) */ + for (i = 0; i < NR_TESTS; i++) { + if (!strcmp(test_map_names[i], name) && + (check_test_flags(i))) { + bpf_map__set_max_entries(map, num_map_entries); + continue; + } } - map->def.inner_map_idx = inner_lru_hash_idx; - array_of_lru_hashs_idx = idx; } - if (!strcmp("lru_hash_lookup_map", map->name)) - lru_hash_lookup_idx = idx; - - if (num_map_entries <= 0) - return; - inner_lru_hash_size = num_map_entries; - - /* Only change the max_entries for the enabled test(s) */ - for (i = 0; i < NR_TESTS; i++) { - if (!strcmp(test_map_names[i], map->name) && - (check_test_flags(i))) { - map->def.max_entries = num_map_entries; - } - } } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link *links[8]; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_map *map; char filename[256]; - int num_cpu = 8; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + int i = 0; if (argc > 1) test_flags = atoi(argv[1]) ? : test_flags; if (argc > 2) - num_cpu = atoi(argv[2]) ? : num_cpu; + nr_cpus = atoi(argv[2]) ? : nr_cpus; if (argc > 3) num_map_entries = atoi(argv[3]); @@ -448,14 +443,61 @@ int main(int argc, char **argv) if (argc > 4) max_cnt = atoi(argv[4]); - if (load_bpf_file_fixup_map(filename, fixup_map)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map"); + if (libbpf_get_error(map)) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + inner_lru_hash_size = bpf_map__max_entries(map); + if (!inner_lru_hash_size) { + fprintf(stderr, "ERROR: failed to get map attribute\n"); + goto cleanup; + } + + /* resize BPF map prior to loading */ + if (num_map_entries > 0) + fixup_map(obj); + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } fill_lpm_trie(); - run_perf_test(num_cpu); + run_perf_test(nr_cpus); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index c4ec10dbfc3b..eb4d94742e6b 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -5,16 +5,22 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <uapi/linux/ptrace.h> #include <uapi/linux/perf_event.h> #include <linux/version.h> #include <linux/sched.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> -#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define _(P) \ + ({ \ + typeof(P) val; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) #define MINBLOCK_US 1 +#define MAX_ENTRIES 10000 struct key_t { char waker[TASK_COMM_LEN]; @@ -23,38 +29,38 @@ struct key_t { u32 tret; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") start = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} start SEC(".maps"); struct wokeby_t { char name[TASK_COMM_LEN]; u32 ret; }; -struct bpf_map_def SEC("maps") wokeby = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(struct wokeby_t), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct wokeby_t); + __uint(max_entries, MAX_ENTRIES); +} wokeby SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, MAX_ENTRIES); +} stackmap SEC(".maps"); #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) @@ -107,11 +113,11 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta) /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ struct sched_switch_args { unsigned long long pad; - char prev_comm[16]; + char prev_comm[TASK_COMM_LEN]; int prev_pid; int prev_prio; long long prev_state; - char next_comm[16]; + char next_comm[TASK_COMM_LEN]; int next_pid; int next_prio; }; diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 51c7da5341cc..b6eedcb98fb9 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -5,19 +5,18 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> -#include <string.h> #include <linux/perf_event.h> #include <errno.h> -#include <assert.h> #include <stdbool.h> -#include <sys/resource.h> #include <bpf/libbpf.h> -#include "bpf_load.h" +#include <bpf/bpf.h> #include "trace_helpers.h" #define PRINT_RAW_ADDR 0 +/* counts, stackmap */ +static int map_fd[2]; + static void print_ksym(__u64 addr) { struct ksym *sym; @@ -52,14 +51,14 @@ static void print_stack(struct key_t *key, __u64 count) int i; printf("%s;", key->target); - if (bpf_map_lookup_elem(map_fd[3], &key->tret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) { printf("---;"); } else { for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) print_ksym(ip[i]); } printf("-;"); - if (bpf_map_lookup_elem(map_fd[3], &key->wret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) { printf("---;"); } else { for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) @@ -95,24 +94,49 @@ static void int_exit(int sig) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + struct bpf_link *links[2]; + struct bpf_program *prog; + int delay = 1, i = 0; char filename[256]; - int delay = 1; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } if (argc > 1) @@ -120,5 +144,10 @@ int main(int argc, char **argv) sleep(delay); print_stacks(map_fd[0]); +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c index e504dc308371..a3f8a3998e0a 100644 --- a/samples/bpf/sampleip_kern.c +++ b/samples/bpf/sampleip_kern.c @@ -4,7 +4,6 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> @@ -13,12 +12,12 @@ #define MAX_IPS 8192 -struct bpf_map_def SEC("maps") ip_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u64), - .value_size = sizeof(u32), - .max_entries = MAX_IPS, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, u32); + __uint(max_entries, MAX_IPS); +} ip_map SEC(".maps"); SEC("perf_event") int do_sample(struct bpf_perf_event_data *ctx) diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index b0f115f938bc..921c505bb567 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -10,13 +10,11 @@ #include <errno.h> #include <signal.h> #include <string.h> -#include <assert.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/bpf.h> -#include <sys/ioctl.h> +#include <bpf/bpf.h> #include <bpf/libbpf.h> -#include "bpf_load.h" #include "perf-sys.h" #include "trace_helpers.h" @@ -25,6 +23,7 @@ #define MAX_IPS 8192 #define PAGE_OFFSET 0xffff880000000000 +static int map_fd; static int nr_cpus; static void usage(void) @@ -34,9 +33,10 @@ static void usage(void) printf(" duration # sampling duration (seconds), default 5\n"); } -static int sampling_start(int *pmu_fd, int freq) +static int sampling_start(int freq, struct bpf_program *prog, + struct bpf_link *links[]) { - int i; + int i, pmu_fd; struct perf_event_attr pe_sample_attr = { .type = PERF_TYPE_SOFTWARE, @@ -47,26 +47,30 @@ static int sampling_start(int *pmu_fd, int freq) }; for (i = 0; i < nr_cpus; i++) { - pmu_fd[i] = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i, + pmu_fd = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i, -1 /* group_fd */, 0 /* flags */); - if (pmu_fd[i] < 0) { + if (pmu_fd < 0) { fprintf(stderr, "ERROR: Initializing perf sampling\n"); return 1; } - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, - prog_fd[0]) == 0); - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + links[i] = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: Attach perf event\n"); + links[i] = NULL; + close(pmu_fd); + return 1; + } } return 0; } -static void sampling_end(int *pmu_fd) +static void sampling_end(struct bpf_link *links[]) { int i; for (i = 0; i < nr_cpus; i++) - close(pmu_fd[i]); + bpf_link__destroy(links[i]); } struct ipcount { @@ -128,14 +132,17 @@ static void print_ip_map(int fd) static void int_exit(int sig) { printf("\n"); - print_ip_map(map_fd[0]); + print_ip_map(map_fd); exit(0); } int main(int argc, char **argv) { + int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1; + struct bpf_object *obj = NULL; + struct bpf_program *prog; + struct bpf_link **links; char filename[256]; - int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS; /* process arguments */ while ((opt = getopt(argc, argv, "F:h")) != -1) { @@ -163,38 +170,58 @@ int main(int argc, char **argv) } /* create perf FDs for each CPU */ - nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - pmu_fd = malloc(nr_cpus * sizeof(int)); - if (pmu_fd == NULL) { - fprintf(stderr, "ERROR: malloc of pmu_fd\n"); - return 1; + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + links = calloc(nr_cpus, sizeof(struct bpf_link *)); + if (!links) { + fprintf(stderr, "ERROR: malloc of links\n"); + goto cleanup; } - /* load BPF program */ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - fprintf(stderr, "ERROR: loading BPF program (errno %d):\n", - errno); - if (strcmp(bpf_log_buf, "") == 0) - fprintf(stderr, "Try: ulimit -l unlimited\n"); - else - fprintf(stderr, "%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "do_sample"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); /* do sampling */ printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", freq, secs); - if (sampling_start(pmu_fd, freq) != 0) - return 1; + if (sampling_start(freq, prog, links) != 0) + goto cleanup; + sleep(secs); - sampling_end(pmu_fd); - free(pmu_fd); + error = 0; +cleanup: + sampling_end(links); /* output sample counts */ - print_ip_map(map_fd[0]); + if (!error) + print_ip_map(map_fd); - return 0; + free(links); + bpf_object__close(obj); + return error; } diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 00aae1d33fca..5b66f2401b96 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -29,6 +29,7 @@ #include <bpf/bpf.h> #include "bpf_insn.h" #include "sock_example.h" +#include "bpf_util.h" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -37,8 +38,8 @@ static int test_sock(void) int sock = -1, map_fd, prog_fd, i, key; long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), - 256, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value), + 256, NULL); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup; @@ -54,14 +55,18 @@ static int test_sock(void) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + size_t insns_cnt = ARRAY_SIZE(prog); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, insns_cnt, &opts); if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup; diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 3c83722877dc..9e8d39e245c1 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -11,17 +11,26 @@ int main(int ac, char **argv) { struct bpf_object *obj; + struct bpf_program *prog; int map_fd, prog_fd; char filename[256]; - int i, sock; + int i, sock, err; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, - &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); sock = open_raw_sock("lo"); diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index a41dd520bc53..b7997541f7ee 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -1,12 +1,12 @@ #include <uapi/linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include "bpf_legacy.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/ip.h> #include <uapi/linux/ipv6.h> #include <uapi/linux/if_tunnel.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index af925a5afd1d..2c18471336f0 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -7,7 +7,6 @@ #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> -#include <sys/resource.h> struct pair { __u64 packets; @@ -16,20 +15,26 @@ struct pair { int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; struct bpf_object *obj; int map_fd, prog_fd; char filename[256]; - int i, sock; + int i, sock, err; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); - if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, - &obj, &prog_fd)) + err = bpf_object__load(obj); + if (err) return 1; + prog_fd = bpf_program__fd(prog); map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); sock = open_raw_sock("lo"); diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 36d4dac23549..b363503357e5 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -5,8 +5,6 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include "bpf_legacy.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> #include <uapi/linux/if_ether.h> @@ -14,28 +12,32 @@ #include <uapi/linux/ipv6.h> #include <uapi/linux/if_tunnel.h> #include <uapi/linux/mpls.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF #define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") jmp_table = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 8, -}; +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 8); +} jmp_table SEC(".maps"); #define PARSE_VLAN 1 #define PARSE_MPLS 2 #define PARSE_IP 3 #define PARSE_IPV6 4 -/* protocol dispatch routine. - * It tail-calls next BPF program depending on eth proto - * Note, we could have used: - * bpf_tail_call(skb, &jmp_table, proto); - * but it would need large prog_array +/* Protocol dispatch routine. It tail-calls next BPF program depending + * on eth proto. Note, we could have used ... + * + * bpf_tail_call(skb, &jmp_table, proto); + * + * ... but it would need large prog_array and cannot be optimised given + * the map key is not static. */ static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) { @@ -92,12 +94,12 @@ struct globals { struct flow_key_record flow; }; -struct bpf_map_def SEC("maps") percpu_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(struct globals), - .max_entries = 32, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct globals); + __uint(max_entries, 32); +} percpu_map SEC(".maps"); /* user poor man's per_cpu until native support is ready */ static struct globals *this_cpu_globals(void) @@ -113,12 +115,12 @@ struct pair { __u64 bytes; }; -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct flow_key_record), - .value_size = sizeof(struct pair), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct flow_key_record); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); static void update_stats(struct __sk_buff *skb, struct globals *g) { diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index bbb1cd0666a9..cd6fa79df900 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> -#include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> -#include <sys/resource.h> - -#define PARSE_IP 3 -#define PARSE_IP_PROG_FD (prog_fd[0]) -#define PROG_ARRAY_FD (map_fd[0]) struct flow_key_record { __be32 src; @@ -30,31 +24,53 @@ struct pair { int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; + struct bpf_program *prog; + struct bpf_object *obj; + const char *section; char filename[256]; FILE *f; - int i, sock, err, id, key = PARSE_IP; - struct bpf_prog_info info = {}; - uint32_t info_len = sizeof(info); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table"); + hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + if (jmp_table_fd < 0 || hash_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - /* Test fd array lookup which returns the id of the bpf_prog */ - err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len); - assert(!err); - err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id); - assert(!err); - assert(id == info.id); + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + + section = bpf_program__section_name(prog); + if (sscanf(section, "socket/%d", &key) != 1) { + fprintf(stderr, "ERROR: finding prog failed\n"); + goto cleanup; + } + + if (key == 0) + main_prog_fd = fd; + else + bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY); + } sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + /* attach BPF program to socket */ + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, sizeof(__u32)) == 0); if (argc > 1) @@ -69,8 +85,8 @@ int main(int argc, char **argv) sleep(1); printf("IP src.port -> dst.port bytes packets\n"); - while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[2], &next_key, &value); + while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(hash_map_fd, &next_key, &value); printf("%s.%05d -> %s.%05d %12lld %12lld\n", inet_ntoa((struct in_addr){htonl(next_key.src)}), next_key.port16[0], @@ -80,5 +96,8 @@ int main(int argc, char **argv) key = next_key; } } + +cleanup: + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c index f508af357251..455da77319d9 100644 --- a/samples/bpf/spintest_kern.c +++ b/samples/bpf/spintest_kern.c @@ -12,25 +12,25 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; -struct bpf_map_def SEC("maps") my_map2 = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_map2 SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define PROG(foo) \ int foo(struct pt_regs *ctx) \ diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index fb430ea2ef51..aadac14f748a 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -1,40 +1,70 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <unistd.h> -#include <linux/bpf.h> #include <string.h> #include <assert.h> -#include <sys/resource.h> #include <bpf/libbpf.h> -#include "bpf_load.h" +#include <bpf/bpf.h> #include "trace_helpers.h" int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256], symbol[256]; + struct bpf_object *obj = NULL; + struct bpf_link *links[20]; long key, next_key, value; - char filename[256]; + struct bpf_program *prog; + int map_fd, i, j = 0; + const char *section; struct ksym *sym; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + section = bpf_program__section_name(prog); + if (sscanf(section, "kprobe/%s", symbol) != 1) + continue; + + /* Attach prog only when symbol exists */ + if (ksym_get_addr(symbol)) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } } for (i = 0; i < 5; i++) { key = 0; printf("kprobing funcs:"); - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); assert(next_key == value); sym = ksym_search(value); key = next_key; @@ -48,10 +78,15 @@ int main(int ac, char **argv) if (key) printf("\n"); key = 0; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) - bpf_map_delete_elem(map_fd[0], &next_key); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) + bpf_map_delete_elem(map_fd, &next_key); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c index 5a62b03b1f88..50231c2eff9c 100644 --- a/samples/bpf/syscall_tp_kern.c +++ b/samples/bpf/syscall_tp_kern.c @@ -18,19 +18,19 @@ struct syscalls_exit_open_args { long ret; }; -struct bpf_map_def SEC("maps") enter_open_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} enter_open_map SEC(".maps"); -struct bpf_map_def SEC("maps") exit_open_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} exit_open_map SEC(".maps"); static __always_inline void count(void *map) { diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index 57014bab7cbe..7a788bb837fc 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -5,16 +5,11 @@ #include <unistd.h> #include <fcntl.h> #include <stdlib.h> -#include <signal.h> -#include <linux/bpf.h> #include <string.h> #include <linux/perf_event.h> #include <errno.h> -#include <assert.h> -#include <stdbool.h> -#include <sys/resource.h> +#include <bpf/libbpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" /* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. @@ -40,6 +35,9 @@ static void verify_map(int map_id) fprintf(stderr, "failed: map #%d returns value 0\n", map_id); return; } + + printf("verify map:%d val: %d\n", map_id, val); + val = 0; if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { fprintf(stderr, "map_update failed: %s\n", strerror(errno)); @@ -49,16 +47,44 @@ static void verify_map(int map_id) static int test(char *filename, int num_progs) { - int i, fd, map0_fds[num_progs], map1_fds[num_progs]; + int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0; + struct bpf_link *links[num_progs * 4]; + struct bpf_object *objs[num_progs]; + struct bpf_program *prog; for (i = 0; i < num_progs; i++) { - if (load_bpf_file(filename)) { - fprintf(stderr, "%s", bpf_log_buf); - return 1; + objs[i] = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(objs[i])) { + fprintf(stderr, "opening BPF object file failed\n"); + objs[i] = NULL; + goto cleanup; } - printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]); - map0_fds[i] = map_fd[0]; - map1_fds[i] = map_fd[1]; + + /* load BPF program */ + if (bpf_object__load(objs[i])) { + fprintf(stderr, "loading BPF object file failed\n"); + goto cleanup; + } + + map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "enter_open_map"); + map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "exit_open_map"); + if (map0_fds[i] < 0 || map1_fds[i] < 0) { + fprintf(stderr, "finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, objs[i]) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]); } /* current load_bpf_file has perf_event_open default pid = -1 @@ -80,12 +106,17 @@ static int test(char *filename, int num_progs) verify_map(map1_fds[i]); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + for (i--; i >= 0; i--) + bpf_object__close(objs[i]); return 0; } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int opt, num_progs = 1; char filename[256]; @@ -101,7 +132,6 @@ int main(int argc, char **argv) } } - setrlimit(RLIMIT_MEMLOCK, &r); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); return test(filename, num_progs); diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c index 278ade5427c8..186ac0a79c0a 100644 --- a/samples/bpf/task_fd_query_kern.c +++ b/samples/bpf/task_fd_query_kern.c @@ -10,7 +10,7 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } -SEC("kretprobe/blk_account_io_completion") +SEC("kretprobe/__blk_account_io_done") int bpf_prog2(struct pt_regs *ctx) { return 0; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index ff2e9c1c7266..a33d74bd3a4b 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -10,17 +10,19 @@ #include <fcntl.h> #include <linux/bpf.h> #include <sys/ioctl.h> -#include <sys/resource.h> #include <sys/types.h> #include <sys/stat.h> #include <linux/perf_event.h> +#include <bpf/bpf.h> #include <bpf/libbpf.h> -#include "bpf_load.h" #include "bpf_util.h" #include "perf-sys.h" #include "trace_helpers.h" +static struct bpf_program *progs[2]; +static struct bpf_link *links[2]; + #define CHECK_PERROR_RET(condition) ({ \ int __ret = !!(condition); \ if (__ret) { \ @@ -86,21 +88,22 @@ static int bpf_get_retprobe_bit(const char *event_type) return ret; } -static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, +static int test_debug_fs_kprobe(int link_idx, const char *fn_name, __u32 expected_fd_type) { __u64 probe_offset, probe_addr; __u32 len, prog_id, fd_type; + int err, event_fd; char buf[256]; - int err; len = sizeof(buf); - err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, + event_fd = bpf_link__fd(links[link_idx]); + err = bpf_task_fd_query(getpid(), event_fd, 0, buf, &len, &prog_id, &fd_type, &probe_offset, &probe_addr); if (err < 0) { printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", - __func__, prog_fd_idx, fn_name); + __func__, link_idx, fn_name); perror(" :"); return -1; } @@ -108,7 +111,7 @@ static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, fd_type != expected_fd_type || probe_offset != 0x0 || probe_addr != 0x0) { printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", - prog_fd_idx); + link_idx); printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," " probe_addr: 0x%llx\n", buf, fd_type, probe_offset, probe_addr); @@ -125,12 +128,13 @@ static int test_nondebug_fs_kuprobe_common(const char *event_type, int is_return_bit = bpf_get_retprobe_bit(event_type); int type = bpf_find_probe_type(event_type); struct perf_event_attr attr = {}; - int fd; + struct bpf_link *link; + int fd, err = -1; if (type < 0 || is_return_bit < 0) { printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", __func__, type, is_return_bit); - return -1; + return err; } attr.sample_period = 1; @@ -149,14 +153,21 @@ static int test_nondebug_fs_kuprobe_common(const char *event_type, attr.type = type; fd = sys_perf_event_open(&attr, -1, 0, -1, 0); - CHECK_PERROR_RET(fd < 0); + link = bpf_program__attach_perf_event(progs[0], fd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(fd); + goto cleanup; + } - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, prog_id, fd_type, probe_offset, probe_addr) < 0); + err = 0; - return 0; +cleanup: + bpf_link__destroy(link); + return err; } static int test_nondebug_fs_probe(const char *event_type, const char *name, @@ -215,17 +226,18 @@ static int test_nondebug_fs_probe(const char *event_type, const char *name, static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) { + char buf[256], event_alias[sizeof("test_1234567890")]; const char *event_type = "uprobe"; struct perf_event_attr attr = {}; - char buf[256], event_alias[sizeof("test_1234567890")]; __u64 probe_offset, probe_addr; __u32 len, prog_id, fd_type; - int err, res, kfd, efd; + int err = -1, res, kfd, efd; + struct bpf_link *link; ssize_t bytes; snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); - kfd = open(buf, O_WRONLY | O_APPEND, 0); + kfd = open(buf, O_WRONLY | O_TRUNC, 0); CHECK_PERROR_RET(kfd < 0); res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); @@ -254,10 +266,15 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) attr.type = PERF_TYPE_TRACEPOINT; attr.sample_period = 1; attr.wakeup_events = 1; + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); - CHECK_PERROR_RET(kfd < 0); - CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); - CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); + link = bpf_program__attach_perf_event(progs[0], kfd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(kfd); + goto cleanup; + } len = sizeof(buf); err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, @@ -283,38 +300,55 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) probe_offset); return -1; } + err = 0; - close(kfd); - return 0; +cleanup: + bpf_link__destroy(link); + return err; } int main(int argc, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; + struct bpf_program *prog; + struct bpf_object *obj; + int i = 0, err = -1; + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return err; + } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; } - if (load_kallsyms()) { - printf("failed to process /proc/kallsyms\n"); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + progs[i] = prog; + links[i] = bpf_program__attach(progs[i]); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } /* test two functions in the corresponding *_kern.c file */ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", BPF_FD_TYPE_KPROBE)); - CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion", + CHECK_AND_RET(test_debug_fs_kprobe(1, "__blk_account_io_done", BPF_FD_TYPE_KRETPROBE)); /* test nondebug fs kprobe */ @@ -361,7 +395,7 @@ int main(int argc, char **argv) * on different systems with different compilers. The right way is * to parse the ELF file. We took a shortcut here. */ - uprobe_file_offset = (__u64)main - (__u64)&__executable_start; + uprobe_file_offset = (unsigned long)main - (unsigned long)&__executable_start; CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], uprobe_file_offset, 0x0, false, BPF_FD_TYPE_UPROBE, @@ -378,6 +412,12 @@ int main(int argc, char **argv) false)); CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, true)); + err = 0; - return 0; +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return err; } diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c index 6d564aa75447..05e88aa63009 100644 --- a/samples/bpf/test_cgrp2_array_pin.c +++ b/samples/bpf/test_cgrp2_array_pin.c @@ -64,9 +64,9 @@ int main(int argc, char **argv) } if (create_array) { - array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, + array_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_ARRAY, NULL, sizeof(uint32_t), sizeof(uint32_t), - 1, 0); + 1, NULL); if (array_fd < 0) { fprintf(stderr, "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 20fbd1241db3..68ce69457afe 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -31,6 +31,7 @@ #include <bpf/bpf.h> #include "bpf_insn.h" +#include "bpf_util.h" enum { MAP_KEY_PACKETS, @@ -53,7 +54,7 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), /* Count bytes */ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ @@ -64,16 +65,20 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); + size_t insns_cnt = ARRAY_SIZE(prog); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); + + return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB, NULL, "GPL", + prog, insns_cnt, &opts); } static int usage(const char *argv0) @@ -89,9 +94,9 @@ static int attach_filter(int cg_fd, int type, int verdict) int prog_fd, map_fd, ret, key; long long pkt_cnt, byte_cnt; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(byte_cnt), - 256, 0); + 256, NULL); if (map_fd < 0) { printf("Failed to create map: '%s'\n", strerror(errno)); return EXIT_FAILURE; diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index b0811da5a00f..a0811df888f4 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -70,6 +70,10 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); struct bpf_insn *prog; size_t insns_cnt; @@ -115,8 +119,8 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) insns_cnt /= sizeof(struct bpf_insn); - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", + prog, insns_cnt, &opts); free(prog); diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index a9277b118c33..e7060aaa2f5a 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -20,9 +20,9 @@ #include <net/if.h> #include <linux/bpf.h> #include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "bpf_insn.h" -#include "bpf_load.h" static int usage(const char *argv0) { @@ -32,37 +32,64 @@ static int usage(const char *argv0) int main(int argc, char **argv) { - int cg_fd, ret, filter_id = 0; + int cg_fd, err, ret = EXIT_FAILURE, filter_id = 0, prog_cnt = 0; + const char *link_pin_path = "/sys/fs/bpf/test_cgrp2_sock2"; + struct bpf_link *link = NULL; + struct bpf_program *progs[2]; + struct bpf_program *prog; + struct bpf_object *obj; if (argc < 3) return usage(argv[0]); + if (argc > 3) + filter_id = atoi(argv[3]); + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); if (cg_fd < 0) { printf("Failed to open cgroup path: '%s'\n", strerror(errno)); - return EXIT_FAILURE; + return ret; } - if (load_bpf_file(argv[2])) - return EXIT_FAILURE; - - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); + obj = bpf_object__open_file(argv[2], NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + return ret; + } - if (argc > 3) - filter_id = atoi(argv[3]); + bpf_object__for_each_program(prog, obj) { + progs[prog_cnt] = prog; + prog_cnt++; + } if (filter_id >= prog_cnt) { printf("Invalid program id; program not found in file\n"); - return EXIT_FAILURE; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; } - ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; + link = bpf_program__attach_cgroup(progs[filter_id], cg_fd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } - return EXIT_SUCCESS; + err = bpf_link__pin(link, link_pin_path); + if (err < 0) { + printf("ERROR: bpf_link__pin failed: %d\n", err); + goto cleanup; + } + + ret = EXIT_SUCCESS; + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return ret; } diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh index 0f396a86e0cb..6a3dbe642b2b 100755 --- a/samples/bpf/test_cgrp2_sock2.sh +++ b/samples/bpf/test_cgrp2_sock2.sh @@ -1,6 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +BPFFS=/sys/fs/bpf +LINK_PIN=$BPFFS/test_cgrp2_sock2 + function config_device { ip netns add at_ns0 ip link add veth0 type veth peer name veth0b @@ -21,16 +24,22 @@ function config_cgroup { echo $$ >> /tmp/cgroupv2/foo/cgroup.procs } +function config_bpffs { + if mount | grep $BPFFS > /dev/null; then + echo "bpffs already mounted" + else + echo "bpffs not mounted. Mounting..." + mount -t bpf none $BPFFS + fi +} function attach_bpf { - test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 + ./test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 [ $? -ne 0 ] && exit 1 } function cleanup { - if [ -d /tmp/cgroupv2/foo ]; then - test_cgrp2_sock -d /tmp/cgroupv2/foo - fi + rm -rf $LINK_PIN ip link del veth0b ip netns delete at_ns0 umount /tmp/cgroupv2 @@ -42,6 +51,7 @@ cleanup 2>/dev/null set -e config_device config_cgroup +config_bpffs set +e # @@ -62,6 +72,9 @@ if [ $? -eq 0 ]; then exit 1 fi +rm -rf $LINK_PIN +sleep 1 # Wait for link detach + # # Test 2 - fail ping # diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c index 6dc4f41bb6cb..fbd43e2bb4d3 100644 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -10,23 +10,24 @@ #include <linux/version.h> #include <bpf/bpf_helpers.h> #include <uapi/linux/utsname.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") cgroup_map = { - .type = BPF_MAP_TYPE_CGROUP_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); -struct bpf_map_def SEC("maps") perf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +} perf_map SEC(".maps"); /* Writes the last PID that called sync to a map at index 0 */ -SEC("kprobe/sys_sync") +SEC("kprobe/" SYSCALL(sys_sync)) int bpf_prog1(struct pt_regs *ctx) { u64 pid = bpf_get_current_pid_tgid(); diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index 06e9f8ce42e2..ac251a417f45 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -4,10 +4,9 @@ #define _GNU_SOURCE #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "cgroup_helpers.h" #define CGROUP_PATH "/my-cgroup" @@ -15,13 +14,44 @@ int main(int argc, char **argv) { pid_t remote_pid, local_pid = getpid(); - int cg2, idx = 0, rc = 0; + struct bpf_link *link = NULL; + struct bpf_program *prog; + int cg2, idx = 0, rc = 1; + struct bpf_object *obj; char filename[256]; + int map_fd[2]; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } if (setup_cgroup_environment()) @@ -70,12 +100,14 @@ int main(int argc, char **argv) goto err; } - goto out; -err: - rc = 1; + rc = 0; -out: +err: close(cg2); cleanup_cgroup_environment(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return rc; } diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh deleted file mode 100755 index 9e507c305c6e..000000000000 --- a/samples/bpf/test_ipip.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -function config_device { - ip netns add at_ns0 - ip netns add at_ns1 - ip netns add at_ns2 - ip link add veth0 type veth peer name veth0b - ip link add veth1 type veth peer name veth1b - ip link add veth2 type veth peer name veth2b - ip link set veth0b up - ip link set veth1b up - ip link set veth2b up - ip link set dev veth0b mtu 1500 - ip link set dev veth1b mtu 1500 - ip link set dev veth2b mtu 1500 - ip link set veth0 netns at_ns0 - ip link set veth1 netns at_ns1 - ip link set veth2 netns at_ns2 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1 - ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad - ip netns exec at_ns1 ip link set dev veth1 up - ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2 - ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad - ip netns exec at_ns2 ip link set dev veth2 up - ip link add br0 type bridge - ip link set br0 up - ip link set dev br0 mtu 1500 - ip link set veth0b master br0 - ip link set veth1b master br0 - ip link set veth2b master br0 -} - -function add_ipip_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ipip external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ipip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ip6ip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64 -} - -function attach_bpf { - DEV=$1 - SET_TUNNEL=$2 - GET_TUNNEL=$3 - ip netns exec at_ns2 tc qdisc add dev $DEV clsact - ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL - ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL -} - -function test_ipip { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip_tunnel - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv4 over IPv6 tunnel -function test_ipip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip6_tunnel - attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv6 over IPv6 tunnel -function test_ip6ip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ip6ip6_tunnel - attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel - - ip netns exec at_ns0 ping -6 -c 1 2601:646::2 - ip netns exec at_ns2 ping -6 -c 1 2601:646::1 - ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200 - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201 - cleanup -} - -function cleanup { - set +ex - pkill iperf - ip netns delete at_ns0 - ip netns delete at_ns1 - ip netns delete at_ns2 - ip link del veth0 - ip link del veth1 - ip link del veth2 - ip link del br0 - pkill tcpdump - pkill cat - set -ex -} - -cleanup -echo "Testing IP tunnels..." -test_ipip -test_ipip6 -test_ip6ip6 -echo "*** PASS ***" diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index b313dba4111b..5efb91763d65 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -13,7 +13,6 @@ #include <sched.h> #include <sys/wait.h> #include <sys/stat.h> -#include <sys/resource.h> #include <fcntl.h> #include <stdlib.h> #include <time.h> @@ -105,10 +104,10 @@ struct pfect_lru { static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size, unsigned int nr_possible_elems) { - lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + lru->map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(unsigned long long), sizeof(struct pfect_lru_node *), - nr_possible_elems, 0); + nr_possible_elems, NULL); assert(lru->map_fd != -1); lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node)); @@ -207,10 +206,13 @@ static unsigned int read_keys(const char *dist_file, static int create_map(int map_type, int map_flags, unsigned int size) { + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = map_flags, + ); int map_fd; - map_fd = bpf_create_map(map_type, sizeof(unsigned long long), - sizeof(unsigned long long), size, map_flags); + map_fd = bpf_map_create(map_type, NULL, sizeof(unsigned long long), + sizeof(unsigned long long), size, &opts); if (map_fd == -1) perror("bpf_create_map"); @@ -489,7 +491,6 @@ static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; const char *dist_file; int nr_tasks = 1; @@ -508,8 +509,6 @@ int main(int argc, char **argv) setbuf(stdout, NULL); - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); - srand(time(NULL)); nr_cpus = bpf_num_possible_cpus(); @@ -524,7 +523,7 @@ int main(int argc, char **argv) return -1; } - for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { + for (f = 0; f < ARRAY_SIZE(map_flags); f++) { test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f], diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh index 65a976058dd3..65a976058dd3 100644..100755 --- a/samples/bpf/test_lwt_bpf.sh +++ b/samples/bpf/test_lwt_bpf.sh diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index 6cee61e8ce9b..b0200c8eac09 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -11,66 +11,67 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/in6.h> #include <bpf/bpf_helpers.h> -#include "bpf_legacy.h" #include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" #define MAX_NR_PORTS 65536 /* map #0 */ -struct bpf_map_def_legacy SEC("maps") port_a = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = MAX_NR_PORTS, -}; +struct inner_a { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, MAX_NR_PORTS); +} port_a SEC(".maps"); /* map #1 */ -struct bpf_map_def_legacy SEC("maps") port_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct inner_h { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} port_h SEC(".maps"); /* map #2 */ -struct bpf_map_def_legacy SEC("maps") reg_result_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} reg_result_h SEC(".maps"); /* map #3 */ -struct bpf_map_def_legacy SEC("maps") inline_result_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} inline_result_h SEC(".maps"); /* map #4 */ /* Test case #0 */ -struct bpf_map_def_legacy SEC("maps") a_of_port_a = { - .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 0, /* map_fd[0] is port_a */ - .max_entries = MAX_NR_PORTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_PORTS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} a_of_port_a SEC(".maps"); /* map #5 */ /* Test case #1 */ -struct bpf_map_def_legacy SEC("maps") h_of_port_a = { - .type = BPF_MAP_TYPE_HASH_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 0, /* map_fd[0] is port_a */ - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} h_of_port_a SEC(".maps"); /* map #6 */ /* Test case #2 */ -struct bpf_map_def_legacy SEC("maps") h_of_port_h = { - .type = BPF_MAP_TYPE_HASH_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 1, /* map_fd[1] is port_h */ - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_h); /* use inner_h as inner map */ +} h_of_port_h SEC(".maps"); static __always_inline int do_reg_lookup(void *inner_map, u32 port) { @@ -102,7 +103,7 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? *result : -ENOENT; } -SEC("kprobe/sys_connect") +SEC("kprobe/__sys_connect") int trace_sys_connect(struct pt_regs *ctx) { struct sockaddr_in6 *in6; @@ -112,8 +113,8 @@ int trace_sys_connect(struct pt_regs *ctx) void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); - addrlen = (int)PT_REGS_PARM3(ctx); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); + addrlen = (int)PT_REGS_PARM3_CORE(ctx); if (addrlen != sizeof(*in6)) return 0; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c index eb29bcb76f3f..652ec720533d 100644 --- a/samples/bpf/test_map_in_map_user.c +++ b/samples/bpf/test_map_in_map_user.c @@ -2,7 +2,6 @@ /* * Copyright (c) 2017 Facebook */ -#include <sys/resource.h> #include <sys/socket.h> #include <arpa/inet.h> #include <stdint.h> @@ -11,7 +10,11 @@ #include <stdlib.h> #include <stdio.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> + +#include "bpf_util.h" + +static int map_fd[7]; #define PORT_A (map_fd[0]) #define PORT_H (map_fd[1]) @@ -27,7 +30,7 @@ static const char * const test_names[] = { "Hash of Hash", }; -#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) +#define NR_TESTS ARRAY_SIZE(test_names) static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key) { @@ -112,19 +115,54 @@ static void test_map_in_map(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "trace_sys_connect"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "port_a"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "port_h"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "reg_result_h"); + map_fd[3] = bpf_object__find_map_fd_by_name(obj, "inline_result_h"); + map_fd[4] = bpf_object__find_map_fd_by_name(obj, "a_of_port_a"); + map_fd[5] = bpf_object__find_map_fd_by_name(obj, "h_of_port_a"); + map_fd[6] = bpf_object__find_map_fd_by_name(obj, "h_of_port_h"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0 || + map_fd[3] < 0 || map_fd[4] < 0 || map_fd[5] < 0 || map_fd[6] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } test_map_in_map(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c index 8b811c29dc79..8fdd2c9c56b2 100644 --- a/samples/bpf/test_overhead_kprobe_kern.c +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -6,27 +6,34 @@ */ #include <linux/version.h> #include <linux/ptrace.h> +#include <linux/sched.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define _(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) SEC("kprobe/__set_task_comm") int prog(struct pt_regs *ctx) { struct signal_struct *signal; struct task_struct *tsk; - char oldcomm[16] = {}; - char newcomm[16] = {}; + char oldcomm[TASK_COMM_LEN] = {}; + char newcomm[TASK_COMM_LEN] = {}; u16 oom_score_adj; u32 pid; tsk = (void *)PT_REGS_PARM1(ctx); pid = _(tsk->pid); - bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm); - bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx)); + bpf_probe_read_kernel_str(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_probe_read_kernel_str(newcomm, sizeof(newcomm), + (void *)PT_REGS_PARM2(ctx)); signal = _(tsk->signal); oom_score_adj = _(signal->oom_score_adj); return 0; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c index eaa32693f8fc..80edadacb692 100644 --- a/samples/bpf/test_overhead_tp_kern.c +++ b/samples/bpf/test_overhead_tp_kern.c @@ -4,6 +4,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ +#include <linux/sched.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> @@ -11,8 +12,8 @@ struct task_rename { __u64 pad; __u32 pid; - char oldcomm[16]; - char newcomm[16]; + char oldcomm[TASK_COMM_LEN]; + char newcomm[TASK_COMM_LEN]; __u16 oom_score_adj; }; SEC("tracepoint/task/task_rename") diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c index 94f74112a20e..88717f8ec6ac 100644 --- a/samples/bpf/test_overhead_user.c +++ b/samples/bpf/test_overhead_user.c @@ -16,12 +16,15 @@ #include <linux/bpf.h> #include <string.h> #include <time.h> -#include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #define MAX_CNT 1000000 +static struct bpf_link *links[2]; +static struct bpf_object *obj; +static int cnt; + static __u64 time_get_ns(void) { struct timespec ts; @@ -115,22 +118,54 @@ static void run_perf_test(int tasks, int flags) } } +static int load_progs(char *filename) +{ + struct bpf_program *prog; + int err = 0; + + obj = bpf_object__open_file(filename, NULL); + err = libbpf_get_error(obj); + if (err < 0) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + err = bpf_object__load(obj); + if (err < 0) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + return err; + } + + bpf_object__for_each_program(prog, obj) { + links[cnt] = bpf_program__attach(prog); + err = libbpf_get_error(links[cnt]); + if (err < 0) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[cnt] = NULL; + return err; + } + cnt++; + } + + return err; +} + static void unload_progs(void) { - close(prog_fd[0]); - close(prog_fd[1]); - close(event_fd[0]); - close(event_fd[1]); + while (cnt) + bpf_link__destroy(links[--cnt]); + + bpf_object__close(obj); } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int num_cpu = 8; + int num_cpu = sysconf(_SC_NPROCESSORS_ONLN); int test_flags = ~0; + char filename[256]; + int err = 0; - setrlimit(RLIMIT_MEMLOCK, &r); if (argc > 1) test_flags = atoi(argv[1]) ? : test_flags; @@ -145,38 +180,36 @@ int main(int argc, char **argv) if (test_flags & 0xC) { snprintf(filename, sizeof(filename), "%s_kprobe_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + printf("w/KPROBE\n"); - run_perf_test(num_cpu, test_flags >> 2); + err = load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 2); + unload_progs(); } if (test_flags & 0x30) { snprintf(filename, sizeof(filename), "%s_tp_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } printf("w/TRACEPOINT\n"); - run_perf_test(num_cpu, test_flags >> 4); + err = load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 4); + unload_progs(); } if (test_flags & 0xC0) { snprintf(filename, sizeof(filename), "%s_raw_tp_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } printf("w/RAW_TRACEPOINT\n"); - run_perf_test(num_cpu, test_flags >> 6); + err = load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 6); + unload_progs(); } - return 0; + return err; } diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh index e68b9ee6814b..35db26f736b9 100755 --- a/samples/bpf/test_override_return.sh +++ b/samples/bpf/test_override_return.sh @@ -1,5 +1,6 @@ #!/bin/bash +rm -r tmpmnt rm -f testfile.img dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 DEVICE=$(losetup --show -f testfile.img) diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c index f033f36a13a3..220a96438d75 100644 --- a/samples/bpf/test_probe_write_user_kern.c +++ b/samples/bpf/test_probe_write_user_kern.c @@ -10,13 +10,15 @@ #include <linux/version.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") dnat_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct sockaddr_in), - .value_size = sizeof(struct sockaddr_in), - .max_entries = 256, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sockaddr_in); + __type(value, struct sockaddr_in); + __uint(max_entries, 256); +} dnat_map SEC(".maps"); /* kprobe is NOT a stable ABI * kernel functions can be removed, renamed or completely change semantics. @@ -26,13 +28,14 @@ struct bpf_map_def SEC("maps") dnat_map = { * This example sits on a syscall, and the syscall ABI is relatively stable * of course, across platforms, and over time, the ABI may change. */ -SEC("kprobe/sys_connect") +SEC("kprobe/" SYSCALL(sys_connect)) int bpf_prog1(struct pt_regs *ctx) { + struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); + void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs); + int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs); struct sockaddr_in new_addr, orig_addr = {}; struct sockaddr_in *mapped_addr; - void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx); - int sockaddr_len = (int)PT_REGS_PARM3(ctx); if (sockaddr_len > sizeof(orig_addr)) return 0; diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c index 045eb5e30f54..00ccfb834e45 100644 --- a/samples/bpf/test_probe_write_user_user.c +++ b/samples/bpf/test_probe_write_user_user.c @@ -1,21 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> -#include <linux/bpf.h> #include <unistd.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include <sys/socket.h> -#include <string.h> #include <netinet/in.h> #include <arpa/inet.h> int main(int ac, char **argv) { - int serverfd, serverconnfd, clientfd; - socklen_t sockaddr_len; - struct sockaddr serv_addr, mapped_addr, tmp_addr; struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; + struct sockaddr serv_addr, mapped_addr, tmp_addr; + int serverfd, serverconnfd, clientfd, map_fd; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + socklen_t sockaddr_len; char filename[256]; char *ip; @@ -24,10 +25,35 @@ int main(int ac, char **argv) tmp_addr_in = (struct sockaddr_in *)&tmp_addr; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); @@ -51,7 +77,7 @@ int main(int ac, char **argv) mapped_addr_in->sin_port = htons(5555); mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); - assert(!bpf_map_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY)); + assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY)); assert(listen(serverfd, 5) == 0); @@ -75,5 +101,8 @@ int main(int ac, char **argv) /* Is the server's getsockname = the socket getpeername */ assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h new file mode 100644 index 000000000000..8cb5400aed1f --- /dev/null +++ b/samples/bpf/trace_common.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_COMMON_H +#define __TRACE_COMMON_H + +#ifdef __x86_64__ +#define SYSCALL(SYS) "__x64_" __stringify(SYS) +#elif defined(__s390x__) +#define SYSCALL(SYS) "__s390x_" __stringify(SYS) +#else +#define SYSCALL(SYS) __stringify(SYS) +#endif + +#endif diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index da1d69e20645..0bba5fcd7d24 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -5,7 +5,6 @@ * License as published by the Free Software Foundation. */ #include <linux/ptrace.h> -#include <linux/version.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> #include <uapi/linux/perf_event.h> @@ -18,19 +17,19 @@ struct key_t { u32 userstack; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 10000); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 356171bc392b..9664749bf618 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -6,22 +6,22 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> -#include <fcntl.h> -#include <poll.h> -#include <sys/ioctl.h> #include <linux/perf_event.h> #include <linux/bpf.h> #include <signal.h> -#include <assert.h> #include <errno.h> #include <sys/resource.h> +#include <bpf/bpf.h> #include <bpf/libbpf.h> -#include "bpf_load.h" #include "perf-sys.h" #include "trace_helpers.h" #define SAMPLE_FREQ 50 +static int pid; +/* counts, stackmap */ +static int map_fd[2]; +struct bpf_program *prog; static bool sys_read_seen, sys_write_seen; static void print_ksym(__u64 addr) @@ -91,10 +91,10 @@ static void print_stack(struct key_t *key, __u64 count) } } -static void int_exit(int sig) +static void err_exit(int err) { - kill(0, SIGKILL); - exit(0); + kill(pid, SIGKILL); + exit(err); } static void print_stacks(void) @@ -102,7 +102,7 @@ static void print_stacks(void) struct key_t key = {}, next_key; __u64 value; __u32 stackid = 0, next_id; - int fd = map_fd[0], stack_map = map_fd[1]; + int error = 1, fd = map_fd[0], stack_map = map_fd[1]; sys_read_seen = sys_write_seen = false; while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { @@ -114,7 +114,7 @@ static void print_stacks(void) printf("\n"); if (!sys_read_seen || !sys_write_seen) { printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); - int_exit(0); + err_exit(error); } /* clear stack map */ @@ -136,43 +136,52 @@ static inline int generate_load(void) static void test_perf_event_all_cpu(struct perf_event_attr *attr) { - int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - int *pmu_fd = malloc(nr_cpus * sizeof(int)); - int i, error = 0; + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link **links = calloc(nr_cpus, sizeof(struct bpf_link *)); + int i, pmu_fd, error = 1; + + if (!links) { + printf("malloc of links failed\n"); + goto err; + } /* system wide perf event, no need to inherit */ attr->inherit = 0; /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { - pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); - if (pmu_fd[i] < 0) { + pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0); + if (pmu_fd < 0) { printf("sys_perf_event_open failed\n"); - error = 1; goto all_cpu_err; } - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE) == 0); + links[i] = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(links[i])) { + printf("bpf_program__attach_perf_event failed\n"); + links[i] = NULL; + close(pmu_fd); + goto all_cpu_err; + } } - if (generate_load() < 0) { - error = 1; + if (generate_load() < 0) goto all_cpu_err; - } + print_stacks(); + error = 0; all_cpu_err: - for (i--; i >= 0; i--) { - ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE); - close(pmu_fd[i]); - } - free(pmu_fd); + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); +err: + free(links); if (error) - int_exit(0); + err_exit(error); } static void test_perf_event_task(struct perf_event_attr *attr) { - int pmu_fd, error = 0; + struct bpf_link *link = NULL; + int pmu_fd, error = 1; /* per task perf event, enable inherit so the "dd ..." command can be traced properly. * Enabling inherit will cause bpf_perf_prog_read_time helper failure. @@ -183,21 +192,25 @@ static void test_perf_event_task(struct perf_event_attr *attr) pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { printf("sys_perf_event_open failed\n"); - int_exit(0); + goto err; } - assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); - assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE) == 0); - - if (generate_load() < 0) { - error = 1; + link = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(link)) { + printf("bpf_program__attach_perf_event failed\n"); + link = NULL; + close(pmu_fd); goto err; } + + if (generate_load() < 0) + goto err; + print_stacks(); + error = 0; err: - ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); - close(pmu_fd); + bpf_link__destroy(link); if (error) - int_exit(0); + err_exit(error); } static void test_bpf_perf_event(void) @@ -281,30 +294,59 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; char filename[256]; + int error = 1; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + signal(SIGINT, err_exit); + signal(SIGTERM, err_exit); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); - return 1; + goto cleanup; + } + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 2; + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("loading BPF object file failed\n"); + goto cleanup; } - if (fork() == 0) { + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + printf("finding a counts/stackmap map in obj file failed\n"); + goto cleanup; + } + + pid = fork(); + if (pid == 0) { read_trace_pipe(); return 0; + } else if (pid == -1) { + printf("couldn't spawn process\n"); + goto cleanup; } + test_bpf_perf_event(); - int_exit(0); - return 0; + error = 0; + +cleanup: + bpf_object__close(obj); + err_exit(error); } diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index 1d7d422cae6f..b64815af0943 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -2,15 +2,16 @@ #include <linux/version.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 2, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 2); +} my_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("kprobe/" SYSCALL(sys_write)) int bpf_prog1(struct pt_regs *ctx) { struct S { diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index 60a17dd05345..371732f9cf8e 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only #include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <stdbool.h> -#include <string.h> #include <fcntl.h> #include <poll.h> -#include <linux/perf_event.h> -#include <linux/bpf.h> -#include <errno.h> -#include <assert.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/mman.h> #include <time.h> #include <signal.h> #include <bpf/libbpf.h> -#include "bpf_load.h" -#include "perf-sys.h" static __u64 time_get_ns(void) { @@ -56,21 +43,47 @@ static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) int main(int argc, char **argv) { - struct perf_buffer_opts pb_opts = {}; + struct bpf_link *link = NULL; + struct bpf_program *prog; struct perf_buffer *pb; + struct bpf_object *obj; + int map_fd, ret = 0; char filename[256]; FILE *f; - int ret; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; } - pb_opts.sample_cb = print_bpf_output; - pb = perf_buffer__new(map_fd[0], 8, &pb_opts); + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); ret = libbpf_get_error(pb); if (ret) { printf("failed to setup perf_buffer: %d\n", ret); @@ -84,5 +97,9 @@ int main(int argc, char **argv) while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { } kill(0, SIGINT); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return ret; } diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index 8e2610e14475..ef30d2b353b0 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -11,7 +11,12 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define _(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) /* kprobe is NOT a stable ABI * kernel functions can be removed, renamed or completely change semantics. @@ -21,7 +26,7 @@ SEC("kprobe/__netif_receive_skb_core") int bpf_prog1(struct pt_regs *ctx) { - /* attaches to kprobe netif_receive_skb, + /* attaches to kprobe __netif_receive_skb_core, * looks for packets on loobpack device and prints them */ char devname[IFNAMSIZ]; @@ -30,11 +35,11 @@ int bpf_prog1(struct pt_regs *ctx) int len; /* non-portable! works for the given kernel only */ - skb = (struct sk_buff *) PT_REGS_PARM1(ctx); + bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); dev = _(skb->dev); len = _(skb->len); - bpf_probe_read(devname, sizeof(devname), dev->name); + bpf_probe_read_kernel(devname, sizeof(devname), dev->name); if (devname[0] == 'l' && devname[1] == 'o') { char fmt[] = "skb %p len %d\n"; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c index af8c20608ab5..9d4adb7fd834 100644 --- a/samples/bpf/tracex1_user.c +++ b/samples/bpf/tracex1_user.c @@ -1,20 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> +#include "trace_helpers.h" int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } f = popen("taskset 1 ping -c5 localhost", "r"); @@ -22,5 +43,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index d865bb309bcb..5bc696bac27d 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -10,13 +10,14 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -70,14 +71,14 @@ struct hist_key { u64 index; }; -struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(struct hist_key), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct hist_key)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_hist_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("kprobe/" SYSCALL(sys_write)) int bpf_prog3(struct pt_regs *ctx) { long write_size = PT_REGS_PARM3(ctx); diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index c9544a4ce61a..dd6205c6b6a7 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -3,17 +3,18 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> #include <string.h> -#include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define MAX_INDEX 64 #define MAX_STARS 38 +/* my_map, my_hist_map */ +static int map_fd[2]; + static void stars(char *str, long val, long max, int width) { int i; @@ -114,17 +115,32 @@ static void int_exit(int sig) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; - char filename[256]; long key, next_key, value; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int i, j = 0; FILE *f; - int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } signal(SIGINT, int_exit); @@ -138,9 +154,14 @@ int main(int ac, char **argv) f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); (void) f; - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; i < 5; i++) { @@ -156,5 +177,10 @@ int main(int ac, char **argv) } print_hist(map_fd[1]); +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index fe21c14feb8d..bde6591cb20c 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -11,12 +11,12 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(u64), - .max_entries = 4096, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, u64); + __uint(max_entries, 4096); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -42,14 +42,14 @@ static unsigned int log2l(unsigned long long n) #define SLOTS 100 -struct bpf_map_def SEC("maps") lat_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = SLOTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, SLOTS); +} lat_map SEC(".maps"); -SEC("kprobe/blk_account_io_completion") +SEC("kprobe/__blk_account_io_done") int bpf_prog2(struct pt_regs *ctx) { long rq = PT_REGS_PARM1(ctx); diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index cf8fedc773f2..d5eebace31e6 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -7,11 +7,9 @@ #include <unistd.h> #include <stdbool.h> #include <string.h> -#include <linux/bpf.h> -#include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define SLOTS 100 @@ -108,21 +106,11 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + int map_fd, i, j = 0; for (i = 1; i < ac; i++) { if (strcmp(argv[i], "-a") == 0) { @@ -137,6 +125,35 @@ int main(int ac, char **argv) } } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf(" heatmap of IO latency\n"); if (text_only) printf(" %s", sym[num_colors - 1]); @@ -153,9 +170,14 @@ int main(int ac, char **argv) for (i = 0; ; i++) { if (i % 20 == 0) print_banner(); - print_hist(map_fd[1]); + print_hist(map_fd); sleep(2); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index b1bb9df88f8e..eb0f8fdd14bf 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -15,12 +15,12 @@ struct pair { u64 ip; }; -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(struct pair), - .max_entries = 1000000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, struct pair); + __uint(max_entries, 1000000); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index ec52203fce39..227b05a0bc88 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -8,11 +8,9 @@ #include <stdbool.h> #include <string.h> #include <time.h> -#include <linux/bpf.h> -#include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> struct pair { long long val; @@ -33,11 +31,11 @@ static void print_old_objects(int fd) __u64 key, next_key; struct pair v; - key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ + key = write(1, "\e[1;1H\e[2J", 11); /* clear screen */ key = -1; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &v); + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &v); key = next_key; if (val - v.val < 1000000000ll) /* object was allocated more then 1 sec ago */ @@ -49,26 +47,50 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; + int map_fd, i, j = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; ; i++) { - print_old_objects(map_fd[1]); + print_old_objects(map_fd); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index 481790fde864..64a1f7550d7e 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -15,16 +15,16 @@ #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") progs = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); #ifdef __mips__ - .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */ + __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ #else - .max_entries = 1024, + __uint(max_entries, 1024); #endif -}; +} progs SEC(".maps"); SEC("kprobe/__seccomp_filter") int bpf_prog1(struct pt_regs *ctx) @@ -47,7 +47,7 @@ PROG(SYS__NR_write)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] == 512) { char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), @@ -60,7 +60,7 @@ PROG(SYS__NR_read)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] > 128 && sd.args[2] <= 1024) { char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index c4ab91c89494..9d7d79f0d47d 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -1,13 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> +#include <stdlib.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> #include <sys/prctl.h> #include <bpf/bpf.h> -#include "bpf_load.h" -#include <sys/resource.h> +#include <bpf/libbpf.h> +#include "trace_helpers.h" +#include "bpf_util.h" + +#ifdef __mips__ +#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ +#else +#define MAX_ENTRIES 1024 +#endif /* install fake seccomp program to enable seccomp code path inside the kernel, * so that our kprobe attached to seccomp_phase1() can be triggered @@ -18,7 +25,7 @@ static void install_accept_all_seccomp(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { - .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; if (prctl(PR_SET_SECCOMP, 2, &prog)) @@ -27,16 +34,54 @@ static void install_accept_all_seccomp(void) int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + int key, fd, progs_fd; + const char *section; char filename[256]; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); + if (progs_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + section = bpf_program__section_name(prog); + /* register only syscalls to PROG_ARRAY */ + if (sscanf(section, "kprobe/%d", &key) != 1) + continue; - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + fd = bpf_program__fd(prog); + bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); } install_accept_all_seccomp(); @@ -46,5 +91,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index 96c234efa852..acad5712d8b4 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -3,24 +3,26 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") counters = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values2 = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(struct bpf_perf_event_value), - .max_entries = 64, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 64); +} counters SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, u64); + __uint(max_entries, 64); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct bpf_perf_event_value); + __uint(max_entries, 64); +} values2 SEC(".maps"); SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 4bb3c830adb2..8e83bf2a84a4 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -4,23 +4,24 @@ #include <assert.h> #include <fcntl.h> #include <linux/perf_event.h> -#include <linux/bpf.h> #include <sched.h> #include <stdio.h> #include <stdlib.h> #include <sys/ioctl.h> -#include <sys/resource.h> #include <sys/time.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> -#include "bpf_load.h" #include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" #define SAMPLE_PERIOD 0x7fffffffffffffffULL +/* counters, values, values2 */ +static int map_fd[3]; + static void check_on_cpu(int cpu, struct perf_event_attr *attr) { struct bpf_perf_event_value value2; @@ -173,17 +174,49 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int i = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } test_bpf_perf_event(); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index ea6dae78f0df..8be7ce18d3ba 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -1,28 +1,56 @@ #define _GNU_SOURCE #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> int main(int argc, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; char command[256]; - int ret; + int ret = 0; + FILE *f; + + if (!argv[1]) { + fprintf(stderr, "ERROR: Run with the btrfs device argument!\n"); + return 0; + } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } snprintf(command, 256, "mount %s tmpmnt/", argv[1]); f = popen(command, "r"); ret = pclose(f); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return ret ? 0 : 1; } diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c index 34b64394ed9c..0a5c704badd0 100644 --- a/samples/bpf/xdp1_kern.c +++ b/samples/bpf/xdp1_kern.c @@ -39,11 +39,13 @@ static int parse_ipv6(void *data, u64 nh_off, void *data_end) return ip6h->nexthdr; } -SEC("xdp1") +#define XDPBUFSIZE 64 +SEC("xdp.frags") int xdp_prog1(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; + __u8 pkt[XDPBUFSIZE] = {}; + void *data_end = &pkt[XDPBUFSIZE-1]; + void *data = pkt; struct ethhdr *eth = data; int rc = XDP_DROP; long *value; @@ -51,12 +53,16 @@ int xdp_prog1(struct xdp_md *ctx) u64 nh_off; u32 ipproto; + if (bpf_xdp_load_bytes(ctx, 0, pkt, sizeof(pkt))) + return rc; + nh_off = sizeof(*eth); if (data + nh_off > data_end) return rc; h_proto = eth->h_proto; + /* Handle VLAN tagged packet */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { struct vlan_hdr *vhdr; @@ -66,6 +72,7 @@ int xdp_prog1(struct xdp_md *ctx) return rc; h_proto = vhdr->h_vlan_encapsulated_proto; } + /* Handle double VLAN tagged packet */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { struct vlan_hdr *vhdr; diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index c447ad9e3a1d..ac370e638fa3 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -11,7 +11,6 @@ #include <string.h> #include <unistd.h> #include <libgen.h> -#include <sys/resource.h> #include <net/if.h> #include "bpf_util.h" @@ -26,12 +25,12 @@ static void int_exit(int sig) { __u32 curr_prog_id = 0; - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); exit(1); } if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); else if (!curr_prog_id) printf("couldn't find a prog id on a given interface\n"); else @@ -79,14 +78,11 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); const char *optstr = "FSN"; int prog_fd, map_fd, opt; + struct bpf_program *prog; struct bpf_object *obj; struct bpf_map *map; char filename[256]; @@ -117,11 +113,6 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - ifindex = if_nametoindex(argv[optind]); if (!ifindex) { perror("if_nametoindex"); @@ -129,12 +120,20 @@ int main(int argc, char **argv) } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + err = bpf_object__load(obj); + if (err) return 1; - map = bpf_map__next(NULL, obj); + prog_fd = bpf_program__fd(prog); + + map = bpf_object__next_map(obj, NULL); if (!map) { printf("finding a map in obj file failed\n"); return 1; @@ -149,7 +148,7 @@ int main(int argc, char **argv) signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { printf("link set xdp fd failed\n"); return 1; } @@ -161,7 +160,7 @@ int main(int argc, char **argv) } prog_id = info.id; - poll_stats(map_fd, 2); + poll_stats(map_fd, 1); return 0; } diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c index c787f4b49646..3332ba6bb95f 100644 --- a/samples/bpf/xdp2_kern.c +++ b/samples/bpf/xdp2_kern.c @@ -55,11 +55,13 @@ static int parse_ipv6(void *data, u64 nh_off, void *data_end) return ip6h->nexthdr; } -SEC("xdp1") +#define XDPBUFSIZE 64 +SEC("xdp.frags") int xdp_prog1(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; + __u8 pkt[XDPBUFSIZE] = {}; + void *data_end = &pkt[XDPBUFSIZE-1]; + void *data = pkt; struct ethhdr *eth = data; int rc = XDP_DROP; long *value; @@ -67,12 +69,16 @@ int xdp_prog1(struct xdp_md *ctx) u64 nh_off; u32 ipproto; + if (bpf_xdp_load_bytes(ctx, 0, pkt, sizeof(pkt))) + return rc; + nh_off = sizeof(*eth); if (data + nh_off > data_end) return rc; h_proto = eth->h_proto; + /* Handle VLAN tagged packet */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { struct vlan_hdr *vhdr; @@ -82,6 +88,7 @@ int xdp_prog1(struct xdp_md *ctx) return rc; h_proto = vhdr->h_vlan_encapsulated_proto; } + /* Handle double VLAN tagged packet */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { struct vlan_hdr *vhdr; diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 9b783316e860..d5631014a176 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -6,7 +6,7 @@ * This uses the XDP data_meta infrastructure, and is a cooperation * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. * - * Notice: This example does not use the BPF C-loader (bpf_load.c), + * Notice: This example does not use the BPF C-loader, * but instead rely on the iproute2 TC tool for loading BPF-objects. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index ba482dc3da33..167646077c8f 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -14,7 +14,6 @@ #include <stdlib.h> #include <string.h> #include <net/if.h> -#include <sys/resource.h> #include <arpa/inet.h> #include <netinet/ether.h> #include <unistd.h> @@ -34,12 +33,12 @@ static void int_exit(int sig) __u32 curr_prog_id = 0; if (ifindex > -1) { - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); exit(1); } if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); else if (!curr_prog_id) printf("couldn't find a prog id on a given iface\n"); else @@ -82,16 +81,13 @@ static void usage(const char *cmd) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; unsigned char opt_flags[256] = {}; const char *optstr = "i:T:P:SNFh"; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); unsigned int kill_after_s = 0; int i, prog_fd, map_fd, opt; + struct bpf_program *prog; struct bpf_object *obj; __u32 max_pckt_size = 0; __u32 key = 0; @@ -143,22 +139,26 @@ int main(int argc, char **argv) } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; - } - if (!ifindex) { fprintf(stderr, "Invalid ifname\n"); return 1; } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); + /* static global var 'max_pcktsz' is accessible from .data section */ if (max_pckt_size) { map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data"); @@ -179,7 +179,7 @@ int main(int argc, char **argv) signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { printf("link set xdp fd failed\n"); return 1; } diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index 74a4583d0d86..84f57f1209ce 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -33,7 +33,7 @@ static int do_attach(int idx, int prog_fd, int map_fd, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags); + err = bpf_xdp_attach(idx, prog_fd, xdp_flags, NULL); if (err < 0) { printf("ERROR: failed to attach program to %s\n", name); return err; @@ -47,17 +47,60 @@ static int do_attach(int idx, int prog_fd, int map_fd, const char *name) return err; } -static int do_detach(int idx, const char *name) +static int do_detach(int ifindex, const char *ifname, const char *app_name) { - int err; + LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + struct bpf_prog_info prog_info = {}; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 info_len, curr_prog_id; + int prog_fd; + int err = 1; + + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("ERROR: bpf_xdp_query_id failed (%s)\n", + strerror(errno)); + return err; + } - err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); - if (err < 0) - printf("ERROR: failed to detach program from %s\n", name); + if (!curr_prog_id) { + printf("ERROR: flags(0x%x) xdp prog is not attached to %s\n", + xdp_flags, ifname); + return err; + } + + info_len = sizeof(prog_info); + prog_fd = bpf_prog_get_fd_by_id(curr_prog_id); + if (prog_fd < 0) { + printf("ERROR: bpf_prog_get_fd_by_id failed (%s)\n", + strerror(errno)); + return prog_fd; + } + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (err) { + printf("ERROR: bpf_obj_get_info_by_fd failed (%s)\n", + strerror(errno)); + goto close_out; + } + snprintf(prog_name, sizeof(prog_name), "%s_prog", app_name); + prog_name[BPF_OBJ_NAME_LEN - 1] = '\0'; + + if (strcmp(prog_info.name, prog_name)) { + printf("ERROR: %s isn't attached to %s\n", app_name, ifname); + err = 1; + goto close_out; + } + + opts.old_prog_fd = prog_fd; + err = bpf_xdp_detach(ifindex, xdp_flags, &opts); + if (err < 0) + printf("ERROR: failed to detach program from %s (%s)\n", + ifname, strerror(errno)); /* TODO: Remember to cleanup map, when adding use of shared map * bpf_map_delete_elem((map_fd, &idx); */ +close_out: + close(prog_fd); return err; } @@ -67,18 +110,19 @@ static void usage(const char *prog) "usage: %s [OPTS] interface-list\n" "\nOPTS:\n" " -d detach program\n" + " -S use skb-mode\n" + " -F force loading prog\n" " -D direct table lookups (skip fib rules)\n", prog); } int main(int argc, char **argv) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; const char *prog_name = "xdp_fwd"; - struct bpf_program *prog; - int prog_fd, map_fd = -1; + struct bpf_program *prog = NULL; + struct bpf_program *pos; + const char *sec_name; + int prog_fd = -1, map_fd = -1; char filename[PATH_MAX]; struct bpf_object *obj; int opt, i, idx, err; @@ -115,7 +159,6 @@ int main(int argc, char **argv) if (attach) { snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; if (access(filename, O_RDONLY) < 0) { printf("error accessing file %s: %s\n", @@ -123,7 +166,14 @@ int main(int argc, char **argv) return 1; } - err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); if (err) { printf("Does kernel support devmap lookup?\n"); /* If not, the error message will be: @@ -132,7 +182,13 @@ int main(int argc, char **argv) return 1; } - prog = bpf_object__find_program_by_title(obj, prog_name); + bpf_object__for_each_program(pos, obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, prog_name)) { + prog = pos; + break; + } + } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { printf("program not found: %s\n", strerror(prog_fd)); @@ -156,7 +212,7 @@ int main(int argc, char **argv) return 1; } if (!attach) { - err = do_detach(idx, argv[i]); + err = do_detach(idx, argv[i], prog_name); if (err) ret = err; } else { diff --git a/samples/bpf/xdp_monitor.bpf.c b/samples/bpf/xdp_monitor.bpf.c new file mode 100644 index 000000000000..cfb41e2205f4 --- /dev/null +++ b/samples/bpf/xdp_monitor.bpf.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * XDP monitor tool, based on tracepoints + */ +#include "xdp_sample.bpf.h" + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c deleted file mode 100644 index 3d33cca2d48a..000000000000 --- a/samples/bpf/xdp_monitor_kern.c +++ /dev/null @@ -1,257 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. - * - * XDP monitor tool, based on tracepoints - */ -#include <uapi/linux/bpf.h> -#include <bpf/bpf_helpers.h> - -struct bpf_map_def SEC("maps") redirect_err_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 2, - /* TODO: have entries for all possible errno's */ -}; - -#define XDP_UNKNOWN XDP_REDIRECT + 1 -struct bpf_map_def SEC("maps") exception_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = XDP_UNKNOWN + 1, -}; - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_redirect_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 - -enum { - XDP_REDIRECT_SUCCESS = 0, - XDP_REDIRECT_ERROR = 1 -}; - -static __always_inline -int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) -{ - u32 key = XDP_REDIRECT_ERROR; - int err = ctx->err; - u64 *cnt; - - if (!err) - key = XDP_REDIRECT_SUCCESS; - - cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); - if (!cnt) - return 1; - *cnt += 1; - - return 0; /* Indicate event was filtered (no further processing)*/ - /* - * Returning 1 here would allow e.g. a perf-record tracepoint - * to see and record these events, but it doesn't work well - * in-practice as stopping perf-record also unload this - * bpf_prog. Plus, there is additional overhead of doing so. - */ -} - -SEC("tracepoint/xdp/xdp_redirect_err") -int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - - -SEC("tracepoint/xdp/xdp_redirect_map_err") -int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Likely unloaded when prog starts */ -SEC("tracepoint/xdp/xdp_redirect") -int trace_xdp_redirect(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Likely unloaded when prog starts */ -SEC("tracepoint/xdp/xdp_redirect_map") -int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_exception_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int ifindex; // offset:16; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_exception") -int trace_xdp_exception(struct xdp_exception_ctx *ctx) -{ - u64 *cnt; - u32 key; - - key = ctx->act; - if (key > XDP_REDIRECT) - key = XDP_UNKNOWN; - - cnt = bpf_map_lookup_elem(&exception_cnt, &key); - if (!cnt) - return 1; - *cnt += 1; - - return 0; -} - -/* Common stats data record shared with _user.c */ -struct datarec { - u64 processed; - u64 dropped; - u64 info; - u64 err; -}; -#define MAX_CPUS 64 - -struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = MAX_CPUS, -}; - -struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_enqueue_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int to_cpu; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_enqueue") -int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) -{ - u32 to_cpu = ctx->to_cpu; - struct datarec *rec; - - if (to_cpu >= MAX_CPUS) - return 1; - - rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - if (ctx->processed > 0) - rec->info += 1; - - return 0; -} - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_kthread_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int sched; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_kthread") -int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Count times kthread yielded CPU via schedule call */ - if (ctx->sched) - rec->info++; - - return 0; -} - -struct bpf_map_def SEC("maps") devmap_xmit_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct devmap_xmit_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int from_ifindex; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int to_ifindex; // offset:16; size:4; signed:1; - int drops; // offset:20; size:4; signed:1; - int sent; // offset:24; size:4; signed:1; - int err; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_devmap_xmit") -int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->sent; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - rec->info += 1; - - /* Record error cases, where no frame were sent */ - if (ctx->err) - rec->err++; - - /* Catch API error of drv ndo_xdp_xmit sent more than count */ - if (ctx->drops < 0) - rec->err++; - - return 1; -} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index dd558cbb2309..58015eb2ffae 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,15 +1,12 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. - */ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__= - "XDP monitor tool, based on tracepoints\n" -; +"XDP monitor tool, based on tracepoints\n"; static const char *__doc_err_only__= - " NOTICE: Only tracking XDP redirect errors\n" - " Enable TX success stats via '--stats'\n" - " (which comes with a per packet processing overhead)\n" -; +" NOTICE: Only tracking XDP redirect errors\n" +" Enable redirect success stats via '-s/--stats'\n" +" (which comes with a per packet processing overhead)\n"; #include <errno.h> #include <stdio.h> @@ -20,696 +17,102 @@ static const char *__doc_err_only__= #include <ctype.h> #include <unistd.h> #include <locale.h> - -#include <sys/resource.h> #include <getopt.h> #include <net/if.h> #include <time.h> - +#include <signal.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_monitor.skel.h" -static int verbose = 1; -static bool debug = false; - -static const struct option long_options[] = { - {"help", no_argument, NULL, 'h' }, - {"debug", no_argument, NULL, 'D' }, - {"stats", no_argument, NULL, 'S' }, - {"sec", required_argument, NULL, 's' }, - {0, 0, NULL, 0 } -}; - -/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ -#define EXIT_FAIL_MEM 5 - -static void usage(char *argv[]) -{ - int i; - printf("\nDOCUMENTATION:\n%s\n", __doc__); - printf("\n"); - printf(" Usage: %s (options-see-below)\n", - argv[0]); - printf(" Listing options:\n"); - for (i = 0; long_options[i].name != 0; i++) { - printf(" --%-15s", long_options[i].name); - if (long_options[i].flag != NULL) - printf(" flag (internal value:%d)", - *long_options[i].flag); - else - printf("short-option: -%c", - long_options[i].val); - printf("\n"); - } - printf("\n"); -} - -#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -static __u64 gettime(void) -{ - struct timespec t; - int res; - - res = clock_gettime(CLOCK_MONOTONIC, &t); - if (res < 0) { - fprintf(stderr, "Error with gettimeofday! (%i)\n", res); - exit(EXIT_FAILURE); - } - return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; -} - -enum { - REDIR_SUCCESS = 0, - REDIR_ERROR = 1, -}; -#define REDIR_RES_MAX 2 -static const char *redir_names[REDIR_RES_MAX] = { - [REDIR_SUCCESS] = "Success", - [REDIR_ERROR] = "Error", -}; -static const char *err2str(int err) -{ - if (err < REDIR_RES_MAX) - return redir_names[err]; - return NULL; -} -/* enum xdp_action */ -#define XDP_UNKNOWN XDP_REDIRECT + 1 -#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) -static const char *xdp_action_names[XDP_ACTION_MAX] = { - [XDP_ABORTED] = "XDP_ABORTED", - [XDP_DROP] = "XDP_DROP", - [XDP_PASS] = "XDP_PASS", - [XDP_TX] = "XDP_TX", - [XDP_REDIRECT] = "XDP_REDIRECT", - [XDP_UNKNOWN] = "XDP_UNKNOWN", -}; -static const char *action2str(int action) -{ - if (action < XDP_ACTION_MAX) - return xdp_action_names[action]; - return NULL; -} - -/* Common stats data record shared with _kern.c */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 info; - __u64 err; -}; -#define MAX_CPUS 64 +static int mask = SAMPLE_REDIRECT_ERR_CNT | SAMPLE_CPUMAP_ENQUEUE_CNT | + SAMPLE_CPUMAP_KTHREAD_CNT | SAMPLE_EXCEPTION_CNT | + SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -/* Userspace structs for collection of stats from maps */ -struct record { - __u64 timestamp; - struct datarec total; - struct datarec *cpu; -}; -struct u64rec { - __u64 processed; -}; -struct record_u64 { - /* record for _kern side __u64 values */ - __u64 timestamp; - struct u64rec total; - struct u64rec *cpu; -}; +DEFINE_SAMPLE_INIT(xdp_monitor); -struct stats_record { - struct record_u64 xdp_redirect[REDIR_RES_MAX]; - struct record_u64 xdp_exception[XDP_ACTION_MAX]; - struct record xdp_cpumap_kthread; - struct record xdp_cpumap_enqueue[MAX_CPUS]; - struct record xdp_devmap_xmit; +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} }; -static bool map_collect_record(int fd, __u32 key, struct record *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec values[nr_cpus]; - __u64 sum_processed = 0; - __u64 sum_dropped = 0; - __u64 sum_info = 0; - __u64 sum_err = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_processed += values[i].processed; - rec->cpu[i].dropped = values[i].dropped; - sum_dropped += values[i].dropped; - rec->cpu[i].info = values[i].info; - sum_info += values[i].info; - rec->cpu[i].err = values[i].err; - sum_err += values[i].err; - } - rec->total.processed = sum_processed; - rec->total.dropped = sum_dropped; - rec->total.info = sum_info; - rec->total.err = sum_err; - return true; -} - -static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct u64rec values[nr_cpus]; - __u64 sum_total = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_total += values[i].processed; - } - rec->total.processed = sum_total; - return true; -} - -static double calc_period(struct record *r, struct record *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static double calc_pps(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->processed - p->processed; - pps = packets / period; - } - return pps; -} - -static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->processed - p->processed; - pps = packets / period; - } - return pps; -} - -static double calc_drop(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->dropped - p->dropped; - pps = packets / period; - } - return pps; -} - -static double calc_info(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->info - p->info; - pps = packets / period; - } - return pps; -} - -static double calc_err(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->err - p->err; - pps = packets / period; - } - return pps; -} - -static void stats_print(struct stats_record *stats_rec, - struct stats_record *stats_prev, - bool err_only) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - int rec_i = 0, i, to_cpu; - double t = 0, pps = 0; - - /* Header */ - printf("%-15s %-7s %-12s %-12s %-9s\n", - "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); - - /* tracepoint: xdp:xdp_redirect_* */ - if (err_only) - rec_i = REDIR_ERROR; - - for (; rec_i < REDIR_RES_MAX; rec_i++) { - struct record_u64 *rec, *prev; - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; - - rec = &stats_rec->xdp_redirect[rec_i]; - prev = &stats_prev->xdp_redirect[rec_i]; - t = calc_period_u64(rec, prev); - - for (i = 0; i < nr_cpus; i++) { - struct u64rec *r = &rec->cpu[i]; - struct u64rec *p = &prev->cpu[i]; - - pps = calc_pps_u64(r, p, t); - if (pps > 0) - printf(fmt1, "XDP_REDIRECT", i, - rec_i ? 0.0: pps, rec_i ? pps : 0.0, - err2str(rec_i)); - } - pps = calc_pps_u64(&rec->total, &prev->total, t); - printf(fmt2, "XDP_REDIRECT", "total", - rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); - } - - /* tracepoint: xdp:xdp_exception */ - for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { - struct record_u64 *rec, *prev; - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; - - rec = &stats_rec->xdp_exception[rec_i]; - prev = &stats_prev->xdp_exception[rec_i]; - t = calc_period_u64(rec, prev); - - for (i = 0; i < nr_cpus; i++) { - struct u64rec *r = &rec->cpu[i]; - struct u64rec *p = &prev->cpu[i]; - - pps = calc_pps_u64(r, p, t); - if (pps > 0) - printf(fmt1, "Exception", i, - 0.0, pps, action2str(rec_i)); - } - pps = calc_pps_u64(&rec->total, &prev->total, t); - if (pps > 0) - printf(fmt2, "Exception", "total", - 0.0, pps, action2str(rec_i)); - } - - /* cpumap enqueue stats */ - for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { - char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; - char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; - struct record *rec, *prev; - char *info_str = ""; - double drop, info; - - rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; - prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - if (info > 0) { - info_str = "bulk-average"; - info = pps / info; /* calc average bulk size */ - } - if (pps > 0) - printf(fmt1, "cpumap-enqueue", - i, to_cpu, pps, drop, info, info_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - if (pps > 0) { - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - if (info > 0) { - info_str = "bulk-average"; - info = pps / info; /* calc average bulk size */ - } - printf(fmt2, "cpumap-enqueue", - "sum", to_cpu, pps, drop, info, info_str); - } - } - - /* cpumap kthread stats */ - { - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; - struct record *rec, *prev; - double drop, info; - char *i_str = ""; - - rec = &stats_rec->xdp_cpumap_kthread; - prev = &stats_prev->xdp_cpumap_kthread; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - if (info > 0) - i_str = "sched"; - if (pps > 0 || drop > 0) - printf(fmt1, "cpumap-kthread", - i, pps, drop, info, i_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - if (info > 0) - i_str = "sched-sum"; - printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); - } - - /* devmap ndo_xdp_xmit stats */ - { - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; - struct record *rec, *prev; - double drop, info, err; - char *i_str = ""; - char *err_str = ""; - - rec = &stats_rec->xdp_devmap_xmit; - prev = &stats_prev->xdp_devmap_xmit; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - err = calc_err(r, p, t); - if (info > 0) { - i_str = "bulk-average"; - info = (pps+drop) / info; /* calc avg bulk */ - } - if (err > 0) - err_str = "drv-err"; - if (pps > 0 || drop > 0) - printf(fmt1, "devmap-xmit", - i, pps, drop, info, i_str, err_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - err = calc_err(&rec->total, &prev->total, t); - if (info > 0) { - i_str = "bulk-average"; - info = (pps+drop) / info; /* calc avg bulk */ - } - if (err > 0) - err_str = "drv-err"; - printf(fmt2, "devmap-xmit", "total", pps, drop, - info, i_str, err_str); - } - - printf("\n"); -} - -static bool stats_collect(struct stats_record *rec) -{ - int fd; - int i; - - /* TODO: Detect if someone unloaded the perf event_fd's, as - * this can happen by someone running perf-record -e - */ - - fd = map_data[0].fd; /* map0: redirect_err_cnt */ - for (i = 0; i < REDIR_RES_MAX; i++) - map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); - - fd = map_data[1].fd; /* map1: exception_cnt */ - for (i = 0; i < XDP_ACTION_MAX; i++) { - map_collect_record_u64(fd, i, &rec->xdp_exception[i]); - } - - fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ - for (i = 0; i < MAX_CPUS; i++) - map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); - - fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ - map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); - - fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ - map_collect_record(fd, 0, &rec->xdp_devmap_xmit); - - return true; -} - -static void *alloc_rec_per_cpu(int record_size) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - void *array; - size_t size; - - size = record_size * nr_cpus; - array = malloc(size); - memset(array, 0, size); - if (!array) { - fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); - exit(EXIT_FAIL_MEM); - } - return array; -} - -static struct stats_record *alloc_stats_record(void) -{ - struct stats_record *rec; - int rec_sz; - int i; - - /* Alloc main stats_record structure */ - rec = malloc(sizeof(*rec)); - memset(rec, 0, sizeof(*rec)); - if (!rec) { - fprintf(stderr, "Mem alloc error\n"); - exit(EXIT_FAIL_MEM); - } - - /* Alloc stats stored per CPU for each record */ - rec_sz = sizeof(struct u64rec); - for (i = 0; i < REDIR_RES_MAX; i++) - rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); - - for (i = 0; i < XDP_ACTION_MAX; i++) - rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); - - rec_sz = sizeof(struct datarec); - rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); - rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); - - for (i = 0; i < MAX_CPUS; i++) - rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); - - return rec; -} - -static void free_stats_record(struct stats_record *r) -{ - int i; - - for (i = 0; i < REDIR_RES_MAX; i++) - free(r->xdp_redirect[i].cpu); - - for (i = 0; i < XDP_ACTION_MAX; i++) - free(r->xdp_exception[i].cpu); - - free(r->xdp_cpumap_kthread.cpu); - free(r->xdp_devmap_xmit.cpu); - - for (i = 0; i < MAX_CPUS; i++) - free(r->xdp_cpumap_enqueue[i].cpu); - - free(r); -} - -/* Pointer swap trick */ -static inline void swap(struct stats_record **a, struct stats_record **b) -{ - struct stats_record *tmp; - - tmp = *a; - *a = *b; - *b = tmp; -} - -static void stats_poll(int interval, bool err_only) -{ - struct stats_record *rec, *prev; - - rec = alloc_stats_record(); - prev = alloc_stats_record(); - stats_collect(rec); - - if (err_only) - printf("\n%s\n", __doc_err_only__); - - /* Trick to pretty printf with thousands separators use %' */ - setlocale(LC_NUMERIC, "en_US"); - - /* Header */ - if (verbose) - printf("\n%s", __doc__); - - /* TODO Need more advanced stats on error types */ - if (verbose) { - printf(" - Stats map0: %s\n", map_data[0].name); - printf(" - Stats map1: %s\n", map_data[1].name); - printf("\n"); - } - fflush(stdout); - - while (1) { - swap(&prev, &rec); - stats_collect(rec); - stats_print(rec, prev, err_only); - fflush(stdout); - sleep(interval); - } - - free_stats_record(rec); - free_stats_record(prev); -} - -static void print_bpf_prog_info(void) -{ - int i; - - /* Prog info */ - printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); - } - - /* Maps info */ - printf("Loaded BPF prog have %d map(s)\n", map_data_count); - for (i = 0; i < map_data_count; i++) { - char *name = map_data[i].name; - int fd = map_data[i].fd; - - printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); - } - - /* Event info */ - printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - if (event_fd[i] != -1) - printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); - } -} - int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - int longindex = 0, opt; - int ret = EXIT_SUCCESS; - char bpf_obj_file[256]; - - /* Default settings: */ + unsigned long interval = 2; + int ret = EXIT_FAIL_OPTION; + struct xdp_monitor *skel; bool errors_only = true; - int interval = 2; - - snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); + int longindex = 0, opt; + bool error = true; /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "hDSs:", + while ((opt = getopt_long(argc, argv, "si:vh", long_options, &longindex)) != -1) { switch (opt) { - case 'D': - debug = true; - break; - case 'S': + case 's': errors_only = false; + mask |= SAMPLE_REDIRECT_CNT; break; - case 's': - interval = atoi(optarg); + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); break; case 'h': + error = false; default: - usage(argv); - return EXIT_FAILURE; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return EXIT_FAILURE; + skel = xdp_monitor__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_monitor__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - if (load_bpf_file(bpf_obj_file)) { - printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return EXIT_FAILURE; - } - if (!prog_fd[0]) { - printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return EXIT_FAILURE; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (debug) { - print_bpf_prog_info(); + ret = xdp_monitor__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_monitor__load: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - /* Unload/stop tracepoint event by closing fd's */ - if (errors_only) { - /* The prog_fd[i] and event_fd[i] depend on the - * order the functions was defined in _kern.c - */ - close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ - close(prog_fd[2]); /* func: trace_xdp_redirect */ - close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ - close(prog_fd[3]); /* func: trace_xdp_redirect_map */ + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - stats_poll(interval, errors_only); + if (errors_only) + printf("%s", __doc_err_only__); - return ret; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + ret = EXIT_OK; +end_destroy: + xdp_monitor__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_redirect.bpf.c b/samples/bpf/xdp_redirect.bpf.c new file mode 100644 index 000000000000..7c02bacfe96b --- /dev/null +++ b/samples/bpf/xdp_redirect.bpf.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +const volatile int ifindex_out; + +SEC("xdp") +int xdp_redirect_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + swap_src_dst_mac(data); + return bpf_redirect(ifindex_out, 0); +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu.bpf.c index 313a8fe6d125..87c54bfdbb70 100644 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ b/samples/bpf/xdp_redirect_cpu.bpf.c @@ -2,71 +2,18 @@ * * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ -#include <uapi/linux/if_ether.h> -#include <uapi/linux/if_packet.h> -#include <uapi/linux/if_vlan.h> -#include <uapi/linux/ip.h> -#include <uapi/linux/ipv6.h> -#include <uapi/linux/in.h> -#include <uapi/linux/tcp.h> -#include <uapi/linux/udp.h> - -#include <uapi/linux/bpf.h> -#include <bpf/bpf_helpers.h> +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" #include "hash_func01.h" -#define MAX_CPUS 64 /* WARNING - sync with _user.c */ - /* Special map type that can XDP_REDIRECT frames to another CPU */ struct { __uint(type, BPF_MAP_TYPE_CPUMAP); __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); - __uint(max_entries, MAX_CPUS); + __uint(value_size, sizeof(struct bpf_cpumap_val)); } cpu_map SEC(".maps"); -/* Common stats data record to keep userspace more simple */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 issue; -}; - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. - */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} rx_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 2); - /* TODO: have entries for all possible errno's */ -} redirect_err_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, MAX_CPUS); -} cpumap_enqueue_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} cpumap_kthread_cnt SEC(".maps"); - /* Set of maps controlling available CPU, and for iterating through * selectable redirect CPUs. */ @@ -74,14 +21,15 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); __type(value, u32); - __uint(max_entries, MAX_CPUS); } cpus_available SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); __type(value, u32); __uint(max_entries, 1); } cpus_count SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, u32); @@ -89,24 +37,16 @@ struct { __uint(max_entries, 1); } cpus_iterator SEC(".maps"); -/* Used by trace point */ struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); __uint(max_entries, 1); -} exception_cnt SEC(".maps"); +} tx_port SEC(".maps"); -/* Helper parse functions */ +char tx_mac_addr[ETH_ALEN]; -/* Parse Ethernet layer 2, extract network layer 3 offset and protocol - * - * Returns false on error and non-supported ether-type - */ -struct vlan_hdr { - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; -}; +/* Helper parse functions */ static __always_inline bool parse_eth(struct ethhdr *eth, void *data_end, @@ -122,11 +62,12 @@ bool parse_eth(struct ethhdr *eth, void *data_end, eth_type = eth->h_proto; /* Skip non 802.3 Ethertypes */ - if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) + if (__builtin_expect(bpf_ntohs(eth_type) < ETH_P_802_3_MIN, 0)) return false; /* Handle VLAN tagged packet */ - if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + if (eth_type == bpf_htons(ETH_P_8021Q) || + eth_type == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vlan_hdr; vlan_hdr = (void *)eth + offset; @@ -136,7 +77,8 @@ bool parse_eth(struct ethhdr *eth, void *data_end, eth_type = vlan_hdr->h_vlan_encapsulated_proto; } /* Handle double VLAN tagged packet */ - if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + if (eth_type == bpf_htons(ETH_P_8021Q) || + eth_type == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vlan_hdr; vlan_hdr = (void *)eth + offset; @@ -146,7 +88,7 @@ bool parse_eth(struct ethhdr *eth, void *data_end, eth_type = vlan_hdr->h_vlan_encapsulated_proto; } - *eth_proto = ntohs(eth_type); + *eth_proto = bpf_ntohs(eth_type); *l3_offset = offset; return true; } @@ -158,7 +100,6 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; struct udphdr *udph; - u16 dport; if (iph + 1 > data_end) return 0; @@ -169,8 +110,7 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) if (udph + 1 > data_end) return 0; - dport = ntohs(udph->dest); - return dport; + return bpf_ntohs(udph->dest); } static __always_inline @@ -197,50 +137,48 @@ int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) return ip6h->nexthdr; } -SEC("xdp_cpu_map0") +SEC("xdp") int xdp_prognum0_no_touch(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); struct datarec *rec; u32 *cpu_selected; - u32 cpu_dest; - u32 key = 0; + u32 cpu_dest = 0; + u32 key0 = 0; /* Only use first entry in cpus_available */ - cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); if (!cpu_selected) return XDP_ABORTED; cpu_dest = *cpu_selected; - /* Count RX packet in map */ rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } -SEC("xdp_cpu_map1_touch_data") +SEC("xdp") int xdp_prognum1_touch_data(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; struct datarec *rec; u32 *cpu_selected; - u32 cpu_dest; + u32 cpu_dest = 0; + u32 key0 = 0; u16 eth_type; - u32 key = 0; /* Only use first entry in cpus_available */ - cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); if (!cpu_selected) return XDP_ABORTED; cpu_dest = *cpu_selected; @@ -249,36 +187,33 @@ int xdp_prognum1_touch_data(struct xdp_md *ctx) if (eth + 1 > data_end) return XDP_ABORTED; - /* Count RX packet in map */ rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); /* Read packet data, and use it (drop non 802.3 Ethertypes) */ eth_type = eth->h_proto; - if (ntohs(eth_type) < ETH_P_802_3_MIN) { - rec->dropped++; + if (bpf_ntohs(eth_type) < ETH_P_802_3_MIN) { + NO_TEAR_INC(rec->dropped); return XDP_DROP; } - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } -SEC("xdp_cpu_map2_round_robin") +SEC("xdp") int xdp_prognum2_round_robin(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; + u32 key = bpf_get_smp_processor_id(); struct datarec *rec; - u32 cpu_dest; - u32 *cpu_lookup; + u32 cpu_dest = 0; u32 key0 = 0; u32 *cpu_selected; @@ -304,40 +239,37 @@ int xdp_prognum2_round_robin(struct xdp_md *ctx) return XDP_ABORTED; cpu_dest = *cpu_selected; - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key0); + rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } -SEC("xdp_cpu_map3_proto_separate") +SEC("xdp") int xdp_prognum3_proto_separate(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; u8 ip_proto = IPPROTO_UDP; struct datarec *rec; u16 eth_proto = 0; u64 l3_offset = 0; u32 cpu_dest = 0; - u32 cpu_idx = 0; u32 *cpu_lookup; - u32 key = 0; + u32 cpu_idx = 0; - /* Count RX packet in map */ rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ @@ -378,35 +310,33 @@ int xdp_prognum3_proto_separate(struct xdp_md *ctx) return XDP_ABORTED; cpu_dest = *cpu_lookup; - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } -SEC("xdp_cpu_map4_ddos_filter_pktgen") +SEC("xdp") int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; u8 ip_proto = IPPROTO_UDP; struct datarec *rec; u16 eth_proto = 0; u64 l3_offset = 0; u32 cpu_dest = 0; + u32 *cpu_lookup; u32 cpu_idx = 0; u16 dest_port; - u32 *cpu_lookup; - u32 key = 0; - /* Count RX packet in map */ rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ @@ -440,8 +370,7 @@ int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) /* DDoS filter UDP port 9 (pktgen) */ dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); if (dest_port == 9) { - if (rec) - rec->dropped++; + NO_TEAR_INC(rec->dropped); return XDP_DROP; } break; @@ -454,11 +383,10 @@ int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) return XDP_ABORTED; cpu_dest = *cpu_lookup; - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } @@ -493,10 +421,10 @@ u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) if (ip6h + 1 > data_end) return 0; - cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; - cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; - cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; - cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; + cpu_hash = ip6h->saddr.in6_u.u6_addr32[0] + ip6h->daddr.in6_u.u6_addr32[0]; + cpu_hash += ip6h->saddr.in6_u.u6_addr32[1] + ip6h->daddr.in6_u.u6_addr32[1]; + cpu_hash += ip6h->saddr.in6_u.u6_addr32[2] + ip6h->daddr.in6_u.u6_addr32[2]; + cpu_hash += ip6h->saddr.in6_u.u6_addr32[3] + ip6h->daddr.in6_u.u6_addr32[3]; cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); return cpu_hash; @@ -506,30 +434,29 @@ u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) * hashing scheme is symmetric, meaning swapping IP src/dest still hit * same CPU. */ -SEC("xdp_cpu_map5_lb_hash_ip_pairs") +SEC("xdp") int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; - u8 ip_proto = IPPROTO_UDP; struct datarec *rec; u16 eth_proto = 0; u64 l3_offset = 0; u32 cpu_dest = 0; u32 cpu_idx = 0; u32 *cpu_lookup; + u32 key0 = 0; u32 *cpu_max; u32 cpu_hash; - u32 key = 0; - /* Count RX packet in map */ rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) - return XDP_ABORTED; - rec->processed++; + return XDP_PASS; + NO_TEAR_INC(rec->processed); - cpu_max = bpf_map_lookup_elem(&cpus_count, &key); + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); if (!cpu_max) return XDP_ABORTED; @@ -557,165 +484,56 @@ int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) return XDP_ABORTED; cpu_dest = *cpu_lookup; - if (cpu_dest >= MAX_CPUS) { - rec->issue++; + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); return XDP_ABORTED; } - return bpf_redirect_map(&cpu_map, cpu_dest, 0); } -char _license[] SEC("license") = "GPL"; - -/*** Trace point code ***/ - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_redirect_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 - -enum { - XDP_REDIRECT_SUCCESS = 0, - XDP_REDIRECT_ERROR = 1 -}; - -static __always_inline -int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +SEC("xdp/cpumap") +int xdp_redirect_cpu_devmap(struct xdp_md *ctx) { - u32 key = XDP_REDIRECT_ERROR; - struct datarec *rec; - int err = ctx->err; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off; - if (!err) - key = XDP_REDIRECT_SUCCESS; + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; - rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); - if (!rec) - return 0; - rec->dropped += 1; - - return 0; /* Indicate event was filtered (no further processing)*/ - /* - * Returning 1 here would allow e.g. a perf-record tracepoint - * to see and record these events, but it doesn't work well - * in-practice as stopping perf-record also unload this - * bpf_prog. Plus, there is additional overhead of doing so. - */ + swap_src_dst_mac(data); + return bpf_redirect_map(&tx_port, 0, 0); } -SEC("tracepoint/xdp/xdp_redirect_err") -int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +SEC("xdp/cpumap") +int xdp_redirect_cpu_pass(struct xdp_md *ctx) { - return xdp_redirect_collect_stat(ctx); + return XDP_PASS; } -SEC("tracepoint/xdp/xdp_redirect_map_err") -int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +SEC("xdp/cpumap") +int xdp_redirect_cpu_drop(struct xdp_md *ctx) { - return xdp_redirect_collect_stat(ctx); + return XDP_DROP; } -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_exception_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int ifindex; // offset:16; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_exception") -int trace_xdp_exception(struct xdp_exception_ctx *ctx) +SEC("xdp/devmap") +int xdp_redirect_egress_prog(struct xdp_md *ctx) { - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&exception_cnt, &key); - if (!rec) - return 1; - rec->dropped += 1; - - return 0; -} + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off; -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_enqueue_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int to_cpu; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_enqueue") -int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) -{ - u32 to_cpu = ctx->to_cpu; - struct datarec *rec; + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; - if (to_cpu >= MAX_CPUS) - return 1; + __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN); - rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - if (ctx->processed > 0) - rec->issue += 1; - - /* Inception: It's possible to detect overload situations, via - * this tracepoint. This can be used for creating a feedback - * loop to XDP, which can take appropriate actions to mitigate - * this overload situation. - */ - return 0; + return XDP_PASS; } -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_kthread_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int sched; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_kthread") -int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Count times kthread yielded CPU via schedule call */ - if (ctx->sched) - rec->issue++; - - return 0; -} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 15bdf047a222..a12381c37d2b 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -2,7 +2,16 @@ /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__ = - " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; +"XDP CPU redirect tool, using BPF_MAP_TYPE_CPUMAP\n" +"Usage: xdp_redirect_cpu -d <IFINDEX|IFNAME> -c 0 ... -c N\n" +"Valid specification for CPUMAP BPF program:\n" +" --mprog-name/-e pass (use built-in XDP_PASS program)\n" +" --mprog-name/-e drop (use built-in XDP_DROP program)\n" +" --redirect-device/-r <ifindex|ifname> (use built-in DEVMAP redirect program)\n" +" Custom CPUMAP BPF program:\n" +" --mprog-filename/-f <filename> --mprog-name/-e <program>\n" +" Optionally, also pass --redirect-map/-m and --redirect-device/-r together\n" +" to configure DEVMAP in BPF object <filename>\n"; #include <errno.h> #include <signal.h> @@ -12,491 +21,70 @@ static const char *__doc__ = #include <string.h> #include <unistd.h> #include <locale.h> -#include <sys/resource.h> +#include <sys/sysinfo.h> #include <getopt.h> #include <net/if.h> #include <time.h> #include <linux/limits.h> - -#define __must_check -#include <linux/err.h> - #include <arpa/inet.h> #include <linux/if_link.h> - -#define MAX_CPUS 64 /* WARNING - sync with _kern.c */ - -/* How many xdp_progs are defined in _kern.c */ -#define MAX_PROG 6 - #include <bpf/bpf.h> #include <bpf/libbpf.h> - #include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_cpu.skel.h" -static int ifindex = -1; -static char ifname_buf[IF_NAMESIZE]; -static char *ifname; -static __u32 prog_id; - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int cpu_map_fd; -static int rx_cnt_map_fd; -static int redirect_err_cnt_map_fd; -static int cpumap_enqueue_cnt_map_fd; -static int cpumap_kthread_cnt_map_fd; -static int cpus_available_map_fd; -static int cpus_count_map_fd; -static int cpus_iterator_map_fd; -static int exception_cnt_map_fd; - -#define NUM_TP 5 -struct bpf_link *tp_links[NUM_TP] = { 0 }; -static int tp_cnt = 0; - -/* Exit return codes */ -#define EXIT_OK 0 -#define EXIT_FAIL 1 -#define EXIT_FAIL_OPTION 2 -#define EXIT_FAIL_XDP 3 -#define EXIT_FAIL_BPF 4 -#define EXIT_FAIL_MEM 5 +static int map_fd; +static int avail_fd; +static int count_fd; -static const struct option long_options[] = { - {"help", no_argument, NULL, 'h' }, - {"dev", required_argument, NULL, 'd' }, - {"skb-mode", no_argument, NULL, 'S' }, - {"sec", required_argument, NULL, 's' }, - {"progname", required_argument, NULL, 'p' }, - {"qsize", required_argument, NULL, 'q' }, - {"cpu", required_argument, NULL, 'c' }, - {"stress-mode", no_argument, NULL, 'x' }, - {"no-separators", no_argument, NULL, 'z' }, - {"force", no_argument, NULL, 'F' }, - {0, 0, NULL, 0 } -}; - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_CPUMAP_ENQUEUE_CNT | SAMPLE_CPUMAP_KTHREAD_CNT | + SAMPLE_EXCEPTION_CNT; - if (ifindex > -1) { - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAIL); - } - if (prog_id == curr_prog_id) { - fprintf(stderr, - "Interrupted: Removing XDP program on ifindex:%d device:%s\n", - ifindex, ifname); - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); - } else if (!curr_prog_id) { - printf("couldn't find a prog id on a given iface\n"); - } else { - printf("program on interface changed, not removing\n"); - } - } - /* Detach tracepoints */ - while (tp_cnt) - bpf_link__destroy(tp_links[--tp_cnt]); +DEFINE_SAMPLE_INIT(xdp_redirect_cpu); - exit(EXIT_OK); -} +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "dev", required_argument, NULL, 'd' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "progname", required_argument, NULL, 'p' }, + { "qsize", required_argument, NULL, 'q' }, + { "cpu", required_argument, NULL, 'c' }, + { "stress-mode", no_argument, NULL, 'x' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + { "mprog-name", required_argument, NULL, 'e' }, + { "mprog-filename", required_argument, NULL, 'f' }, + { "redirect-device", required_argument, NULL, 'r' }, + { "redirect-map", required_argument, NULL, 'm' }, + {} +}; static void print_avail_progs(struct bpf_object *obj) { struct bpf_program *pos; + printf(" Programs to be used for -p/--progname:\n"); bpf_object__for_each_program(pos, obj) { - if (bpf_program__is_xdp(pos)) - printf(" %s\n", bpf_program__title(pos, false)); - } -} - -static void usage(char *argv[], struct bpf_object *obj) -{ - int i; - - printf("\nDOCUMENTATION:\n%s\n", __doc__); - printf("\n"); - printf(" Usage: %s (options-see-below)\n", argv[0]); - printf(" Listing options:\n"); - for (i = 0; long_options[i].name != 0; i++) { - printf(" --%-12s", long_options[i].name); - if (long_options[i].flag != NULL) - printf(" flag (internal value:%d)", - *long_options[i].flag); - else - printf(" short-option: -%c", - long_options[i].val); - printf("\n"); - } - printf("\n Programs to be used for --progname:\n"); - print_avail_progs(obj); - printf("\n"); -} - -/* gettime returns the current time of day in nanoseconds. - * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) - * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) - */ -#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -static __u64 gettime(void) -{ - struct timespec t; - int res; - - res = clock_gettime(CLOCK_MONOTONIC, &t); - if (res < 0) { - fprintf(stderr, "Error with gettimeofday! (%i)\n", res); - exit(EXIT_FAIL); - } - return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; -} - -/* Common stats data record shared with _kern.c */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 issue; -}; -struct record { - __u64 timestamp; - struct datarec total; - struct datarec *cpu; -}; -struct stats_record { - struct record rx_cnt; - struct record redir_err; - struct record kthread; - struct record exception; - struct record enq[MAX_CPUS]; -}; - -static bool map_collect_percpu(int fd, __u32 key, struct record *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec values[nr_cpus]; - __u64 sum_processed = 0; - __u64 sum_dropped = 0; - __u64 sum_issue = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_processed += values[i].processed; - rec->cpu[i].dropped = values[i].dropped; - sum_dropped += values[i].dropped; - rec->cpu[i].issue = values[i].issue; - sum_issue += values[i].issue; - } - rec->total.processed = sum_processed; - rec->total.dropped = sum_dropped; - rec->total.issue = sum_issue; - return true; -} - -static struct datarec *alloc_record_per_cpu(void) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec *array; - size_t size; - - size = sizeof(struct datarec) * nr_cpus; - array = malloc(size); - memset(array, 0, size); - if (!array) { - fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); - exit(EXIT_FAIL_MEM); - } - return array; -} - -static struct stats_record *alloc_stats_record(void) -{ - struct stats_record *rec; - int i; - - rec = malloc(sizeof(*rec)); - memset(rec, 0, sizeof(*rec)); - if (!rec) { - fprintf(stderr, "Mem alloc error\n"); - exit(EXIT_FAIL_MEM); - } - rec->rx_cnt.cpu = alloc_record_per_cpu(); - rec->redir_err.cpu = alloc_record_per_cpu(); - rec->kthread.cpu = alloc_record_per_cpu(); - rec->exception.cpu = alloc_record_per_cpu(); - for (i = 0; i < MAX_CPUS; i++) - rec->enq[i].cpu = alloc_record_per_cpu(); - - return rec; -} - -static void free_stats_record(struct stats_record *r) -{ - int i; - - for (i = 0; i < MAX_CPUS; i++) - free(r->enq[i].cpu); - free(r->exception.cpu); - free(r->kthread.cpu); - free(r->redir_err.cpu); - free(r->rx_cnt.cpu); - free(r); -} - -static double calc_period(struct record *r, struct record *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) -{ - __u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->processed - p->processed; - pps = packets / period_; - } - return pps; -} - -static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) -{ - __u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->dropped - p->dropped; - pps = packets / period_; - } - return pps; -} - -static __u64 calc_errs_pps(struct datarec *r, - struct datarec *p, double period_) -{ - __u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->issue - p->issue; - pps = packets / period_; - } - return pps; -} - -static void stats_print(struct stats_record *stats_rec, - struct stats_record *stats_prev, - char *prog_name) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - double pps = 0, drop = 0, err = 0; - struct record *rec, *prev; - int to_cpu; - double t; - int i; - - /* Header */ - printf("Running XDP/eBPF prog_name:%s\n", prog_name); - printf("%-15s %-7s %-14s %-11s %-9s\n", - "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); - - /* XDP rx_cnt */ - { - char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; - char *errstr = ""; - - rec = &stats_rec->rx_cnt; - prev = &stats_prev->rx_cnt; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) - errstr = "cpu-dest/err"; - if (pps > 0) - printf(fmt_rx, "XDP-RX", - i, pps, drop, err, errstr); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - err = calc_errs_pps(&rec->total, &prev->total, t); - printf(fm2_rx, "XDP-RX", "total", pps, drop); - } - - /* cpumap enqueue stats */ - for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { - char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; - char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; - char *errstr = ""; - - rec = &stats_rec->enq[to_cpu]; - prev = &stats_prev->enq[to_cpu]; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) { - errstr = "bulk-average"; - err = pps / err; /* calc average bulk size */ - } - if (pps > 0) - printf(fmt, "cpumap-enqueue", - i, to_cpu, pps, drop, err, errstr); - } - pps = calc_pps(&rec->total, &prev->total, t); - if (pps > 0) { - drop = calc_drop_pps(&rec->total, &prev->total, t); - err = calc_errs_pps(&rec->total, &prev->total, t); - if (err > 0) { - errstr = "bulk-average"; - err = pps / err; /* calc average bulk size */ - } - printf(fm2, "cpumap-enqueue", - "sum", to_cpu, pps, drop, err, errstr); - } - } - - /* cpumap kthread stats */ - { - char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *e_str = ""; - - rec = &stats_rec->kthread; - prev = &stats_prev->kthread; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) - e_str = "sched"; - if (pps > 0) - printf(fmt_k, "cpumap_kthread", - i, pps, drop, err, e_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - err = calc_errs_pps(&rec->total, &prev->total, t); - if (err > 0) - e_str = "sched-sum"; - printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str); - } - - /* XDP redirect err tracepoints (very unlikely) */ - { - char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; - char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; - - rec = &stats_rec->redir_err; - prev = &stats_prev->redir_err; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - if (pps > 0) - printf(fmt_err, "redirect_err", i, pps, drop); + if (bpf_program__type(pos) == BPF_PROG_TYPE_XDP) { + if (!strncmp(bpf_program__name(pos), "xdp_prognum", + sizeof("xdp_prognum") - 1)) + printf(" %s\n", bpf_program__name(pos)); } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - printf(fm2_err, "redirect_err", "total", pps, drop); } - - /* XDP general exception tracepoints */ - { - char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; - char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; - - rec = &stats_rec->exception; - prev = &stats_prev->exception; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - if (pps > 0) - printf(fmt_err, "xdp_exception", i, pps, drop); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - printf(fm2_err, "xdp_exception", "total", pps, drop); - } - - printf("\n"); - fflush(stdout); -} - -static void stats_collect(struct stats_record *rec) -{ - int fd, i; - - fd = rx_cnt_map_fd; - map_collect_percpu(fd, 0, &rec->rx_cnt); - - fd = redirect_err_cnt_map_fd; - map_collect_percpu(fd, 1, &rec->redir_err); - - fd = cpumap_enqueue_cnt_map_fd; - for (i = 0; i < MAX_CPUS; i++) - map_collect_percpu(fd, i, &rec->enq[i]); - - fd = cpumap_kthread_cnt_map_fd; - map_collect_percpu(fd, 0, &rec->kthread); - - fd = exception_cnt_map_fd; - map_collect_percpu(fd, 0, &rec->exception); } - -/* Pointer swap trick */ -static inline void swap(struct stats_record **a, struct stats_record **b) +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, struct bpf_object *obj) { - struct stats_record *tmp; - - tmp = *a; - *a = *b; - *b = tmp; + sample_usage(argv, long_options, doc, mask, error); + print_avail_progs(obj); } -static int create_cpu_entry(__u32 cpu, __u32 queue_size, +static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, __u32 avail_idx, bool new) { __u32 curr_cpus_count = 0; @@ -506,40 +94,42 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size, /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. */ - ret = bpf_map_update_elem(cpu_map_fd, &cpu, &queue_size, 0); - if (ret) { - fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); - exit(EXIT_FAIL_BPF); + ret = bpf_map_update_elem(map_fd, &cpu, value, 0); + if (ret < 0) { + fprintf(stderr, "Create CPU entry failed: %s\n", strerror(errno)); + return ret; } /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. */ - ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0); - if (ret) { - fprintf(stderr, "Add to avail CPUs failed\n"); - exit(EXIT_FAIL_BPF); + ret = bpf_map_update_elem(avail_fd, &avail_idx, &cpu, 0); + if (ret < 0) { + fprintf(stderr, "Add to avail CPUs failed: %s\n", strerror(errno)); + return ret; } /* When not replacing/updating existing entry, bump the count */ - ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count); - if (ret) { - fprintf(stderr, "Failed reading curr cpus_count\n"); - exit(EXIT_FAIL_BPF); + ret = bpf_map_lookup_elem(count_fd, &key, &curr_cpus_count); + if (ret < 0) { + fprintf(stderr, "Failed reading curr cpus_count: %s\n", + strerror(errno)); + return ret; } if (new) { curr_cpus_count++; - ret = bpf_map_update_elem(cpus_count_map_fd, &key, + ret = bpf_map_update_elem(count_fd, &key, &curr_cpus_count, 0); - if (ret) { - fprintf(stderr, "Failed write curr cpus_count\n"); - exit(EXIT_FAIL_BPF); + if (ret < 0) { + fprintf(stderr, "Failed write curr cpus_count: %s\n", + strerror(errno)); + return ret; } } - /* map_fd[7] = cpus_iterator */ - printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n", - new ? "Add-new":"Replace", cpu, avail_idx, - queue_size, curr_cpus_count); + + printf("%s CPU: %u as idx: %u qsize: %d cpumap_prog_fd: %d (cpus_count: %u)\n", + new ? "Add new" : "Replace", cpu, avail_idx, + value->qsize, value->bpf_prog.fd, curr_cpus_count); return 0; } @@ -547,284 +137,423 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size, /* CPUs are zero-indexed. Thus, add a special sentinel default value * in map cpus_available to mark CPU index'es not configured */ -static void mark_cpus_unavailable(void) +static int mark_cpus_unavailable(void) { - __u32 invalid_cpu = MAX_CPUS; - int ret, i; + int ret, i, n_cpus = libbpf_num_possible_cpus(); + __u32 invalid_cpu = n_cpus; - for (i = 0; i < MAX_CPUS; i++) { - ret = bpf_map_update_elem(cpus_available_map_fd, &i, + for (i = 0; i < n_cpus; i++) { + ret = bpf_map_update_elem(avail_fd, &i, &invalid_cpu, 0); - if (ret) { - fprintf(stderr, "Failed marking CPU unavailable\n"); - exit(EXIT_FAIL_BPF); + if (ret < 0) { + fprintf(stderr, "Failed marking CPU unavailable: %s\n", + strerror(errno)); + return ret; } } + + return 0; } /* Stress cpumap management code by concurrently changing underlying cpumap */ -static void stress_cpumap(void) +static void stress_cpumap(void *ctx) { + struct bpf_cpumap_val *value = ctx; + /* Changing qsize will cause kernel to free and alloc a new * bpf_cpu_map_entry, with an associated/complicated tear-down * procedure. */ - create_cpu_entry(1, 1024, 0, false); - create_cpu_entry(1, 8, 0, false); - create_cpu_entry(1, 16000, 0, false); + value->qsize = 1024; + create_cpu_entry(1, value, 0, false); + value->qsize = 8; + create_cpu_entry(1, value, 0, false); + value->qsize = 16000; + create_cpu_entry(1, value, 0, false); } -static void stats_poll(int interval, bool use_separators, char *prog_name, - bool stress_mode) +static int set_cpumap_prog(struct xdp_redirect_cpu *skel, + const char *redir_interface, const char *redir_map, + const char *mprog_filename, const char *mprog_name) { - struct stats_record *record, *prev; - - record = alloc_stats_record(); - prev = alloc_stats_record(); - stats_collect(record); - - /* Trick to pretty printf with thousands separators use %' */ - if (use_separators) - setlocale(LC_NUMERIC, "en_US"); - - while (1) { - swap(&prev, &record); - stats_collect(record); - stats_print(record, prev, prog_name); - sleep(interval); - if (stress_mode) - stress_cpumap(); - } + if (mprog_filename) { + struct bpf_program *prog; + struct bpf_object *obj; + int ret; + + if (!mprog_name) { + fprintf(stderr, "BPF program not specified for file %s\n", + mprog_filename); + goto end; + } + if ((redir_interface && !redir_map) || (!redir_interface && redir_map)) { + fprintf(stderr, "--redirect-%s specified but --redirect-%s not specified\n", + redir_interface ? "device" : "map", redir_interface ? "map" : "device"); + goto end; + } - free_stats_record(record); - free_stats_record(prev); -} + /* Custom BPF program */ + obj = bpf_object__open_file(mprog_filename, NULL); + if (!obj) { + ret = -errno; + fprintf(stderr, "Failed to bpf_prog_load_xattr: %s\n", + strerror(errno)); + return ret; + } -static struct bpf_link * attach_tp(struct bpf_object *obj, - const char *tp_category, - const char* tp_name) -{ - struct bpf_program *prog; - struct bpf_link *link; - char sec_name[PATH_MAX]; - int len; - - len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", - tp_category, tp_name); - if (len < 0) - exit(EXIT_FAIL); - - prog = bpf_object__find_program_by_title(obj, sec_name); - if (!prog) { - fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); - exit(EXIT_FAIL_BPF); - } + ret = bpf_object__load(obj); + if (ret < 0) { + ret = -errno; + fprintf(stderr, "Failed to bpf_object__load: %s\n", + strerror(errno)); + return ret; + } - link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (IS_ERR(link)) - exit(EXIT_FAIL_BPF); + if (redir_map) { + int err, redir_map_fd, ifindex_out, key = 0; - return link; -} + redir_map_fd = bpf_object__find_map_fd_by_name(obj, redir_map); + if (redir_map_fd < 0) { + fprintf(stderr, "Failed to bpf_object__find_map_fd_by_name: %s\n", + strerror(errno)); + return redir_map_fd; + } -static void init_tracepoints(struct bpf_object *obj) { - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); -} + ifindex_out = if_nametoindex(redir_interface); + if (!ifindex_out) + ifindex_out = strtoul(redir_interface, NULL, 0); + if (!ifindex_out) { + fprintf(stderr, "Bad interface name or index\n"); + return -EINVAL; + } -static int init_map_fds(struct bpf_object *obj) -{ - /* Maps updated by tracepoints */ - redirect_err_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); - cpumap_enqueue_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); - cpumap_kthread_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); - - /* Maps used by XDP */ - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - cpus_available_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_available"); - cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); - cpus_iterator_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); - - if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || - redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || - cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 || - cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 || - exception_cnt_map_fd < 0) - return -ENOENT; + err = bpf_map_update_elem(redir_map_fd, &key, &ifindex_out, 0); + if (err < 0) + return err; + } + prog = bpf_object__find_program_by_name(obj, mprog_name); + if (!prog) { + ret = -errno; + fprintf(stderr, "Failed to bpf_object__find_program_by_name: %s\n", + strerror(errno)); + return ret; + } + + return bpf_program__fd(prog); + } else { + if (mprog_name) { + if (redir_interface || redir_map) { + fprintf(stderr, "Need to specify --mprog-filename/-f\n"); + goto end; + } + if (!strcmp(mprog_name, "pass") || !strcmp(mprog_name, "drop")) { + /* Use built-in pass/drop programs */ + return *mprog_name == 'p' ? bpf_program__fd(skel->progs.xdp_redirect_cpu_pass) + : bpf_program__fd(skel->progs.xdp_redirect_cpu_drop); + } else { + fprintf(stderr, "Unknown name \"%s\" for built-in BPF program\n", + mprog_name); + goto end; + } + } else { + if (redir_map) { + fprintf(stderr, "Need to specify --mprog-filename, --mprog-name and" + " --redirect-device with --redirect-map\n"); + goto end; + } + if (redir_interface) { + /* Use built-in devmap redirect */ + struct bpf_devmap_val val = {}; + int ifindex_out, err; + __u32 key = 0; + + if (!redir_interface) + return 0; + + ifindex_out = if_nametoindex(redir_interface); + if (!ifindex_out) + ifindex_out = strtoul(redir_interface, NULL, 0); + if (!ifindex_out) { + fprintf(stderr, "Bad interface name or index\n"); + return -EINVAL; + } + + if (get_mac_addr(ifindex_out, skel->bss->tx_mac_addr) < 0) { + printf("Get interface %d mac failed\n", ifindex_out); + return -EINVAL; + } + + val.ifindex = ifindex_out; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_egress_prog); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.tx_port), &key, &val, 0); + if (err < 0) + return -errno; + + return bpf_program__fd(skel->progs.xdp_redirect_cpu_devmap); + } + } + } + + /* Disabled */ return 0; +end: + fprintf(stderr, "Invalid options for CPUMAP BPF program\n"); + return -EINVAL; } int main(int argc, char **argv) { - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; - char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - bool use_separators = true; + const char *redir_interface = NULL, *redir_map = NULL; + const char *mprog_filename = NULL, *mprog_name = NULL; + struct xdp_redirect_cpu *skel; + struct bpf_map_info info = {}; + struct bpf_cpumap_val value; + __u32 infosz = sizeof(info); + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; bool stress_mode = false; struct bpf_program *prog; - struct bpf_object *obj; - char filename[256]; + const char *prog_name; + bool generic = false; + bool force = false; int added_cpus = 0; + bool error = true; int longindex = 0; - int interval = 2; int add_cpu = -1; - int opt, err; - int prog_fd; + int ifindex = -1; + int *cpu, i, opt; __u32 qsize; - - /* Notice: choosing he queue size is very important with the - * ixgbe driver, because it's driver page recycling trick is - * dependend on pages being returned quickly. The number of - * out-standing packets in the system must be less-than 2x - * RX-ring size. + int n_cpus; + + n_cpus = libbpf_num_possible_cpus(); + + /* Notice: Choosing the queue size is very important when CPU is + * configured with power-saving states. + * + * If deepest state take 133 usec to wakeup from (133/10^6). When link + * speed is 10Gbit/s ((10*10^9/8) in bytes/sec). How many bytes can + * arrive with in 133 usec at this speed: (10*10^9/8)*(133/10^6) = + * 166250 bytes. With MTU size packets this is 110 packets, and with + * minimum Ethernet (MAC-preamble + intergap) 84 bytes is 1979 packets. + * + * Setting default cpumap queue to 2048 as worst-case (small packet) + * should be +64 packet due kthread wakeup call (due to xdp_do_flush) + * worst-case is 2043 packets. + * + * Sysadm can configured system to avoid deep-sleep via: + * tuned-adm profile network-latency */ - qsize = 128+64; + qsize = 2048; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; + skel = xdp_redirect_cpu__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_cpu__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; + } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return EXIT_FAIL; + if (bpf_map__set_max_entries(skel->maps.cpu_map, n_cpus) < 0) { + fprintf(stderr, "Failed to set max entries for cpu_map map: %s", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } - if (prog_fd < 0) { - fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", + if (bpf_map__set_max_entries(skel->maps.cpus_available, n_cpus) < 0) { + fprintf(stderr, "Failed to set max entries for cpus_available map: %s", strerror(errno)); - return EXIT_FAIL; + ret = EXIT_FAIL_BPF; + goto end_destroy; } - init_tracepoints(obj); - if (init_map_fds(obj) < 0) { - fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); - return EXIT_FAIL; + + cpu = calloc(n_cpus, sizeof(int)); + if (!cpu) { + fprintf(stderr, "Failed to allocate cpu array\n"); + goto end_destroy; } - mark_cpus_unavailable(); - /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzF", + prog = skel->progs.xdp_prognum5_lb_hash_ip_pairs; + while ((opt = getopt_long(argc, argv, "d:si:Sxp:f:e:r:m:c:q:Fvh", long_options, &longindex)) != -1) { switch (opt) { case 'd': if (strlen(optarg) >= IF_NAMESIZE) { - fprintf(stderr, "ERR: --dev name too long\n"); - goto error; + fprintf(stderr, "-d/--dev name too long\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - ifname = (char *)&ifname_buf; - strncpy(ifname, optarg, IF_NAMESIZE); - ifindex = if_nametoindex(ifname); - if (ifindex == 0) { - fprintf(stderr, - "ERR: --dev name unknown err(%d):%s\n", + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = strtoul(optarg, NULL, 0); + if (!ifindex) { + fprintf(stderr, "Bad interface index or name (%d): %s\n", errno, strerror(errno)); - goto error; + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } break; case 's': - interval = atoi(optarg); + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); break; case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; + generic = true; break; case 'x': stress_mode = true; break; - case 'z': - use_separators = false; - break; case 'p': /* Selecting eBPF prog to load */ prog_name = optarg; + prog = bpf_object__find_program_by_name(skel->obj, + prog_name); + if (!prog) { + fprintf(stderr, + "Failed to find program %s specified by" + " option -p/--progname\n", + prog_name); + print_avail_progs(skel->obj); + goto end_cpu; + } + break; + case 'f': + mprog_filename = optarg; + break; + case 'e': + mprog_name = optarg; + break; + case 'r': + redir_interface = optarg; + mask |= SAMPLE_DEVMAP_XMIT_CNT_MULTI; + break; + case 'm': + redir_map = optarg; break; case 'c': /* Add multiple CPUs */ add_cpu = strtoul(optarg, NULL, 0); - if (add_cpu >= MAX_CPUS) { + if (add_cpu >= n_cpus) { fprintf(stderr, - "--cpu nr too large for cpumap err(%d):%s\n", + "--cpu nr too large for cpumap err (%d):%s\n", errno, strerror(errno)); - goto error; + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - create_cpu_entry(add_cpu, qsize, added_cpus, true); - added_cpus++; + cpu[added_cpus++] = add_cpu; break; case 'q': - qsize = atoi(optarg); + qsize = strtoul(optarg, NULL, 0); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; + break; + case 'v': + sample_switch_mode(); break; case 'h': - error: + error = false; default: - usage(argv, obj); - return EXIT_FAIL_OPTION; + usage(argv, long_options, __doc__, mask, error, skel->obj); + goto end_cpu; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) - xdp_flags |= XDP_FLAGS_DRV_MODE; - - /* Required option */ + ret = EXIT_FAIL_OPTION; if (ifindex == -1) { - fprintf(stderr, "ERR: required option --dev missing\n"); - usage(argv, obj); - return EXIT_FAIL_OPTION; + fprintf(stderr, "Required option --dev missing\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - /* Required option */ + if (add_cpu == -1) { - fprintf(stderr, "ERR: required option --cpu missing\n"); - fprintf(stderr, " Specify multiple --cpu option to add more\n"); - usage(argv, obj); - return EXIT_FAIL_OPTION; + fprintf(stderr, "Required option --cpu missing\n" + "Specify multiple --cpu option to add more\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - /* Remove XDP program when program is interrupted or killed */ - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + skel->rodata->from_match[0] = ifindex; + if (redir_interface) + skel->rodata->to_match[0] = if_nametoindex(redir_interface); - prog = bpf_object__find_program_by_title(obj, prog_name); - if (!prog) { - fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - return EXIT_FAIL; + ret = xdp_redirect_cpu__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_cpu__load: %s\n", + strerror(errno)); + goto end_cpu; } - prog_fd = bpf_program__fd(prog); - if (prog_fd < 0) { - fprintf(stderr, "bpf_program__fd failed\n"); - return EXIT_FAIL; + ret = bpf_obj_get_info_by_fd(bpf_map__fd(skel->maps.cpu_map), &info, &infosz); + if (ret < 0) { + fprintf(stderr, "Failed bpf_obj_get_info_by_fd for cpumap: %s\n", + strerror(errno)); + goto end_cpu; } - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { - fprintf(stderr, "link set xdp fd failed\n"); - return EXIT_FAIL_XDP; + skel->bss->cpumap_map_id = info.id; + + map_fd = bpf_map__fd(skel->maps.cpu_map); + avail_fd = bpf_map__fd(skel->maps.cpus_available); + count_fd = bpf_map__fd(skel->maps.cpus_count); + + ret = mark_cpus_unavailable(); + if (ret < 0) { + fprintf(stderr, "Unable to mark CPUs as unavailable\n"); + goto end_cpu; } - err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (err) { - printf("can't get prog info - %s\n", strerror(errno)); - return err; + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_cpu; } - prog_id = info.id; - stats_poll(interval, use_separators, prog_name, stress_mode); - return EXIT_OK; + value.bpf_prog.fd = set_cpumap_prog(skel, redir_interface, redir_map, + mprog_filename, mprog_name); + if (value.bpf_prog.fd < 0) { + fprintf(stderr, "Failed to set CPUMAP BPF program: %s\n", + strerror(-value.bpf_prog.fd)); + usage(argv, long_options, __doc__, mask, true, skel->obj); + ret = EXIT_FAIL_BPF; + goto end_cpu; + } + value.qsize = qsize; + + for (i = 0; i < added_cpus; i++) { + if (create_cpu_entry(cpu[i], &value, i, true) < 0) { + fprintf(stderr, "Cannot proceed, exiting\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; + } + } + + ret = EXIT_FAIL_XDP; + if (sample_install_xdp(prog, ifindex, generic, force) < 0) + goto end_cpu; + + ret = sample_run(interval, stress_mode ? stress_cpumap : NULL, &value); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_cpu; + } + ret = EXIT_OK; +end_cpu: + free(cpu); +end_destroy: + xdp_redirect_cpu__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c deleted file mode 100644 index d26ec3aa215e..000000000000 --- a/samples/bpf/xdp_redirect_kern.c +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <bpf/bpf_helpers.h> - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, int); - __type(value, int); - __uint(max_entries, 1); -} tx_port SEC(".maps"); - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. - */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, 1); -} rxcnt SEC(".maps"); - -static void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -SEC("xdp_redirect") -int xdp_redirect_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - int *ifindex, port = 0; - long *value; - u32 key = 0; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - ifindex = bpf_map_lookup_elem(&tx_port, &port); - if (!ifindex) - return rc; - - value = bpf_map_lookup_elem(&rxcnt, &key); - if (value) - *value += 1; - - swap_src_dst_mac(data); - return bpf_redirect(*ifindex, 0); -} - -/* Redirect require an XDP bpf_prog loaded on the TX device */ -SEC("xdp_redirect_dummy") -int xdp_redirect_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map.bpf.c b/samples/bpf/xdp_redirect_map.bpf.c new file mode 100644 index 000000000000..8557c278df77 --- /dev/null +++ b/samples/bpf/xdp_redirect_map.bpf.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define KBUILD_MODNAME "foo" + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +/* The 2nd xdp prog on egress does not support skb mode, so we define two + * maps, tx_port_general and tx_port_native. + */ +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1); +} tx_port_general SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 1); +} tx_port_native SEC(".maps"); + +/* store egress interface mac address */ +const volatile __u8 tx_mac_addr[ETH_ALEN]; + +static __always_inline int xdp_redirect_map(struct xdp_md *ctx, void *redirect_map) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + swap_src_dst_mac(data); + return bpf_redirect_map(redirect_map, 0, 0); +} + +SEC("xdp") +int xdp_redirect_map_general(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_general); +} + +SEC("xdp") +int xdp_redirect_map_native(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_native); +} + +SEC("xdp/devmap") +int xdp_redirect_map_egress(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u8 *mac_addr = (u8 *) tx_mac_addr; + struct ethhdr *eth = data; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + barrier_var(mac_addr); /* prevent optimizing out memcpy */ + __builtin_memcpy(eth->h_source, mac_addr, ETH_ALEN); + + return XDP_PASS; +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c deleted file mode 100644 index 6489352ab7a4..000000000000 --- a/samples/bpf/xdp_redirect_map_kern.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <bpf/bpf_helpers.h> - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 100); -} tx_port SEC(".maps"); - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. - */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, 1); -} rxcnt SEC(".maps"); - -static void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -SEC("xdp_redirect_map") -int xdp_redirect_map_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - int vport, port = 0, m = 0; - long *value; - u32 key = 0; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - /* constant virtual port */ - vport = 0; - - /* count packet in global counter */ - value = bpf_map_lookup_elem(&rxcnt, &key); - if (value) - *value += 1; - - swap_src_dst_mac(data); - - /* send packet out physical port */ - return bpf_redirect_map(&tx_port, vport, 0); -} - -/* Redirect require an XDP bpf_prog loaded on the TX device */ -SEC("xdp_redirect_dummy") -int xdp_redirect_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_multi.bpf.c b/samples/bpf/xdp_redirect_map_multi.bpf.c new file mode 100644 index 000000000000..8b2fd4ec2c76 --- /dev/null +++ b/samples/bpf/xdp_redirect_map_multi.bpf.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KBUILD_MODNAME "foo" + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 32); +} forward_map_general SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 32); +} forward_map_native SEC(".maps"); + +/* map to store egress interfaces mac addresses */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, __be64); + __uint(max_entries, 32); +} mac_map SEC(".maps"); + +static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map) +{ + u32 key = bpf_get_smp_processor_id(); + struct datarec *rec; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + return bpf_redirect_map(forward_map, 0, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); +} + +SEC("xdp") +int xdp_redirect_map_general(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &forward_map_general); +} + +SEC("xdp") +int xdp_redirect_map_native(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &forward_map_native); +} + +SEC("xdp/devmap") +int xdp_devmap_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = ctx->egress_ifindex; + struct ethhdr *eth = data; + __be64 *mac; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + mac = bpf_map_lookup_elem(&mac_map, &key); + if (mac) + __builtin_memcpy(eth->h_source, mac, ETH_ALEN); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_multi_user.c b/samples/bpf/xdp_redirect_map_multi_user.c new file mode 100644 index 000000000000..9e24f2705b67 --- /dev/null +++ b/samples/bpf/xdp_redirect_map_multi_user.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +static const char *__doc__ = +"XDP multi redirect tool, using BPF_MAP_TYPE_DEVMAP and BPF_F_BROADCAST flag for bpf_redirect_map\n" +"Usage: xdp_redirect_map_multi <IFINDEX|IFNAME> <IFINDEX|IFNAME> ... <IFINDEX|IFNAME>\n"; + +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <getopt.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/if.h> +#include <unistd.h> +#include <libgen.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <linux/if_ether.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_map_multi.skel.h" + +#define MAX_IFACE_NUM 32 +static int ifaces[MAX_IFACE_NUM] = {}; + +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_SKIP_HEADING; + +DEFINE_SAMPLE_INIT(xdp_redirect_map_multi); + +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "load-egress", no_argument, NULL, 'X' }, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} +}; + +static int update_mac_map(struct bpf_map *map) +{ + int mac_map_fd = bpf_map__fd(map); + unsigned char mac_addr[6]; + unsigned int ifindex; + int i, ret = -1; + + for (i = 0; ifaces[i] > 0; i++) { + ifindex = ifaces[i]; + + ret = get_mac_addr(ifindex, mac_addr); + if (ret < 0) { + fprintf(stderr, "get interface %d mac failed\n", + ifindex); + return ret; + } + + ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update mac address for ifindex %d\n", + ifindex); + return ret; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct bpf_devmap_val devmap_val = {}; + struct xdp_redirect_map_multi *skel; + struct bpf_program *ingress_prog; + bool xdp_devmap_attached = false; + struct bpf_map *forward_map; + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; + char ifname[IF_NAMESIZE]; + unsigned int ifindex; + bool generic = false; + bool force = false; + bool tried = false; + bool error = true; + int i, opt; + + while ((opt = getopt_long(argc, argv, "hSFXi:vs", + long_options, NULL)) != -1) { + switch (opt) { + case 'S': + generic = true; + /* devmap_xmit tracepoint not available */ + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); + break; + case 'F': + force = true; + break; + case 'X': + xdp_devmap_attached = true; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); + break; + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'h': + error = false; + default: + sample_usage(argv, long_options, __doc__, mask, error); + return ret; + } + } + + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, error); + return ret; + } + + skel = xdp_redirect_map_multi__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_map_multi__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; + } + + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = EXIT_FAIL_OPTION; + for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) { + ifaces[i] = if_nametoindex(argv[optind + i]); + if (!ifaces[i]) + ifaces[i] = strtoul(argv[optind + i], NULL, 0); + if (!if_indextoname(ifaces[i], ifname)) { + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end_destroy; + } + + skel->rodata->from_match[i] = ifaces[i]; + skel->rodata->to_match[i] = ifaces[i]; + } + + ret = xdp_redirect_map_multi__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_map_multi__load: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + if (xdp_devmap_attached) { + /* Update mac_map with all egress interfaces' mac addr */ + if (update_mac_map(skel->maps.mac_map) < 0) { + fprintf(stderr, "Updating mac address failed\n"); + ret = EXIT_FAIL; + goto end_destroy; + } + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + ingress_prog = skel->progs.xdp_redirect_map_native; + forward_map = skel->maps.forward_map_native; + + for (i = 0; ifaces[i] > 0; i++) { + ifindex = ifaces[i]; + + ret = EXIT_FAIL_XDP; +restart: + /* bind prog_fd to each interface */ + if (sample_install_xdp(ingress_prog, ifindex, generic, force) < 0) { + if (generic && !tried) { + fprintf(stderr, + "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n"); + ingress_prog = skel->progs.xdp_redirect_map_general; + forward_map = skel->maps.forward_map_general; + tried = true; + goto restart; + } + goto end_destroy; + } + + /* Add all the interfaces to forward group and attach + * egress devmap program if exist + */ + devmap_val.ifindex = ifindex; + if (xdp_devmap_attached) + devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_devmap_prog); + ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap value: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + } + + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + ret = EXIT_OK; +end_destroy: + xdp_redirect_map_multi__destroy(skel); +end: + sample_exit(ret); +} diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 35e16dee613e..c889a1394dc1 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ +static const char *__doc__ = +"XDP redirect tool, using BPF_MAP_TYPE_DEVMAP\n" +"Usage: xdp_redirect_map <IFINDEX|IFNAME>_IN <IFINDEX|IFNAME>_OUT\n"; + #include <linux/bpf.h> #include <linux/if_link.h> #include <assert.h> @@ -13,131 +17,86 @@ #include <net/if.h> #include <unistd.h> #include <libgen.h> -#include <sys/resource.h> - -#include "bpf_util.h" +#include <getopt.h> #include <bpf/bpf.h> #include <bpf/libbpf.h> +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_map.skel.h" -static int ifindex_in; -static int ifindex_out; -static bool ifindex_out_xdp_dummy_attached = true; -static __u32 prog_id; -static __u32 dummy_prog_id; +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int rxcnt_map_fd; +DEFINE_SAMPLE_INIT(xdp_redirect_map); -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "load-egress", no_argument, NULL, 'X' }, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} +}; - if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface IN\n"); - else - printf("program on iface IN changed, not removing\n"); - - if (ifindex_out_xdp_dummy_attached) { - curr_prog_id = 0; - if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, - xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (dummy_prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface OUT\n"); - else - printf("program on iface OUT changed, not removing\n"); - } - exit(0); -} - -static void poll_stats(int interval, int ifindex) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus], prev[nr_cpus]; - - memset(prev, 0, sizeof(prev)); - - while (1) { - __u64 sum = 0; - __u32 key = 0; - int i; - - sleep(interval); - assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[i]); - if (sum) - printf("ifindex %i: %10llu pkt/s\n", - ifindex, sum / interval); - memcpy(prev, values, sizeof(values)); - } -} - -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n" - " -F force loading prog\n", - prog); -} +static int verbose = 0; int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct bpf_program *prog, *dummy_prog; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - int prog_fd, dummy_prog_fd; - const char *optstr = "FSN"; - struct bpf_object *obj; - int ret, opt, key = 0; - char filename[256]; - int tx_port_map_fd; - - while ((opt = getopt(argc, argv, optstr)) != -1) { + struct bpf_devmap_val devmap_val = {}; + bool xdp_devmap_attached = false; + struct xdp_redirect_map *skel; + char str[2 * IF_NAMESIZE + 1]; + char ifname_out[IF_NAMESIZE]; + struct bpf_map *tx_port_map; + char ifname_in[IF_NAMESIZE]; + int ifindex_in, ifindex_out; + unsigned long interval = 2; + int ret = EXIT_FAIL_OPTION; + struct bpf_program *prog; + bool generic = false; + bool force = false; + bool tried = false; + bool error = true; + int opt, key = 0; + + while ((opt = getopt_long(argc, argv, "hSFXi:vs", + long_options, NULL)) != -1) { switch (opt) { case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - case 'N': - /* default, set below */ + generic = true; + /* devmap_xmit tracepoint not available */ + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; break; + case 'X': + xdp_devmap_attached = true; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); + verbose = 1; + break; + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) - xdp_flags |= XDP_FLAGS_DRV_MODE; - - if (optind == argc) { - printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); - return 1; - } - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, true); + goto end; } ifindex_in = if_nametoindex(argv[optind]); @@ -148,75 +107,122 @@ int main(int argc, char **argv) if (!ifindex_out) ifindex_out = strtoul(argv[optind + 1], NULL, 0); - printf("input: %d output: %d\n", ifindex_in, ifindex_out); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; + if (!ifindex_in || !ifindex_out) { + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end; + } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return 1; + skel = xdp_redirect_map__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_map__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; + } - prog = bpf_program__next(NULL, obj); - dummy_prog = bpf_program__next(prog, obj); - if (!prog || !dummy_prog) { - printf("finding a prog in obj file failed\n"); - return 1; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - /* bpf_prog_load_xattr gives us the pointer to first prog's fd, - * so we're missing only the fd for dummy prog - */ - dummy_prog_fd = bpf_program__fd(dummy_prog); - if (prog_fd < 0 || dummy_prog_fd < 0) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); - return 1; + + /* Load 2nd xdp prog on egress. */ + if (xdp_devmap_attached) { + ret = get_mac_addr(ifindex_out, skel->rodata->tx_mac_addr); + if (ret < 0) { + fprintf(stderr, "Failed to get interface %d mac address: %s\n", + ifindex_out, strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + if (verbose) + printf("Egress ifindex:%d using src MAC %02x:%02x:%02x:%02x:%02x:%02x\n", + ifindex_out, + skel->rodata->tx_mac_addr[0], skel->rodata->tx_mac_addr[1], + skel->rodata->tx_mac_addr[2], skel->rodata->tx_mac_addr[3], + skel->rodata->tx_mac_addr[4], skel->rodata->tx_mac_addr[5]); } - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + skel->rodata->from_match[0] = ifindex_in; + skel->rodata->to_match[0] = ifindex_out; + + ret = xdp_redirect_map__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_map__load: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { - printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); - return 1; + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + prog = skel->progs.xdp_redirect_map_native; + tx_port_map = skel->maps.tx_port_native; +restart: + if (sample_install_xdp(prog, ifindex_in, generic, force) < 0) { + /* First try with struct bpf_devmap_val as value for generic + * mode, then fallback to sizeof(int) for older kernels. + */ + fprintf(stderr, + "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n"); + if (generic && !tried) { + prog = skel->progs.xdp_redirect_map_general; + tx_port_map = skel->maps.tx_port_general; + tried = true; + goto restart; + } + ret = EXIT_FAIL_XDP; + goto end_destroy; } - prog_id = info.id; /* Loading dummy XDP prog on out-device */ - if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, - (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { - printf("WARN: link set xdp fd failed on %d\n", ifindex_out); - ifindex_out_xdp_dummy_attached = false; + sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out, generic, force); + + devmap_val.ifindex = ifindex_out; + if (xdp_devmap_attached) + devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_map_egress); + ret = bpf_map_update_elem(bpf_map__fd(tx_port_map), &key, &devmap_val, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap value: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + ret = EXIT_FAIL; + if (!if_indextoname(ifindex_in, ifname_in)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in, + strerror(errno)); + goto end_destroy; } - dummy_prog_id = info.id; - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - - /* populate virtual to physical port map */ - ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); - if (ret) { - perror("bpf_update_elem"); - goto out; + if (!if_indextoname(ifindex_out, ifname_out)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out, + strerror(errno)); + goto end_destroy; } - poll_stats(2, ifindex_out); + safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str)); + printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", + ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out)); + snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out); -out: - return 0; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + ret = EXIT_OK; +end_destroy: + xdp_redirect_map__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 9ca2bf457cda..8663dd631b6e 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> */ +static const char *__doc__ = +"XDP redirect tool, using bpf_redirect helper\n" +"Usage: xdp_redirect <IFINDEX|IFNAME>_IN <IFINDEX|IFNAME>_OUT\n"; + #include <linux/bpf.h> #include <linux/if_link.h> #include <assert.h> @@ -13,132 +17,72 @@ #include <net/if.h> #include <unistd.h> #include <libgen.h> -#include <sys/resource.h> - -#include "bpf_util.h" +#include <getopt.h> #include <bpf/bpf.h> #include <bpf/libbpf.h> +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect.skel.h" -static int ifindex_in; -static int ifindex_out; -static bool ifindex_out_xdp_dummy_attached = true; -static __u32 prog_id; -static __u32 dummy_prog_id; - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int rxcnt_map_fd; - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface IN\n"); - else - printf("program on iface IN changed, not removing\n"); - - if (ifindex_out_xdp_dummy_attached) { - curr_prog_id = 0; - if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, - xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (dummy_prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface OUT\n"); - else - printf("program on iface OUT changed, not removing\n"); - } - exit(0); -} - -static void poll_stats(int interval, int ifindex) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus], prev[nr_cpus]; - - memset(prev, 0, sizeof(prev)); - - while (1) { - __u64 sum = 0; - __u32 key = 0; - int i; - - sleep(interval); - assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[i]); - if (sum) - printf("ifindex %i: %10llu pkt/s\n", - ifindex, sum / interval); - memcpy(prev, values, sizeof(values)); - } -} +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n" - " -F force loading prog\n", - prog); -} +DEFINE_SAMPLE_INIT(xdp_redirect); +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"force", no_argument, NULL, 'F' }, + {"stats", no_argument, NULL, 's' }, + {"interval", required_argument, NULL, 'i' }, + {"verbose", no_argument, NULL, 'v' }, + {} +}; int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct bpf_program *prog, *dummy_prog; - int prog_fd, tx_port_map_fd, opt; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "FSN"; - struct bpf_object *obj; - char filename[256]; - int dummy_prog_fd; - int ret, key = 0; - - while ((opt = getopt(argc, argv, optstr)) != -1) { + int ifindex_in, ifindex_out, opt; + char str[2 * IF_NAMESIZE + 1]; + char ifname_out[IF_NAMESIZE]; + char ifname_in[IF_NAMESIZE]; + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; + struct xdp_redirect *skel; + bool generic = false; + bool force = false; + bool error = true; + + while ((opt = getopt_long(argc, argv, "hSFi:vs", + long_options, NULL)) != -1) { switch (opt) { case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - case 'N': - /* default, set below */ + generic = true; + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); break; + case 's': + mask |= SAMPLE_REDIRECT_CNT; + break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) - xdp_flags |= XDP_FLAGS_DRV_MODE; - - if (optind == argc) { - printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); - return 1; - } - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, true); + return ret; } ifindex_in = if_nametoindex(argv[optind]); @@ -149,75 +93,80 @@ int main(int argc, char **argv) if (!ifindex_out) ifindex_out = strtoul(argv[optind + 1], NULL, 0); - printf("input: %d output: %d\n", ifindex_in, ifindex_out); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return 1; - - prog = bpf_program__next(NULL, obj); - dummy_prog = bpf_program__next(prog, obj); - if (!prog || !dummy_prog) { - printf("finding a prog in obj file failed\n"); - return 1; + if (!ifindex_in || !ifindex_out) { + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end; } - /* bpf_prog_load_xattr gives us the pointer to first prog's fd, - * so we're missing only the fd for dummy prog - */ - dummy_prog_fd = bpf_program__fd(dummy_prog); - if (prog_fd < 0 || dummy_prog_fd < 0) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); - return 1; + + skel = xdp_redirect__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect__open: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { - printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); - return 1; + skel->rodata->from_match[0] = ifindex_in; + skel->rodata->to_match[0] = ifindex_out; + skel->rodata->ifindex_out = ifindex_out; + + ret = xdp_redirect__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect__load: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - prog_id = info.id; + + ret = EXIT_FAIL_XDP; + if (sample_install_xdp(skel->progs.xdp_redirect_prog, ifindex_in, + generic, force) < 0) + goto end_destroy; /* Loading dummy XDP prog on out-device */ - if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, - (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { - printf("WARN: link set xdp fd failed on %d\n", ifindex_out); - ifindex_out_xdp_dummy_attached = false; + sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out, + generic, force); + + ret = EXIT_FAIL; + if (!if_indextoname(ifindex_in, ifname_in)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in, + strerror(errno)); + goto end_destroy; } - memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + if (!if_indextoname(ifindex_out, ifname_out)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out, + strerror(errno)); + goto end_destroy; } - dummy_prog_id = info.id; - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str)); + printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", + ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out)); + snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out); - /* bpf redirect port */ - ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); - if (ret) { - perror("bpf_update_elem"); - goto out; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - - poll_stats(2, ifindex_out); - -out: - return 0; + ret = EXIT_OK; +end_destroy: + xdp_redirect__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_router_ipv4.bpf.c b/samples/bpf/xdp_router_ipv4.bpf.c new file mode 100644 index 000000000000..0643330d1d2e --- /dev/null +++ b/samples/bpf/xdp_router_ipv4.bpf.c @@ -0,0 +1,189 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie */ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(struct trie_value)); + __uint(max_entries, 50); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_map SEC(".maps"); + +/* Map for ARP table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, __be64); + __uint(max_entries, 50); +} arp_table SEC(".maps"); + +/* Map to keep the exact match entries in the route table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct direct_map); + __uint(max_entries, 50); +} exact_match SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 100); +} tx_port SEC(".maps"); + +SEC("xdp") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off = sizeof(*eth); + struct datarec *rec; + __be16 h_proto; + u32 key = 0; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (rec) + NO_TEAR_INC(rec->processed); + + if (data + nh_off > data_end) + goto drop; + + h_proto = eth->h_proto; + if (h_proto == bpf_htons(ETH_P_8021Q) || + h_proto == bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + goto drop; + + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + switch (bpf_ntohs(h_proto)) { + case ETH_P_ARP: + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + case ETH_P_IP: { + struct iphdr *iph = data + nh_off; + struct direct_map *direct_entry; + __be64 *dest_mac, *src_mac; + int forward_to; + + if (iph + 1 > data_end) + goto drop; + + direct_entry = bpf_map_lookup_elem(&exact_match, &iph->daddr); + + /* Check for exact match, this would give a faster lookup */ + if (direct_entry && direct_entry->mac && + direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + struct trie_value *prefix_value; + union key_4 key4; + + /* Look up in the trie for lpm */ + key4.b32[0] = 32; + key4.b8[4] = iph->daddr & 0xff; + key4.b8[5] = (iph->daddr >> 8) & 0xff; + key4.b8[6] = (iph->daddr >> 16) & 0xff; + key4.b8[7] = (iph->daddr >> 24) & 0xff; + + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + goto drop; + + forward_to = prefix_value->ifindex; + src_mac = &prefix_value->value; + if (!src_mac) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, &iph->daddr); + if (!dest_mac) { + if (!prefix_value->gw) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, + &prefix_value->gw); + if (!dest_mac) { + /* Forward the packet to the kernel in + * order to trigger ARP discovery for + * the default gw. + */ + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + } + } + } + + if (src_mac && dest_mac) { + int ret; + + __builtin_memcpy(eth->h_dest, dest_mac, ETH_ALEN); + __builtin_memcpy(eth->h_source, src_mac, ETH_ALEN); + + ret = bpf_redirect_map(&tx_port, forward_to, 0); + if (ret == XDP_REDIRECT) { + if (rec) + NO_TEAR_INC(rec->xdp_redirect); + return ret; + } + } + } + default: + break; + } +drop: + if (rec) + NO_TEAR_INC(rec->xdp_drop); + + return XDP_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c deleted file mode 100644 index b37ca2b13063..000000000000 --- a/samples/bpf/xdp_router_ipv4_kern.c +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (C) 2017 Cavium, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <bpf/bpf_helpers.h> -#include <linux/slab.h> -#include <net/ip_fib.h> - -struct trie_value { - __u8 prefix[4]; - __be64 value; - int ifindex; - int metric; - __be32 gw; -}; - -/* Key for lpm_trie*/ -union key_4 { - u32 b32[2]; - u8 b8[8]; -}; - -struct arp_entry { - __be64 mac; - __be32 dst; -}; - -struct direct_map { - struct arp_entry arp; - int ifindex; - __be64 mac; -}; - -/* Map for trie implementation*/ -struct { - __uint(type, BPF_MAP_TYPE_LPM_TRIE); - __uint(key_size, 8); - __uint(value_size, sizeof(struct trie_value)); - __uint(max_entries, 50); - __uint(map_flags, BPF_F_NO_PREALLOC); -} lpm_map SEC(".maps"); - -/* Map for counter*/ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 256); -} rxcnt SEC(".maps"); - -/* Map for ARP table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, __be64); - __uint(max_entries, 50); -} arp_table SEC(".maps"); - -/* Map to keep the exact match entries in the route table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, struct direct_map); - __uint(max_entries, 50); -} exact_match SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 100); -} tx_port SEC(".maps"); - -/* Function to set source and destination mac of the packet */ -static inline void set_src_dst_mac(void *data, void *src, void *dst) -{ - unsigned short *source = src; - unsigned short *dest = dst; - unsigned short *p = data; - - __builtin_memcpy(p, dest, 6); - __builtin_memcpy(p + 3, source, 6); -} - -/* Parse IPV4 packet to get SRC, DST IP and protocol */ -static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, - __be32 *src, __be32 *dest) -{ - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - *src = iph->saddr; - *dest = iph->daddr; - return iph->protocol; -} - -SEC("xdp_router_ipv4") -int xdp_router_ipv4_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - __be64 *dest_mac = NULL, *src_mac = NULL; - void *data = (void *)(long)ctx->data; - struct trie_value *prefix_value; - int rc = XDP_DROP, forward_to; - struct ethhdr *eth = data; - union key_4 key4; - long *value; - u16 h_proto; - u32 ipproto; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - h_proto = eth->h_proto; - - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - if (h_proto == htons(ETH_P_ARP)) { - return XDP_PASS; - } else if (h_proto == htons(ETH_P_IP)) { - struct direct_map *direct_entry; - __be32 src_ip = 0, dest_ip = 0; - - ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); - direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); - /* Check for exact match, this would give a faster lookup*/ - if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { - src_mac = &direct_entry->mac; - dest_mac = &direct_entry->arp.mac; - forward_to = direct_entry->ifindex; - } else { - /* Look up in the trie for lpm*/ - key4.b32[0] = 32; - key4.b8[4] = dest_ip & 0xff; - key4.b8[5] = (dest_ip >> 8) & 0xff; - key4.b8[6] = (dest_ip >> 16) & 0xff; - key4.b8[7] = (dest_ip >> 24) & 0xff; - prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); - if (!prefix_value) - return XDP_DROP; - src_mac = &prefix_value->value; - if (!src_mac) - return XDP_DROP; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - if (!dest_mac) { - if (!prefix_value->gw) - return XDP_DROP; - dest_ip = prefix_value->gw; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - } - forward_to = prefix_value->ifindex; - } - } else { - ipproto = 0; - } - if (src_mac && dest_mac) { - set_src_dst_mac(data, src_mac, dest_mac); - value = bpf_map_lookup_elem(&rxcnt, &ipproto); - if (value) - *value += 1; - return bpf_redirect_map(&tx_port, forward_to, 0); - } - return rc; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index c2da1b51ff95..683913bbf279 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -22,72 +22,41 @@ #include <sys/syscall.h> #include "bpf_util.h" #include <bpf/libbpf.h> -#include <sys/resource.h> #include <libgen.h> +#include <getopt.h> +#include <pthread.h> +#include "xdp_sample_user.h" +#include "xdp_router_ipv4.skel.h" -int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int total_ifindex; -static int *ifindex_list; -static __u32 *prog_id_list; -char buf[8192]; +static const char *__doc__ = +"XDP IPv4 router implementation\n" +"Usage: xdp_router_ipv4 <IFNAME-0> ... <IFNAME-N>\n"; + +static char buf[8192]; static int lpm_map_fd; -static int rxcnt_map_fd; static int arp_table_map_fd; static int exact_match_map_fd; static int tx_port_map_fd; -static int get_route_table(int rtm_family); -static void int_exit(int sig) -{ - __u32 prog_id = 0; - int i = 0; +static bool routes_thread_exit; +static int interval = 5; - for (i = 0; i < total_ifindex; i++) { - if (bpf_get_link_xdp_id(ifindex_list[i], &prog_id, flags)) { - printf("bpf_get_link_xdp_id on iface %d failed\n", - ifindex_list[i]); - exit(1); - } - if (prog_id_list[i] == prog_id) - bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); - else if (!prog_id) - printf("couldn't find a prog id on iface %d\n", - ifindex_list[i]); - else - printf("program on iface %d changed, not removing\n", - ifindex_list[i]); - prog_id = 0; - } - exit(0); -} +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT; -static void close_and_exit(int sig) -{ - close(sock); - close(sock_arp); +DEFINE_SAMPLE_INIT(xdp_router_ipv4); - int_exit(0); -} +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + {} +}; -/* Get the mac address of the interface given interface name */ -static __be64 getmac(char *iface) -{ - struct ifreq ifr; - __be64 mac = 0; - int fd, i; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - ifr.ifr_addr.sa_family = AF_INET; - strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); - if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { - printf("ioctl failed leaving....\n"); - return -1; - } - for (i = 0; i < 6 ; i++) - *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; - close(fd); - return mac; -} +static int get_route_table(int rtm_family); static int recv_msg(struct sockaddr_nl sock_addr, int sock) { @@ -130,7 +99,6 @@ static void read_route(struct nlmsghdr *nh, int nll) int i; struct route_table { int dst_len, iface, metric; - char *iface_name; __be32 dst, gw; __be64 mac; } route; @@ -145,17 +113,7 @@ static void read_route(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_DELROUTE) - printf("DELETING Route entry\n"); - else if (nh->nlmsg_type == RTM_GETROUTE) - printf("READING Route entry\n"); - else if (nh->nlmsg_type == RTM_NEWROUTE) - printf("NEW Route entry\n"); - else - printf("%d\n", nh->nlmsg_type); - memset(&route, 0, sizeof(route)); - printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { rt_msg = (struct rtmsg *)NLMSG_DATA(nh); rtm_family = rt_msg->rtm_family; @@ -192,11 +150,7 @@ static void read_route(struct nlmsghdr *nh, int nll) route.gw = atoi(gws); route.iface = atoi(ifs); route.metric = atoi(metrics); - route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); - route.iface_name = if_indextoname(route.iface, route.iface_name); - route.mac = getmac(route.iface_name); - if (route.mac == -1) - int_exit(0); + assert(get_mac_addr(route.iface, &route.mac) == 0); assert(bpf_map_update_elem(tx_port_map_fd, &route.iface, &route.iface, 0) == 0); if (rtm_family == AF_INET) { @@ -234,14 +188,6 @@ static void read_route(struct nlmsghdr *nh, int nll) for (i = 0; i < 4; i++) prefix_key->data[i] = (route.dst >> i * 8) & 0xff; - printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n", - (int)prefix_key->data[0], - (int)prefix_key->data[1], - (int)prefix_key->data[2], - (int)prefix_key->data[3], - route.gw, route.dst_len, - route.metric, - route.iface_name); if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, prefix_value) < 0) { for (i = 0; i < 4; i++) @@ -257,20 +203,13 @@ static void read_route(struct nlmsghdr *nh, int nll) ) == 0); } else { if (nh->nlmsg_type == RTM_DELROUTE) { - printf("deleting entry\n"); - printf("prefix key=%d.%d.%d.%d/%d", - prefix_key->data[0], - prefix_key->data[1], - prefix_key->data[2], - prefix_key->data[3], - prefix_key->prefixlen); assert(bpf_map_delete_elem(lpm_map_fd, prefix_key ) == 0); /* Rereading the route table to check if * there is an entry with the same * prefix but a different metric as the - * deleted enty. + * deleted entry. */ get_route_table(AF_INET); } else if (prefix_key->data[0] == @@ -327,14 +266,14 @@ static int get_route_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -353,15 +292,15 @@ static int get_route_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -391,9 +330,6 @@ static void read_arp(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_GETNEIGH) - printf("READING arp entry\n"); - printf("Address\tHwAddress\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { rt_msg = (struct ndmsg *)NLMSG_DATA(nh); rt_attr = (struct rtattr *)RTM_RTA(rt_msg); @@ -415,7 +351,7 @@ static void read_arp(struct nlmsghdr *nh, int nll) } arp_entry.dst = atoi(dsts); arp_entry.mac = atol(mac); - printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac); + if (ndm_family == AF_INET) { if (bpf_map_lookup_elem(exact_match_map_fd, &arp_entry.dst, @@ -466,14 +402,14 @@ static int get_arp_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -491,15 +427,15 @@ static int get_arp_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -512,24 +448,17 @@ cleanup: /* Function to keep track and update changes in route and arp table * Give regular statistics of packets forwarded */ -static int monitor_route(void) +static void *monitor_routes_thread(void *arg) { - unsigned int nr_cpus = bpf_num_possible_cpus(); - const unsigned int nr_keys = 256; struct pollfd fds_route, fds_arp; - __u64 prev[nr_keys][nr_cpus]; struct sockaddr_nl la, lr; - __u64 values[nr_cpus]; + int sock, sock_arp, nll; struct nlmsghdr *nh; - int nll, ret = 0; - int interval = 5; - __u32 key; - int i; sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return NULL; } fcntl(sock, F_SETFL, O_NONBLOCK); @@ -537,17 +466,19 @@ static int monitor_route(void) lr.nl_family = AF_NETLINK; lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; - goto cleanup; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; } + fds_route.fd = sock; fds_route.events = POLL_IN; sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock_arp < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; } fcntl(sock_arp, F_SETFL, O_NONBLOCK); @@ -555,51 +486,44 @@ static int monitor_route(void) la.nl_family = AF_NETLINK; la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); goto cleanup; } + fds_arp.fd = sock_arp; fds_arp.events = POLL_IN; - memset(prev, 0, sizeof(prev)); - do { - signal(SIGINT, close_and_exit); - signal(SIGTERM, close_and_exit); + /* dump route and arp tables */ + if (get_arp_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading arp table\n"); + goto cleanup; + } - sleep(interval); - for (key = 0; key < nr_keys; key++) { - __u64 sum = 0; - - assert(bpf_map_lookup_elem(rxcnt_map_fd, - &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[key][i]); - if (sum) - printf("proto %u: %10llu pkt/s\n", - key, sum / interval); - memcpy(prev[key], values, sizeof(values)); - } + if (get_route_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading route table\n"); + goto cleanup; + } + while (!routes_thread_exit) { memset(buf, 0, sizeof(buf)); if (poll(&fds_route, 1, 3) == POLL_IN) { nll = recv_msg(lr, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); goto cleanup; } nh = (struct nlmsghdr *)buf; - printf("Routing table updated.\n"); read_route(nh, nll); } + memset(buf, 0, sizeof(buf)); if (poll(&fds_arp, 1, 3) == POLL_IN) { nll = recv_msg(la, sock_arp); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); goto cleanup; } @@ -607,135 +531,169 @@ static int monitor_route(void) read_arp(nh, nll); } - } while (1); + sleep(interval); + } + cleanup: + close(sock_arp); close(sock); - return ret; + return NULL; } -static void usage(const char *prog) +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, + struct bpf_object *obj) { - fprintf(stderr, - "%s: %s [OPTS] interface name list\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -F force loading prog\n", - __func__, prog); + sample_usage(argv, long_options, doc, mask, error); } -int main(int ac, char **argv) +int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "SF"; - struct bpf_object *obj; - char filename[256]; - char **ifname_list; - int prog_fd, opt; - int err, i = 1; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - total_ifindex = ac - 1; - ifname_list = (argv + 1); - - while ((opt = getopt(ac, argv, optstr)) != -1) { + bool error = true, generic = false, force = false; + int opt, ret = EXIT_FAIL_BPF; + struct xdp_router_ipv4 *skel; + int i, total_ifindex = argc - 1; + char **ifname_list = argv + 1; + pthread_t routes_thread; + int longindex = 0; + + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + goto end; + } + + skel = xdp_router_ipv4__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n", + strerror(errno)); + goto end; + } + + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", + strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = xdp_router_ipv4__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n", + strerror(errno)); + goto end_destroy; + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + while ((opt = getopt_long(argc, argv, "si:SFvh", + long_options, &longindex)) != -1) { switch (opt) { + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + total_ifindex--; + ifname_list++; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + total_ifindex -= 2; + ifname_list += 2; + break; case 'S': - flags |= XDP_FLAGS_SKB_MODE; + generic = true; total_ifindex--; ifname_list++; break; case 'F': - flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; total_ifindex--; ifname_list++; break; + case 'v': + sample_switch_mode(); + total_ifindex--; + ifname_list++; + break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + usage(argv, long_options, __doc__, mask, error, skel->obj); + goto end_destroy; } } - if (!(flags & XDP_FLAGS_SKB_MODE)) - flags |= XDP_FLAGS_DRV_MODE; - - if (optind == ac) { - usage(basename(argv[0])); - return 1; + ret = EXIT_FAIL_OPTION; + if (optind == argc) { + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_destroy; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + lpm_map_fd = bpf_map__fd(skel->maps.lpm_map); + if (lpm_map_fd < 0) { + fprintf(stderr, "Failed loading lpm_map %s\n", + strerror(-lpm_map_fd)); + goto end_destroy; } - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return 1; - - printf("\n**************loading bpf file*********************\n\n\n"); - if (!prog_fd) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); - return 1; + arp_table_map_fd = bpf_map__fd(skel->maps.arp_table); + if (arp_table_map_fd < 0) { + fprintf(stderr, "Failed loading arp_table_map_fd %s\n", + strerror(-arp_table_map_fd)); + goto end_destroy; } - - lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table"); - exact_match_map_fd = bpf_object__find_map_fd_by_name(obj, - "exact_match"); - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); - if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 || - exact_match_map_fd < 0 || tx_port_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + exact_match_map_fd = bpf_map__fd(skel->maps.exact_match); + if (exact_match_map_fd < 0) { + fprintf(stderr, "Failed loading exact_match_map_fd %s\n", + strerror(-exact_match_map_fd)); + goto end_destroy; } - - ifindex_list = (int *)calloc(total_ifindex, sizeof(int *)); - for (i = 0; i < total_ifindex; i++) { - ifindex_list[i] = if_nametoindex(ifname_list[i]); - if (!ifindex_list[i]) { - printf("Couldn't translate interface name: %s", - strerror(errno)); - return 1; - } + tx_port_map_fd = bpf_map__fd(skel->maps.tx_port); + if (tx_port_map_fd < 0) { + fprintf(stderr, "Failed loading tx_port_map_fd %s\n", + strerror(-tx_port_map_fd)); + goto end_destroy; } - prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32 *)); - for (i = 0; i < total_ifindex; i++) { - if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) { - printf("link set xdp fd failed\n"); - int recovery_index = i; - for (i = 0; i < recovery_index; i++) - bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + ret = EXIT_FAIL_XDP; + for (i = 0; i < total_ifindex; i++) { + int index = if_nametoindex(ifname_list[i]); - return 1; + if (!index) { + fprintf(stderr, "Interface %s not found %s\n", + ifname_list[i], strerror(-tx_port_map_fd)); + goto end_destroy; } - err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (err) { - printf("can't get prog info - %s\n", strerror(errno)); - return err; - } - prog_id_list[i] = info.id; - memset(&info, 0, sizeof(info)); - printf("Attached to %d\n", ifindex_list[i]); + if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog, + index, generic, force) < 0) + goto end_destroy; } - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - printf("*******************ROUTE TABLE*************************\n\n\n"); - get_route_table(AF_INET); - printf("*******************ARP TABLE***************************\n\n\n"); - get_arp_table(AF_INET); - if (monitor_route() < 0) { - printf("Error in receiving route update"); - return 1; + ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL); + if (ret) { + fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - return 0; + ret = sample_run(interval, NULL, NULL); + routes_thread_exit = true; + + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_thread_wait; + } + ret = EXIT_OK; + +end_thread_wait: + pthread_join(routes_thread, NULL); +end_destroy: + xdp_router_ipv4__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 4fe47502ebed..08f5331d2b00 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -14,11 +14,10 @@ static const char *__doc__ = " XDP RX-queue info extract example\n\n" #include <string.h> #include <unistd.h> #include <locale.h> -#include <sys/resource.h> #include <getopt.h> #include <net/if.h> #include <time.h> - +#include <limits.h> #include <arpa/inet.h> #include <linux/if_link.h> @@ -44,6 +43,9 @@ static struct bpf_map *rx_queue_index_map; #define EXIT_FAIL_BPF 4 #define EXIT_FAIL_MEM 5 +#define FAIL_MEM_SIG INT_MAX +#define FAIL_STAT_SIG (INT_MAX - 1) + static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"dev", required_argument, NULL, 'd' }, @@ -62,21 +64,27 @@ static void int_exit(int sig) __u32 curr_prog_id = 0; if (ifindex > -1) { - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); exit(EXIT_FAIL); } if (prog_id == curr_prog_id) { fprintf(stderr, "Interrupted: Removing XDP program on ifindex:%d device:%s\n", ifindex, ifname); - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); } else if (!curr_prog_id) { printf("couldn't find a prog id on a given iface\n"); } else { printf("program on interface changed, not removing\n"); } } + + if (sig == FAIL_MEM_SIG) + exit(EXIT_FAIL_MEM); + else if (sig == FAIL_STAT_SIG) + exit(EXIT_FAIL); + exit(EXIT_OK); } @@ -141,7 +149,8 @@ static char* options2str(enum cfg_options_flags flag) if (flag & READ_MEM) return "read"; fprintf(stderr, "ERR: Unknown config option flags"); - exit(EXIT_FAIL); + int_exit(FAIL_STAT_SIG); + return "unknown"; } static void usage(char *argv[]) @@ -174,7 +183,7 @@ static __u64 gettime(void) res = clock_gettime(CLOCK_MONOTONIC, &t); if (res < 0) { fprintf(stderr, "Error with gettimeofday! (%i)\n", res); - exit(EXIT_FAIL); + int_exit(FAIL_STAT_SIG); } return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; } @@ -198,45 +207,38 @@ static struct datarec *alloc_record_per_cpu(void) { unsigned int nr_cpus = bpf_num_possible_cpus(); struct datarec *array; - size_t size; - size = sizeof(struct datarec) * nr_cpus; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_cpus, sizeof(struct datarec)); if (!array) { fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); - exit(EXIT_FAIL_MEM); + int_exit(FAIL_MEM_SIG); } return array; } static struct record *alloc_record_per_rxq(void) { - unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map); struct record *array; - size_t size; - size = sizeof(struct record) * nr_rxqs; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_rxqs, sizeof(struct record)); if (!array) { fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); - exit(EXIT_FAIL_MEM); + int_exit(FAIL_MEM_SIG); } return array; } static struct stats_record *alloc_stats_record(void) { - unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map); struct stats_record *rec; int i; - rec = malloc(sizeof(*rec)); - memset(rec, 0, sizeof(*rec)); + rec = calloc(1, sizeof(struct stats_record)); if (!rec) { fprintf(stderr, "Mem alloc error\n"); - exit(EXIT_FAIL_MEM); + int_exit(FAIL_MEM_SIG); } rec->rxq = alloc_record_per_rxq(); for (i = 0; i < nr_rxqs; i++) @@ -248,7 +250,7 @@ static struct stats_record *alloc_stats_record(void) static void free_stats_record(struct stats_record *r) { - unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map); int i; for (i = 0; i < nr_rxqs; i++) @@ -296,7 +298,7 @@ static void stats_collect(struct stats_record *rec) map_collect_percpu(fd, 0, &rec->stats); fd = bpf_map__fd(rx_queue_index_map); - max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + max_rxqs = bpf_map__max_entries(rx_queue_index_map); for (i = 0; i < max_rxqs; i++) map_collect_percpu(fd, i, &rec->rxq[i]); } @@ -342,7 +344,7 @@ static void stats_print(struct stats_record *stats_rec, struct stats_record *stats_prev, int action, __u32 cfg_opt) { - unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + unsigned int nr_rxqs = bpf_map__max_entries(rx_queue_index_map); unsigned int nr_cpus = bpf_num_possible_cpus(); double pps = 0, err = 0; struct record *rec, *prev; @@ -457,15 +459,12 @@ static void stats_poll(int interval, int action, __u32 cfg_opt) int main(int argc, char **argv) { __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); int prog_fd, map_fd, opt, err; bool use_separators = true; struct config cfg = { 0 }; + struct bpf_program *prog; struct bpf_object *obj; struct bpf_map *map; char filename[256]; @@ -479,15 +478,18 @@ int main(int argc, char **argv) char *action_str = NULL; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return EXIT_FAIL; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + err = bpf_object__load(obj); + if (err) return EXIT_FAIL; + prog_fd = bpf_program__fd(prog); map = bpf_object__find_map_by_name(obj, "config_map"); stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map"); @@ -595,7 +597,7 @@ int main(int argc, char **argv) signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { fprintf(stderr, "link set xdp fd failed\n"); return EXIT_FAIL_XDP; } diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c new file mode 100644 index 000000000000..0eb7e1dcae22 --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 +/* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ +#include "xdp_sample.bpf.h" + +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +array_map rx_cnt SEC(".maps"); +array_map redir_err_cnt SEC(".maps"); +array_map cpumap_enqueue_cnt SEC(".maps"); +array_map cpumap_kthread_cnt SEC(".maps"); +array_map exception_cnt SEC(".maps"); +array_map devmap_xmit_cnt SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 32 * 32); + __type(key, u64); + __type(value, struct datarec); +} devmap_xmit_cnt_multi SEC(".maps"); + +const volatile int nr_cpus = 0; + +/* These can be set before loading so that redundant comparisons can be DCE'd by + * the verifier, and only actual matches are tried after loading tp_btf program. + * This allows sample to filter tracepoint stats based on net_device. + */ +const volatile int from_match[32] = {}; +const volatile int to_match[32] = {}; + +int cpumap_map_id = 0; + +/* Find if b is part of set a, but if a is empty set then evaluate to true */ +#define IN_SET(a, b) \ + ({ \ + bool __res = !(a)[0]; \ + for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \ + __res = (a)[i] == (b); \ + if (__res) \ + break; \ + } \ + __res; \ + }) + +static __always_inline __u32 xdp_get_err_key(int err) +{ + switch (err) { + case 0: + return 0; + case -EINVAL: + return 2; + case -ENETDOWN: + return 3; + case -EMSGSIZE: + return 4; + case -EOPNOTSUPP: + return 5; + case -ENOSPC: + return 6; + default: + return 1; + } +} + +static __always_inline int xdp_redirect_collect_stat(int from, int err) +{ + u32 cpu = bpf_get_smp_processor_id(); + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + u32 idx; + + if (!IN_SET(from_match, from)) + return 0; + + key = xdp_get_err_key(err); + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&redir_err_cnt, &idx); + if (!rec) + return 0; + if (key) + NO_TEAR_INC(rec->dropped); + else + NO_TEAR_INC(rec->processed); + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tp_btf/xdp_redirect_err") +int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map_err") +int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect") +int BPF_PROG(tp_xdp_redirect, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map") +int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_cpumap_enqueue") +int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed, + unsigned int drops, int to_cpu) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 idx; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + idx = to_cpu * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + if (processed > 0) + NO_TEAR_INC(rec->issue); + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. + */ + return 0; +} + +SEC("tp_btf/xdp_cpumap_kthread") +int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed, + unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats) +{ + struct datarec *rec; + u32 cpu; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass); + NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop); + NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect); + /* Count times kthread yielded CPU via schedule call */ + if (sched) + NO_TEAR_INC(rec->issue); + return 0; +} + +SEC("tp_btf/xdp_exception") +int BPF_PROG(tp_xdp_exception, const struct net_device *dev, + const struct bpf_prog *xdp, u32 act) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 key = act, idx; + + if (!IN_SET(from_match, dev->ifindex)) + return 0; + if (!IN_SET(to_match, dev->ifindex)) + return 0; + + if (key > XDP_REDIRECT) + key = XDP_REDIRECT + 1; + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&exception_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_INC(rec->dropped); + + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) +{ + struct datarec *rec; + int idx_in, idx_out; + u32 cpu; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + NO_TEAR_INC(rec->info); + /* Record error cases, where no frame were sent */ + /* Catch API error of drv ndo_xdp_xmit sent more than count */ + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) +{ + struct datarec empty = {}; + struct datarec *rec; + int idx_in, idx_out; + u64 idx; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + idx = idx_in; + idx = idx << 32 | idx_out; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx); + if (!rec) + return 0; + + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_INC(rec->info); + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} diff --git a/samples/bpf/xdp_sample.bpf.h b/samples/bpf/xdp_sample.bpf.h new file mode 100644 index 000000000000..25b1dbe9b37b --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.h @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _XDP_SAMPLE_BPF_H +#define _XDP_SAMPLE_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_802_3_MIN 0x0600 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD +#define ETH_P_ARP 0x0806 +#define IPPROTO_ICMPV6 58 + +#define EINVAL 22 +#define ENETDOWN 100 +#define EMSGSIZE 90 +#define EOPNOTSUPP 95 +#define ENOSPC 28 + +typedef struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, unsigned int); + __type(value, struct datarec); +} array_map; + +extern array_map rx_cnt; +extern const volatile int nr_cpus; + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define bpf_ntohs(x) __builtin_bswap16(x) +#define bpf_htons(x) __builtin_bswap16(x) +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define bpf_ntohs(x) (x) +#define bpf_htons(x) (x) +#else +# error "Endianness detection needs to be set up for your compiler?!" +#endif + +/* + * Note: including linux/compiler.h or linux/kernel.h for the macros below + * conflicts with vmlinux.h include in BPF files, so we define them here. + * + * Following functions are taken from kernel sources and + * break aliasing rules in their original form. + * + * While kernel is compiled with -fno-strict-aliasing, + * perf uses -Wstrict-aliasing=3 which makes build fail + * under gcc 4.4. + * + * Using extra __may_alias__ type to allow aliasing + * in this case. + */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)res, (const void *)p, size); + asm volatile ("" : : : "memory"); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)p, (const void *)res, size); + asm volatile ("" : : : "memory"); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* Add a value using relaxed read and relaxed write. Less expensive than + * fetch_add when there is no write concurrency. + */ +#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val)) +#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1) + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#endif diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index 33377289e2a8..9cf76b340dd7 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -5,14 +5,12 @@ #include <bpf/bpf_helpers.h> #define SAMPLE_SIZE 64ul -#define MAX_CPUS 128 - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = MAX_CPUS, -}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); +} my_map SEC(".maps"); SEC("xdp_sample") int xdp_sample_prog(struct xdp_md *ctx) diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 991ef6f0880b..7df7163239ac 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -12,13 +12,11 @@ #include <signal.h> #include <bpf/libbpf.h> #include <bpf/bpf.h> -#include <sys/resource.h> #include <libgen.h> #include <linux/if_link.h> #include "perf-sys.h" -#define MAX_CPUS 128 static int if_idx; static char *if_name; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -31,7 +29,7 @@ static int do_attach(int idx, int fd, const char *name) __u32 info_len = sizeof(info); int err; - err = bpf_set_link_xdp_fd(idx, fd, xdp_flags); + err = bpf_xdp_attach(idx, fd, xdp_flags, NULL); if (err < 0) { printf("ERROR: failed to attach program to %s\n", name); return err; @@ -52,13 +50,13 @@ static int do_detach(int idx, const char *name) __u32 curr_prog_id = 0; int err = 0; - err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags); + err = bpf_xdp_query_id(idx, xdp_flags, &curr_prog_id); if (err) { - printf("bpf_get_link_xdp_id failed\n"); + printf("bpf_xdp_query_id failed\n"); return err; } if (prog_id == curr_prog_id) { - err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); + err = bpf_xdp_detach(idx, xdp_flags, NULL); if (err < 0) printf("ERROR: failed to detach prog from %s\n", name); } else if (!curr_prog_id) { @@ -104,19 +102,16 @@ static void usage(const char *prog) fprintf(stderr, "%s: %s [OPTS] <ifname|ifindex>\n\n" "OPTS:\n" - " -F force loading prog\n", + " -F force loading prog\n" + " -S use skb-mode\n", __func__, prog); } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct perf_buffer_opts pb_opts = {}; const char *optstr = "FS"; int prog_fd, map_fd, opt; + struct bpf_program *prog; struct bpf_object *obj; struct bpf_map *map; char filename[256]; @@ -144,23 +139,22 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; - if (!prog_fd) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) return 1; - } - map = bpf_map__next(NULL, obj); + prog_fd = bpf_program__fd(prog); + + map = bpf_object__next_map(obj, NULL); if (!map) { printf("finding a map in obj file failed\n"); return 1; @@ -187,8 +181,7 @@ int main(int argc, char **argv) return 1; } - pb_opts.sample_cb = print_bpf_output; - pb = perf_buffer__new(map_fd, 8, &pb_opts); + pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); err = libbpf_get_error(pb); if (err) { perror("perf_buffer setup failed"); diff --git a/samples/bpf/xdp_sample_shared.h b/samples/bpf/xdp_sample_shared.h new file mode 100644 index 000000000000..8a7669a5d563 --- /dev/null +++ b/samples/bpf/xdp_sample_shared.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _XDP_SAMPLE_SHARED_H +#define _XDP_SAMPLE_SHARED_H + +struct datarec { + size_t processed; + size_t dropped; + size_t issue; + union { + size_t xdp_pass; + size_t info; + }; + size_t xdp_drop; + size_t xdp_redirect; +} __attribute__((aligned(64))); + +#endif diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c new file mode 100644 index 000000000000..158682852162 --- /dev/null +++ b/samples/bpf/xdp_sample_user.c @@ -0,0 +1,1673 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/ethtool.h> +#include <linux/hashtable.h> +#include <linux/if_link.h> +#include <linux/jhash.h> +#include <linux/limits.h> +#include <linux/list.h> +#include <linux/sockios.h> +#include <locale.h> +#include <math.h> +#include <net/if.h> +#include <poll.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/signalfd.h> +#include <sys/sysinfo.h> +#include <sys/timerfd.h> +#include <sys/utsname.h> +#include <time.h> +#include <unistd.h> + +#include "bpf_util.h" +#include "xdp_sample_user.h" + +#define __sample_print(fmt, cond, ...) \ + ({ \ + if (cond) \ + printf(fmt, ##__VA_ARGS__); \ + }) + +#define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__) +#define print_default(fmt, ...) \ + __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__) +#define __print_err(err, fmt, ...) \ + ({ \ + __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT, \ + ##__VA_ARGS__); \ + sample_err_exp = sample_err_exp ? true : err > 0; \ + }) +#define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__) + +#define __COLUMN(x) "%'10" x " %-13s" +#define FMT_COLUMNf __COLUMN(".0f") +#define FMT_COLUMNd __COLUMN("d") +#define FMT_COLUMNl __COLUMN("llu") +#define RX(rx) rx, "rx/s" +#define PPS(pps) pps, "pkt/s" +#define DROP(drop) drop, "drop/s" +#define ERR(err) err, "error/s" +#define HITS(hits) hits, "hit/s" +#define XMIT(xmit) xmit, "xmit/s" +#define PASS(pass) pass, "pass/s" +#define REDIR(redir) redir, "redir/s" +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ + +#define XDP_UNKNOWN (XDP_REDIRECT + 1) +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +#define XDP_REDIRECT_ERR_MAX 7 + +enum map_type { + MAP_RX, + MAP_REDIRECT_ERR, + MAP_CPUMAP_ENQUEUE, + MAP_CPUMAP_KTHREAD, + MAP_EXCEPTION, + MAP_DEVMAP_XMIT, + MAP_DEVMAP_XMIT_MULTI, + NUM_MAP, +}; + +enum log_level { + LL_DEFAULT = 1U << 0, + LL_SIMPLE = 1U << 1, + LL_DEBUG = 1U << 2, +}; + +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; + +struct map_entry { + struct hlist_node node; + __u64 pair; + struct record val; +}; + +struct stats_record { + struct record rx_cnt; + struct record redir_err[XDP_REDIRECT_ERR_MAX]; + struct record kthread; + struct record exception[XDP_ACTION_MAX]; + struct record devmap_xmit; + DECLARE_HASHTABLE(xmit_map, 5); + struct record enq[]; +}; + +struct sample_output { + struct { + __u64 rx; + __u64 redir; + __u64 drop; + __u64 drop_xmit; + __u64 err; + __u64 xmit; + } totals; + struct { + union { + __u64 pps; + __u64 num; + }; + __u64 drop; + __u64 err; + } rx_cnt; + struct { + __u64 suc; + __u64 err; + } redir_cnt; + struct { + __u64 hits; + } except_cnt; + struct { + __u64 pps; + __u64 drop; + __u64 err; + double bavg; + } xmit_cnt; +}; + +struct xdp_desc { + int ifindex; + __u32 prog_id; + int flags; +} sample_xdp_progs[32]; + +struct datarec *sample_mmap[NUM_MAP]; +struct bpf_map *sample_map[NUM_MAP]; +size_t sample_map_count[NUM_MAP]; +enum log_level sample_log_level; +struct sample_output sample_out; +unsigned long sample_interval; +bool sample_err_exp; +int sample_xdp_cnt; +int sample_n_cpus; +int sample_sig_fd; +int sample_mask; + +static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = { + /* Key=1 keeps unknown errors */ + "Success", + "Unknown", + "EINVAL", + "ENETDOWN", + "EMSGSIZE", + "EOPNOTSUPP", + "ENOSPC", +}; + +/* Keyed from Unknown */ +static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = { + "Unknown error", + "Invalid redirection", + "Device being redirected to is down", + "Packet length too large for device", + "Operation not supported", + "No space in ptr_ring of cpumap kthread", +}; + +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; + +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + return UINT64_MAX; + } + return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static void sample_print_help(int mask) +{ + printf("Output format description\n\n" + "By default, redirect success statistics are disabled, use -s to enable.\n" + "The terse output mode is default, verbose mode can be activated using -v\n" + "Use SIGQUIT (Ctrl + \\) to switch the mode dynamically at runtime\n\n" + "Terse mode displays at most the following fields:\n" + " rx/s Number of packets received per second\n" + " redir/s Number of packets successfully redirected per second\n" + " err,drop/s Aggregated count of errors per second (including dropped packets)\n" + " xmit/s Number of packets transmitted on the output device per second\n\n" + "Output description for verbose mode:\n" + " FIELD DESCRIPTION\n"); + + if (mask & SAMPLE_RX_CNT) { + printf(" receive\t\tDisplays the number of packets received & errors encountered\n" + " \t\t\tWhenever an error or packet drop occurs, details of per CPU error\n" + " \t\t\tand drop statistics will be expanded inline in terse mode.\n" + " \t\t\t\tpkt/s - Packets received per second\n" + " \t\t\t\tdrop/s - Packets dropped per second\n" + " \t\t\t\terror/s - Errors encountered per second\n\n"); + } + if (mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + printf(" redirect\t\tDisplays the number of packets successfully redirected\n" + " \t\t\tErrors encountered are expanded under redirect_err field\n" + " \t\t\tNote that passing -s to enable it has a per packet overhead\n" + " \t\t\t\tredir/s - Packets redirected successfully per second\n\n" + " redirect_err\t\tDisplays the number of packets that failed redirection\n" + " \t\t\tThe errno is expanded under this field with per CPU count\n" + " \t\t\tThe recognized errors are:\n"); + + for (int i = 2; i < XDP_REDIRECT_ERR_MAX; i++) + printf("\t\t\t %s: %s\n", xdp_redirect_err_names[i], + xdp_redirect_err_help[i - 1]); + + printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n"); + } + + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + printf(" enqueue to cpu N\tDisplays the number of packets enqueued to bulk queue of CPU N\n" + " \t\t\tExpands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N\n" + " \t\t\tReceived packets can be associated with the CPU redirect program is enqueuing \n" + " \t\t\tpackets to.\n" + " \t\t\t\tpkt/s - Packets enqueued per second from other CPU to CPU N\n" + " \t\t\t\tdrop/s - Packets dropped when trying to enqueue to CPU N\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n"); + } + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + printf(" kthread\t\tDisplays the number of packets processed in CPUMAP kthread for each CPU\n" + " \t\t\tPackets consumed from ptr_ring in kthread, and its xdp_stats (after calling \n" + " \t\t\tCPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and\n" + " \t\t\tthen per-CPU to associate it to each CPU's pinned CPUMAP kthread.\n" + " \t\t\t\tpkt/s - Packets consumed per second from ptr_ring\n" + " \t\t\t\tdrop/s - Packets dropped per second in kthread\n" + " \t\t\t\tsched - Number of times kthread called schedule()\n\n" + " \t\t\txdp_stats (also expands to per-CPU counts)\n" + " \t\t\t\tpass/s - XDP_PASS count for CPUMAP program execution\n" + " \t\t\t\tdrop/s - XDP_DROP count for CPUMAP program execution\n" + " \t\t\t\tredir/s - XDP_REDIRECT count for CPUMAP program execution\n\n"); + } + + if (mask & SAMPLE_EXCEPTION_CNT) { + printf(" xdp_exception\t\tDisplays xdp_exception tracepoint events\n" + " \t\t\tThis can occur due to internal driver errors, unrecognized\n" + " \t\t\tXDP actions and due to explicit user trigger by use of XDP_ABORTED\n" + " \t\t\tEach action is expanded below this field with its count\n" + " \t\t\t\thit/s - Number of times the tracepoint was hit per second\n\n"); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + printf(" devmap_xmit\t\tDisplays devmap_xmit tracepoint events\n" + " \t\t\tThis tracepoint is invoked for successful transmissions on output\n" + " \t\t\tdevice but these statistics are not available for generic XDP mode,\n" + " \t\t\thence they will be omitted from the output when using SKB mode\n" + " \t\t\t\txmit/s - Number of packets that were transmitted per second\n" + " \t\t\t\tdrop/s - Number of packets that failed transmissions per second\n" + " \t\t\t\tdrv_err/s - Number of internal driver errors per second\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n"); + } +} + +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error) +{ + int i; + + if (!error) + sample_print_help(mask); + + printf("\n%s\nOption for %s:\n", doc, argv[0]); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-15s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value: %d)", + *long_options[i].flag); + else + printf("\t short-option: -%c", long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + struct datarec *array; + + array = calloc(nr_cpus, sizeof(*array)); + if (!array) { + fprintf(stderr, "Failed to allocate memory (nr_cpus: %u)\n", + nr_cpus); + return NULL; + } + return array; +} + +static int map_entry_init(struct map_entry *e, __u64 pair) +{ + e->pair = pair; + INIT_HLIST_NODE(&e->node); + e->val.timestamp = gettime(); + e->val.cpu = alloc_record_per_cpu(); + if (!e->val.cpu) + return -ENOMEM; + return 0; +} + +static void map_collect_percpu(struct datarec *values, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + __u64 sum_xdp_redirect = 0; + __u64 sum_processed = 0; + __u64 sum_xdp_pass = 0; + __u64 sum_xdp_drop = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = READ_ONCE(values[i].processed); + rec->cpu[i].dropped = READ_ONCE(values[i].dropped); + rec->cpu[i].issue = READ_ONCE(values[i].issue); + rec->cpu[i].xdp_pass = READ_ONCE(values[i].xdp_pass); + rec->cpu[i].xdp_drop = READ_ONCE(values[i].xdp_drop); + rec->cpu[i].xdp_redirect = READ_ONCE(values[i].xdp_redirect); + + sum_processed += rec->cpu[i].processed; + sum_dropped += rec->cpu[i].dropped; + sum_issue += rec->cpu[i].issue; + sum_xdp_pass += rec->cpu[i].xdp_pass; + sum_xdp_drop += rec->cpu[i].xdp_drop; + sum_xdp_redirect += rec->cpu[i].xdp_redirect; + } + + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + rec->total.xdp_pass = sum_xdp_pass; + rec->total.xdp_drop = sum_xdp_drop; + rec->total.xdp_redirect = sum_xdp_redirect; +} + +static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u32 batch, count = 32; + struct datarec *values; + bool init = false; + __u64 *keys; + int i, ret; + + keys = calloc(count, sizeof(__u64)); + if (!keys) + return -ENOMEM; + values = calloc(count * nr_cpus, sizeof(struct datarec)); + if (!values) { + free(keys); + return -ENOMEM; + } + + for (;;) { + bool exit = false; + + ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL, &batch, + keys, values, &count, NULL); + if (ret < 0 && errno != ENOENT) + break; + if (errno == ENOENT) + exit = true; + + init = true; + for (i = 0; i < count; i++) { + struct map_entry *e, *x = NULL; + __u64 pair = keys[i]; + struct datarec *arr; + + arr = &values[i * nr_cpus]; + hash_for_each_possible(rec->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (!x) { + x = calloc(1, sizeof(*x)); + if (!x) + goto cleanup; + if (map_entry_init(x, pair) < 0) { + free(x); + goto cleanup; + } + hash_add(rec->xmit_map, &x->node, pair); + } + map_collect_percpu(arr, &x->val); + } + + if (exit) + break; + count = 32; + } + + free(values); + free(keys); + return 0; +cleanup: + free(values); + free(keys); + return -ENOMEM; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = calloc(1, sizeof(*rec) + sample_n_cpus * sizeof(struct record)); + if (!rec) { + fprintf(stderr, "Failed to allocate memory\n"); + return NULL; + } + + if (sample_mask & SAMPLE_RX_CNT) { + rec->rx_cnt.cpu = alloc_record_per_cpu(); + if (!rec->rx_cnt.cpu) { + fprintf(stderr, + "Failed to allocate rx_cnt per-CPU array\n"); + goto end_rec; + } + } + if (sample_mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) { + rec->redir_err[i].cpu = alloc_record_per_cpu(); + if (!rec->redir_err[i].cpu) { + fprintf(stderr, + "Failed to allocate redir_err per-CPU array for " + "\"%s\" case\n", + xdp_redirect_err_names[i]); + while (i--) + free(rec->redir_err[i].cpu); + goto end_rx_cnt; + } + } + } + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + rec->kthread.cpu = alloc_record_per_cpu(); + if (!rec->kthread.cpu) { + fprintf(stderr, + "Failed to allocate kthread per-CPU array\n"); + goto end_redir; + } + } + if (sample_mask & SAMPLE_EXCEPTION_CNT) { + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->exception[i].cpu = alloc_record_per_cpu(); + if (!rec->exception[i].cpu) { + fprintf(stderr, + "Failed to allocate exception per-CPU array for " + "\"%s\" case\n", + action2str(i)); + while (i--) + free(rec->exception[i].cpu); + goto end_kthread; + } + } + } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) { + rec->devmap_xmit.cpu = alloc_record_per_cpu(); + if (!rec->devmap_xmit.cpu) { + fprintf(stderr, + "Failed to allocate devmap_xmit per-CPU array\n"); + goto end_exception; + } + } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + hash_init(rec->xmit_map); + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + for (i = 0; i < sample_n_cpus; i++) { + rec->enq[i].cpu = alloc_record_per_cpu(); + if (!rec->enq[i].cpu) { + fprintf(stderr, + "Failed to allocate enqueue per-CPU array for " + "CPU %d\n", + i); + while (i--) + free(rec->enq[i].cpu); + goto end_devmap_xmit; + } + } + } + + return rec; + +end_devmap_xmit: + free(rec->devmap_xmit.cpu); +end_exception: + for (i = 0; i < XDP_ACTION_MAX; i++) + free(rec->exception[i].cpu); +end_kthread: + free(rec->kthread.cpu); +end_redir: + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(rec->redir_err[i].cpu); +end_rx_cnt: + free(rec->rx_cnt.cpu); +end_rec: + free(rec); + return NULL; +} + +static void free_stats_record(struct stats_record *r) +{ + struct hlist_node *tmp; + struct map_entry *e; + int i; + + for (i = 0; i < sample_n_cpus; i++) + free(r->enq[i].cpu); + hash_for_each_safe(r->xmit_map, i, tmp, e, node) { + hash_del(&e->node); + free(e->val.cpu); + free(e); + } + free(r->devmap_xmit.cpu); + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->exception[i].cpu); + free(r->kthread.cpu); + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(r->redir_err[i].cpu); + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double)period / NANOSEC_PER_SEC); + + return period_; +} + +static double sample_round(double val) +{ + if (val - floor(val) < 0.5) + return floor(val); + return ceil(val); +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->dropped - p->dropped; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->info - p->info; + pps = sample_round(packets / period_); + } + return pps; +} + +static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass, + double *xdp_drop, double *xdp_redirect, double period_) +{ + *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; + if (period_ > 0) { + *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; + *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; + *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; + } +} + +static void stats_get_rx_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), ERR(err)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->rx_cnt.pps = pps; + out->rx_cnt.drop = drop; + out->rx_cnt.err = err; + out->totals.rx += pps; + out->totals.drop += drop; + out->totals.err += err; + } +} + +static void stats_get_cpumap_enqueue(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i, to_cpu; + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) { + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + if (pps > 0 || drop > 0) { + char str[64]; + + snprintf(str, sizeof(str), "enqueue to cpu %d", to_cpu); + + if (err > 0) + err = pps / err; /* calc average bulk size */ + + print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d->%d", i, to_cpu); + if (err > 0) + err = pps / err; /* calc average bulk size */ + print_default( + " %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + } +} + +static void stats_get_cpumap_remote(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + double xdp_pass, xdp_drop, xdp_redirect; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + + calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, + &xdp_redirect, t); + if (xdp_pass || xdp_drop || xdp_redirect) { + print_err(xdp_drop, + " %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + "xdp_stats", PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, &xdp_redirect, t); + if (!xdp_pass && !xdp_drop && !xdp_redirect) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } +} + +static void stats_get_cpumap_kthread(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + pps ? "kthread total" : "kthread", PPS(pps), DROP(drop), err, + "sched"); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), err, "sched"); + } +} + +static void stats_get_redirect_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps; + int i; + + rec = &stats_rec->redir_err[0]; + prev = &stats_prev->redir_err[0]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + if (!pps) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf "\n", str, REDIR(pps)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + out->redir_cnt.suc = pps; + out->totals.redir += pps; + } +} + +static void stats_get_redirect_err_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, drop, sum = 0; + int rec_i, i; + + for (rec_i = 1; rec_i < XDP_REDIRECT_ERR_MAX; rec_i++) { + char str[64]; + + rec = &stats_rec->redir_err[rec_i]; + prev = &stats_prev->redir_err[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + if (drop > 0 && !out) { + snprintf(str, sizeof(str), + sample_log_level & LL_DEFAULT ? "%s total" : + "%s", + xdp_redirect_err_names[rec_i]); + print_err(drop, " %-18s " FMT_COLUMNf "\n", str, + ERR(drop)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + double drop; + + drop = calc_drop_pps(r, p, t); + if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", str, + ERR(drop)); + } + + sum += drop; + } + + if (out) { + out->redir_cnt.err = sum; + out->totals.err += sum; + } +} + +static void stats_get_exception_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double t, drop, sum = 0; + struct record *rec, *prev; + int rec_i, i; + + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + rec = &stats_rec->exception[rec_i]; + prev = &stats_prev->exception[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + /* Fold out errors after heading */ + sum += drop; + + if (drop > 0 && !out) { + print_always(" %-18s " FMT_COLUMNf "\n", + action2str(rec_i), ERR(drop)); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + double drop; + + drop = calc_drop_pps(r, p, t); + if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", + str, ERR(drop)); + } + } + } + + if (out) { + out->except_cnt.hits = sum; + out->totals.err += sum; + } +} + +static void stats_get_devmap_xmit(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double pps, drop, info, err; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->devmap_xmit; + prev = &stats_prev->devmap_xmit; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(r, p, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", + str, XMIT(pps), DROP(drop), err, "drv_err/s", + info, "bulk-avg"); + } + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + info = calc_info_pps(&rec->total, &prev->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->xmit_cnt.pps = pps; + out->xmit_cnt.drop = drop; + out->xmit_cnt.bavg = info; + out->xmit_cnt.err = err; + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + } +} + +static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out, + bool xmit_total) +{ + double pps, drop, info, err; + struct map_entry *entry; + struct record *r, *p; + double t; + int bkt; + + hash_for_each(stats_rec->xmit_map, bkt, entry, node) { + struct map_entry *e, *x = NULL; + char ifname_from[IFNAMSIZ]; + char ifname_to[IFNAMSIZ]; + const char *fstr, *tstr; + unsigned long prev_time; + struct record beg = {}; + __u32 from_idx, to_idx; + char str[128]; + __u64 pair; + int i; + + prev_time = sample_interval * NANOSEC_PER_SEC; + + pair = entry->pair; + from_idx = pair >> 32; + to_idx = pair & 0xFFFFFFFF; + + r = &entry->val; + beg.timestamp = r->timestamp - prev_time; + + /* Find matching entry from stats_prev map */ + hash_for_each_possible(stats_prev->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (x) + p = &x->val; + else + p = &beg; + t = calc_period(r, p); + pps = calc_pps(&r->total, &p->total, t); + drop = calc_drop_pps(&r->total, &p->total, t); + info = calc_info_pps(&r->total, &p->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + err = calc_errs_pps(&r->total, &p->total, t); + + if (out) { + /* We are responsible for filling out totals */ + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + continue; + } + + fstr = tstr = NULL; + if (if_indextoname(from_idx, ifname_from)) + fstr = ifname_from; + if (if_indextoname(to_idx, ifname_to)) + tstr = ifname_to; + + snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?", + tstr ?: "?"); + /* Skip idle streams of redirection */ + if (pps || drop || err) { + print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), + err, "drv_err/s", info, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *rc = &r->cpu[i]; + struct datarec *pc, p_beg = {}; + char str[64]; + + pc = p == &beg ? &p_beg : &p->cpu[i]; + + pps = calc_pps(rc, pc, t); + drop = calc_drop_pps(rc, pc, t); + err = calc_errs_pps(rc, pc, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(rc, pc, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), + DROP(drop), err, "drv_err/s", info, "bulk-avg"); + } + } +} + +static void stats_print(const char *prefix, int mask, struct stats_record *r, + struct stats_record *p, struct sample_output *out) +{ + int nr_cpus = libbpf_num_possible_cpus(); + const char *str; + + print_always("%-23s", prefix ?: "Summary"); + if (mask & SAMPLE_RX_CNT) + print_always(FMT_COLUMNl, RX(out->totals.rx)); + if (mask & SAMPLE_REDIRECT_CNT) + print_always(FMT_COLUMNl, REDIR(out->totals.redir)); + printf(FMT_COLUMNl, + out->totals.err + out->totals.drop + out->totals.drop_xmit, + "err,drop/s"); + if (mask & SAMPLE_DEVMAP_XMIT_CNT || + mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + printf(FMT_COLUMNl, XMIT(out->totals.xmit)); + printf("\n"); + + if (mask & SAMPLE_RX_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ? + "receive total" : + "receive"; + print_err((out->rx_cnt.err || out->rx_cnt.drop), + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n", + str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop), + ERR(out->rx_cnt.err)); + + stats_get_rx_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + stats_get_cpumap_enqueue(r, p, nr_cpus); + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + stats_get_cpumap_kthread(r, p, nr_cpus); + stats_get_cpumap_remote(r, p, nr_cpus); + } + + if (mask & SAMPLE_REDIRECT_CNT) { + str = out->redir_cnt.suc ? "redirect total" : "redirect"; + print_default(" %-20s " FMT_COLUMNl "\n", str, + REDIR(out->redir_cnt.suc)); + + stats_get_redirect_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_REDIRECT_ERR_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ? + "redirect_err total" : + "redirect_err"; + print_err(out->redir_cnt.err, " %-20s " FMT_COLUMNl "\n", str, + ERR(out->redir_cnt.err)); + + stats_get_redirect_err_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_EXCEPTION_CNT) { + str = out->except_cnt.hits ? "xdp_exception total" : + "xdp_exception"; + + print_err(out->except_cnt.hits, " %-20s " FMT_COLUMNl "\n", str, + HITS(out->except_cnt.hits)); + + stats_get_exception_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ? + "devmap_xmit total" : + "devmap_xmit"; + + print_err(out->xmit_cnt.err || out->xmit_cnt.drop, + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl + __COLUMN(".2f") "\n", + str, XMIT(out->xmit_cnt.pps), + DROP(out->xmit_cnt.drop), out->xmit_cnt.err, + "drv_err/s", out->xmit_cnt.bavg, "bulk-avg"); + + stats_get_devmap_xmit(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL, + mask & SAMPLE_DEVMAP_XMIT_CNT); + + if (sample_log_level & LL_DEFAULT || + ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { + sample_err_exp = false; + printf("\n"); + } +} + +int sample_setup_maps(struct bpf_map **maps) +{ + sample_n_cpus = libbpf_num_possible_cpus(); + + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { + sample_map[i] = maps[i]; + + switch (i) { + case MAP_RX: + case MAP_CPUMAP_KTHREAD: + case MAP_DEVMAP_XMIT: + sample_map_count[i] = sample_n_cpus; + break; + case MAP_REDIRECT_ERR: + sample_map_count[i] = + XDP_REDIRECT_ERR_MAX * sample_n_cpus; + break; + case MAP_EXCEPTION: + sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus; + case MAP_CPUMAP_ENQUEUE: + sample_map_count[i] = sample_n_cpus * sample_n_cpus; + break; + default: + return -EINVAL; + } + if (bpf_map__set_max_entries(sample_map[i], sample_map_count[i]) < 0) + return -errno; + } + sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI]; + return 0; +} + +static int sample_setup_maps_mappings(void) +{ + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { + size_t size = sample_map_count[i] * sizeof(struct datarec); + + sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, bpf_map__fd(sample_map[i]), 0); + if (sample_mmap[i] == MAP_FAILED) + return -errno; + } + return 0; +} + +int __sample_init(int mask) +{ + sigset_t st; + + sigemptyset(&st); + sigaddset(&st, SIGQUIT); + sigaddset(&st, SIGINT); + sigaddset(&st, SIGTERM); + + if (sigprocmask(SIG_BLOCK, &st, NULL) < 0) + return -errno; + + sample_sig_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK); + if (sample_sig_fd < 0) + return -errno; + + sample_mask = mask; + + return sample_setup_maps_mappings(); +} + +static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags) +{ + __u32 cur_prog_id = 0; + int ret; + + if (prog_id) { + ret = bpf_xdp_query_id(ifindex, xdp_flags, &cur_prog_id); + if (ret < 0) + return -errno; + + if (prog_id != cur_prog_id) { + print_always( + "Program on ifindex %d does not match installed " + "program, skipping unload\n", + ifindex); + return -ENOENT; + } + } + + return bpf_xdp_detach(ifindex, xdp_flags, NULL); +} + +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force) +{ + int ret, xdp_flags = 0; + __u32 prog_id = 0; + + if (sample_xdp_cnt == 32) { + fprintf(stderr, + "Total limit for installed XDP programs in a sample reached\n"); + return -ENOTSUP; + } + + xdp_flags |= !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0; + xdp_flags |= generic ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; + ret = bpf_xdp_attach(ifindex, bpf_program__fd(xdp_prog), xdp_flags, NULL); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to install program \"%s\" on ifindex %d, mode = %s, " + "force = %s: %s\n", + bpf_program__name(xdp_prog), ifindex, + generic ? "skb" : "native", force ? "true" : "false", + strerror(-ret)); + return ret; + } + + ret = bpf_xdp_query_id(ifindex, xdp_flags, &prog_id); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to get XDP program id for ifindex %d, removing program: %s\n", + ifindex, strerror(errno)); + __sample_remove_xdp(ifindex, 0, xdp_flags); + return ret; + } + sample_xdp_progs[sample_xdp_cnt++] = + (struct xdp_desc){ ifindex, prog_id, xdp_flags }; + + return 0; +} + +static void sample_summary_print(void) +{ + double num = sample_out.rx_cnt.num; + + if (sample_out.totals.rx) { + double pkts = sample_out.totals.rx; + + print_always(" Packets received : %'-10llu\n", + sample_out.totals.rx); + print_always(" Average packets/s : %'-10.0f\n", + sample_round(pkts / num)); + } + if (sample_out.totals.redir) { + double pkts = sample_out.totals.redir; + + print_always(" Packets redirected : %'-10llu\n", + sample_out.totals.redir); + print_always(" Average redir/s : %'-10.0f\n", + sample_round(pkts / num)); + } + if (sample_out.totals.drop) + print_always(" Rx dropped : %'-10llu\n", + sample_out.totals.drop); + if (sample_out.totals.drop_xmit) + print_always(" Tx dropped : %'-10llu\n", + sample_out.totals.drop_xmit); + if (sample_out.totals.err) + print_always(" Errors recorded : %'-10llu\n", + sample_out.totals.err); + if (sample_out.totals.xmit) { + double pkts = sample_out.totals.xmit; + + print_always(" Packets transmitted : %'-10llu\n", + sample_out.totals.xmit); + print_always(" Average transmit/s : %'-10.0f\n", + sample_round(pkts / num)); + } +} + +void sample_exit(int status) +{ + size_t size; + + for (int i = 0; i < NUM_MAP; i++) { + size = sample_map_count[i] * sizeof(**sample_mmap); + munmap(sample_mmap[i], size); + } + while (sample_xdp_cnt--) { + int i = sample_xdp_cnt, ifindex, xdp_flags; + __u32 prog_id; + + prog_id = sample_xdp_progs[i].prog_id; + ifindex = sample_xdp_progs[i].ifindex; + xdp_flags = sample_xdp_progs[i].flags; + + __sample_remove_xdp(ifindex, prog_id, xdp_flags); + } + sample_summary_print(); + close(sample_sig_fd); + exit(status); +} + +static int sample_stats_collect(struct stats_record *rec) +{ + int i; + + if (sample_mask & SAMPLE_RX_CNT) + map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt); + + if (sample_mask & SAMPLE_REDIRECT_CNT) + map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]); + + if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) { + for (i = 1; i < XDP_REDIRECT_ERR_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_REDIRECT_ERR][i * sample_n_cpus], + &rec->redir_err[i]); + } + + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + for (i = 0; i < sample_n_cpus; i++) + map_collect_percpu(&sample_mmap[MAP_CPUMAP_ENQUEUE][i * sample_n_cpus], + &rec->enq[i]); + + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) + map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD], + &rec->kthread); + + if (sample_mask & SAMPLE_EXCEPTION_CNT) + for (i = 0; i < XDP_ACTION_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], + &rec->exception[i]); + + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) + map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit); + + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) { + if (map_collect_percpu_devmap(bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]), rec) < 0) + return -EINVAL; + } + return 0; +} + +static void sample_summary_update(struct sample_output *out) +{ + sample_out.totals.rx += out->totals.rx; + sample_out.totals.redir += out->totals.redir; + sample_out.totals.drop += out->totals.drop; + sample_out.totals.drop_xmit += out->totals.drop_xmit; + sample_out.totals.err += out->totals.err; + sample_out.totals.xmit += out->totals.xmit; + sample_out.rx_cnt.num++; +} + +static void sample_stats_print(int mask, struct stats_record *cur, + struct stats_record *prev, char *prog_name) +{ + struct sample_output out = {}; + + if (mask & SAMPLE_RX_CNT) + stats_get_rx_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_CNT) + stats_get_redirect_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_ERR_CNT) + stats_get_redirect_err_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_EXCEPTION_CNT) + stats_get_exception_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_DEVMAP_XMIT_CNT) + stats_get_devmap_xmit(cur, prev, 0, &out); + else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(cur, prev, 0, &out, + mask & SAMPLE_DEVMAP_XMIT_CNT); + sample_summary_update(&out); + + stats_print(prog_name, mask, cur, prev, &out); +} + +void sample_switch_mode(void) +{ + sample_log_level ^= LL_DEBUG - 1; +} + +static int sample_signal_cb(void) +{ + struct signalfd_siginfo si; + int r; + + r = read(sample_sig_fd, &si, sizeof(si)); + if (r < 0) + return -errno; + + switch (si.ssi_signo) { + case SIGQUIT: + sample_switch_mode(); + printf("\n"); + break; + default: + printf("\n"); + return 1; + } + + return 0; +} + +/* Pointer swap trick */ +static void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int sample_timer_cb(int timerfd, struct stats_record **rec, + struct stats_record **prev) +{ + char line[64] = "Summary"; + int ret; + __u64 t; + + ret = read(timerfd, &t, sizeof(t)); + if (ret < 0) + return -errno; + + swap(prev, rec); + ret = sample_stats_collect(*rec); + if (ret < 0) + return ret; + + if (sample_xdp_cnt == 2 && !(sample_mask & SAMPLE_SKIP_HEADING)) { + char fi[IFNAMSIZ]; + char to[IFNAMSIZ]; + const char *f, *t; + + f = t = NULL; + if (if_indextoname(sample_xdp_progs[0].ifindex, fi)) + f = fi; + if (if_indextoname(sample_xdp_progs[1].ifindex, to)) + t = to; + + snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?"); + } + + sample_stats_print(sample_mask, *rec, *prev, line); + return 0; +} + +int sample_run(int interval, void (*post_cb)(void *), void *ctx) +{ + struct timespec ts = { interval, 0 }; + struct itimerspec its = { ts, ts }; + struct stats_record *rec, *prev; + struct pollfd pfd[2] = {}; + int timerfd, ret; + + if (!interval) { + fprintf(stderr, "Incorrect interval 0\n"); + return -EINVAL; + } + sample_interval = interval; + /* Pretty print numbers */ + setlocale(LC_NUMERIC, "en_US.UTF-8"); + + timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + if (timerfd < 0) + return -errno; + timerfd_settime(timerfd, 0, &its, NULL); + + pfd[0].fd = sample_sig_fd; + pfd[0].events = POLLIN; + + pfd[1].fd = timerfd; + pfd[1].events = POLLIN; + + ret = -ENOMEM; + rec = alloc_stats_record(); + if (!rec) + goto end; + prev = alloc_stats_record(); + if (!prev) + goto end_rec; + + ret = sample_stats_collect(rec); + if (ret < 0) + goto end_rec_prev; + + for (;;) { + ret = poll(pfd, 2, -1); + if (ret < 0) { + if (errno == EINTR) + continue; + else + break; + } + + if (pfd[0].revents & POLLIN) + ret = sample_signal_cb(); + else if (pfd[1].revents & POLLIN) + ret = sample_timer_cb(timerfd, &rec, &prev); + + if (ret) + break; + + if (post_cb) + post_cb(ctx); + } + +end_rec_prev: + free_stats_record(prev); +end_rec: + free_stats_record(rec); +end: + close(timerfd); + + return ret; +} + +const char *get_driver_name(int ifindex) +{ + struct ethtool_drvinfo drv = {}; + char ifname[IF_NAMESIZE]; + static char drvname[32]; + struct ifreq ifr = {}; + int fd, r = 0; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return "[error]"; + + if (!if_indextoname(ifindex, ifname)) + goto end; + + drv.cmd = ETHTOOL_GDRVINFO; + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + ifr.ifr_data = (void *)&drv; + + r = ioctl(fd, SIOCETHTOOL, &ifr); + if (r) + goto end; + + safe_strncpy(drvname, drv.driver, sizeof(drvname)); + + close(fd); + return drvname; + +end: + r = errno; + close(fd); + return r == EOPNOTSUPP ? "loopback" : "[error]"; +} + +int get_mac_addr(int ifindex, void *mac_addr) +{ + char ifname[IF_NAMESIZE]; + struct ifreq ifr = {}; + int fd, r; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + if (!if_indextoname(ifindex, ifname)) { + r = -errno; + goto end; + } + + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + r = ioctl(fd, SIOCGIFHWADDR, &ifr); + if (r) { + r = -errno; + goto end; + } + + memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); + +end: + close(fd); + return r; +} + +__attribute__((constructor)) static void sample_ctor(void) +{ + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + /* Just exit, nothing to cleanup right now */ + exit(EXIT_FAIL_BPF); + } +} diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h new file mode 100644 index 000000000000..f45051679977 --- /dev/null +++ b/samples/bpf/xdp_sample_user.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef XDP_SAMPLE_USER_H +#define XDP_SAMPLE_USER_H + +#include <bpf/libbpf.h> +#include <linux/compiler.h> + +#include "xdp_sample_shared.h" + +enum stats_mask { + _SAMPLE_REDIRECT_MAP = 1U << 0, + SAMPLE_RX_CNT = 1U << 1, + SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, + SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, + SAMPLE_EXCEPTION_CNT = 1U << 5, + SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, + SAMPLE_REDIRECT_CNT = 1U << 7, + SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8, + SAMPLE_SKIP_HEADING = 1U << 9, +}; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +int sample_setup_maps(struct bpf_map **maps); +int __sample_init(int mask); +void sample_exit(int status); +int sample_run(int interval, void (*post_cb)(void *), void *ctx); + +void sample_switch_mode(void); +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force); +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error); + +const char *get_driver_name(int ifindex); +int get_mac_addr(int ifindex, void *mac_addr); + +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif +__attribute__((unused)) +static inline char *safe_strncpy(char *dst, const char *src, size_t size) +{ + if (!size) + return dst; + strncpy(dst, src, size - 1); + dst[size - 1] = '\0'; + return dst; +} +#pragma GCC diagnostic pop + +#define __attach_tp(name) \ + ({ \ + if (bpf_program__type(skel->progs.name) != BPF_PROG_TYPE_TRACING)\ + return -EINVAL; \ + skel->links.name = bpf_program__attach(skel->progs.name); \ + if (!skel->links.name) \ + return -errno; \ + }) + +#define sample_init_pre_load(skel) \ + ({ \ + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); \ + sample_setup_maps((struct bpf_map *[]){ \ + skel->maps.rx_cnt, skel->maps.redir_err_cnt, \ + skel->maps.cpumap_enqueue_cnt, \ + skel->maps.cpumap_kthread_cnt, \ + skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt, \ + skel->maps.devmap_xmit_cnt_multi }); \ + }) + +#define DEFINE_SAMPLE_INIT(name) \ + static int sample_init(struct name *skel, int mask) \ + { \ + int ret; \ + ret = __sample_init(mask); \ + if (ret < 0) \ + return ret; \ + if (mask & SAMPLE_REDIRECT_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map); \ + if (mask & SAMPLE_REDIRECT_CNT) \ + __attach_tp(tp_xdp_redirect); \ + if (mask & SAMPLE_REDIRECT_ERR_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map_err); \ + if (mask & SAMPLE_REDIRECT_ERR_CNT) \ + __attach_tp(tp_xdp_redirect_err); \ + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) \ + __attach_tp(tp_xdp_cpumap_enqueue); \ + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) \ + __attach_tp(tp_xdp_cpumap_kthread); \ + if (mask & SAMPLE_EXCEPTION_CNT) \ + __attach_tp(tp_xdp_exception); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT) \ + __attach_tp(tp_xdp_devmap_xmit); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) \ + __attach_tp(tp_xdp_devmap_xmit_multi); \ + return 0; \ + } + +#endif diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c index 575d57e4b8d6..0e2bca3a3fff 100644 --- a/samples/bpf/xdp_tx_iptunnel_kern.c +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -212,7 +212,7 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp) return XDP_TX; } -SEC("xdp_tx_iptunnel") +SEC("xdp.frags") int _xdp_tx_iptunnel(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index a419bee151a8..307baef6861a 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -10,7 +10,6 @@ #include <stdlib.h> #include <string.h> #include <net/if.h> -#include <sys/resource.h> #include <arpa/inet.h> #include <netinet/ether.h> #include <unistd.h> @@ -32,12 +31,12 @@ static void int_exit(int sig) __u32 curr_prog_id = 0; if (ifindex > -1) { - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); exit(1); } if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); else if (!curr_prog_id) printf("couldn't find a prog id on a given iface\n"); else @@ -152,10 +151,6 @@ static int parse_ports(const char *port_str, int *min_port, int *max_port) int main(int argc, char **argv) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int min_port = 0, max_port = 0, vip2tnl_map_fd; const char *optstr = "i:a:p:s:d:m:T:P:FSNh"; unsigned char opt_flags[256] = {}; @@ -163,6 +158,7 @@ int main(int argc, char **argv) __u32 info_len = sizeof(info); unsigned int kill_after_s = 0; struct iptnl_info tnl = {}; + struct bpf_program *prog; struct bpf_object *obj; struct vip vip = {}; char filename[256]; @@ -254,26 +250,26 @@ int main(int argc, char **argv) } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; - } - if (!ifindex) { fprintf(stderr, "Invalid ifname\n"); return 1; } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; - if (!prog_fd) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) { + printf("bpf_object__load(): %s\n", strerror(errno)); return 1; } + prog_fd = bpf_program__fd(prog); rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl"); @@ -294,7 +290,7 @@ int main(int argc, char **argv) } } - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { printf("link set xdp fd failed\n"); return 1; } @@ -308,7 +304,7 @@ int main(int argc, char **argv) poll_stats(kill_after_s); - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); return 0; } diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h deleted file mode 100644 index b7eca15c78cc..000000000000 --- a/samples/bpf/xdpsock.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright(c) 2019 Intel Corporation. - */ - -#ifndef XDPSOCK_H_ -#define XDPSOCK_H_ - -#define MAX_SOCKS 4 - -#endif /* XDPSOCK_H */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c deleted file mode 100644 index 05430484375c..000000000000 --- a/samples/bpf/xdpsock_kern.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include "xdpsock.h" - -/* This XDP program is only needed for the XDP_SHARED_UMEM mode. - * If you do not use this mode, libbpf can supply an XDP program for you. - */ - -struct { - __uint(type, BPF_MAP_TYPE_XSKMAP); - __uint(max_entries, MAX_SOCKS); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} xsks_map SEC(".maps"); - -static unsigned int rr; - -SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) -{ - rr = (rr + 1) & (MAX_SOCKS - 1); - - return bpf_redirect_map(&xsks_map, rr, XDP_DROP); -} diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c deleted file mode 100644 index c91e91362a0c..000000000000 --- a/samples/bpf/xdpsock_user.c +++ /dev/null @@ -1,1212 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2017 - 2018 Intel Corporation. */ - -#include <asm/barrier.h> -#include <errno.h> -#include <getopt.h> -#include <libgen.h> -#include <linux/bpf.h> -#include <linux/compiler.h> -#include <linux/if_link.h> -#include <linux/if_xdp.h> -#include <linux/if_ether.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <arpa/inet.h> -#include <locale.h> -#include <net/ethernet.h> -#include <net/if.h> -#include <poll.h> -#include <pthread.h> -#include <signal.h> -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/mman.h> -#include <sys/resource.h> -#include <sys/socket.h> -#include <sys/types.h> -#include <time.h> -#include <unistd.h> - -#include <bpf/libbpf.h> -#include <bpf/xsk.h> -#include <bpf/bpf.h> -#include "xdpsock.h" - -#ifndef SOL_XDP -#define SOL_XDP 283 -#endif - -#ifndef AF_XDP -#define AF_XDP 44 -#endif - -#ifndef PF_XDP -#define PF_XDP AF_XDP -#endif - -#define NUM_FRAMES (4 * 1024) -#define MIN_PKT_SIZE 64 - -#define DEBUG_HEXDUMP 0 - -typedef __u64 u64; -typedef __u32 u32; -typedef __u16 u16; -typedef __u8 u8; - -static unsigned long prev_time; - -enum benchmark_type { - BENCH_RXDROP = 0, - BENCH_TXONLY = 1, - BENCH_L2FWD = 2, -}; - -static enum benchmark_type opt_bench = BENCH_RXDROP; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static const char *opt_if = ""; -static int opt_ifindex; -static int opt_queue; -static unsigned long opt_duration; -static unsigned long start_time; -static bool benchmark_done; -static u32 opt_batch_size = 64; -static int opt_pkt_count; -static u16 opt_pkt_size = MIN_PKT_SIZE; -static u32 opt_pkt_fill_pattern = 0x12345678; -static int opt_poll; -static int opt_interval = 1; -static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; -static u32 opt_umem_flags; -static int opt_unaligned_chunks; -static int opt_mmap_flags; -static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; -static int opt_timeout = 1000; -static bool opt_need_wakeup = true; -static u32 opt_num_xsks = 1; -static u32 prog_id; - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - unsigned long rx_npkts; - unsigned long tx_npkts; - unsigned long prev_rx_npkts; - unsigned long prev_tx_npkts; - u32 outstanding_tx; -}; - -static int num_socks; -struct xsk_socket_info *xsks[MAX_SOCKS]; - -static unsigned long get_nsecs(void) -{ - struct timespec ts; - - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts.tv_sec * 1000000000UL + ts.tv_nsec; -} - -static void print_benchmark(bool running) -{ - const char *bench_str = "INVALID"; - - if (opt_bench == BENCH_RXDROP) - bench_str = "rxdrop"; - else if (opt_bench == BENCH_TXONLY) - bench_str = "txonly"; - else if (opt_bench == BENCH_L2FWD) - bench_str = "l2fwd"; - - printf("%s:%d %s ", opt_if, opt_queue, bench_str); - if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) - printf("xdp-skb "); - else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) - printf("xdp-drv "); - else - printf(" "); - - if (opt_poll) - printf("poll() "); - - if (running) { - printf("running..."); - fflush(stdout); - } -} - -static void dump_stats(void) -{ - unsigned long now = get_nsecs(); - long dt = now - prev_time; - int i; - - prev_time = now; - - for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; - double rx_pps, tx_pps; - - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * - 1000000000. / dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * - 1000000000. / dt; - - printf("\n sock%d@", i); - print_benchmark(false); - printf("\n"); - - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", - dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); - - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; - } -} - -static bool is_benchmark_done(void) -{ - if (opt_duration > 0) { - unsigned long dt = (get_nsecs() - start_time); - - if (dt >= opt_duration) - benchmark_done = true; - } - return benchmark_done; -} - -static void *poller(void *arg) -{ - (void)arg; - while (!is_benchmark_done()) { - sleep(opt_interval); - dump_stats(); - } - - return NULL; -} - -static void remove_xdp_program(void) -{ - u32 curr_prog_id = 0; - - if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAILURE); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on a given interface\n"); - else - printf("program on interface changed, not removing\n"); -} - -static void int_exit(int sig) -{ - benchmark_done = true; -} - -static void xdpsock_cleanup(void) -{ - struct xsk_umem *umem = xsks[0]->umem->umem; - int i; - - dump_stats(); - for (i = 0; i < num_socks; i++) - xsk_socket__delete(xsks[i]->xsk); - (void)xsk_umem__delete(umem); - remove_xdp_program(); -} - -static void __exit_with_error(int error, const char *file, const char *func, - int line) -{ - fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, - line, error, strerror(error)); - dump_stats(); - remove_xdp_program(); - exit(EXIT_FAILURE); -} - -#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ - __LINE__) -static void swap_mac_addresses(void *data) -{ - struct ether_header *eth = (struct ether_header *)data; - struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; - struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; - struct ether_addr tmp; - - tmp = *src_addr; - *src_addr = *dst_addr; - *dst_addr = tmp; -} - -static void hex_dump(void *pkt, size_t length, u64 addr) -{ - const unsigned char *address = (unsigned char *)pkt; - const unsigned char *line = address; - size_t line_size = 32; - unsigned char c; - char buf[32]; - int i = 0; - - if (!DEBUG_HEXDUMP) - return; - - sprintf(buf, "addr=%llu", addr); - printf("length = %zu\n", length); - printf("%s | ", buf); - while (length-- > 0) { - printf("%02X ", *address++); - if (!(++i % line_size) || (length == 0 && i % line_size)) { - if (length == 0) { - while (i++ % line_size) - printf("__ "); - } - printf(" | "); /* right close */ - while (line < address) { - c = *line++; - printf("%c", (c < 33 || c == 255) ? 0x2E : c); - } - printf("\n"); - if (length > 0) - printf("%s | ", buf); - } - } - printf("\n"); -} - -static void *memset32_htonl(void *dest, u32 val, u32 size) -{ - u32 *ptr = (u32 *)dest; - int i; - - val = htonl(val); - - for (i = 0; i < (size & (~0x3)); i += 4) - ptr[i >> 2] = val; - - for (; i < size; i++) - ((char *)dest)[i] = ((char *)&val)[i & 3]; - - return dest; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static unsigned int do_csum(const unsigned char *buff, int len) -{ - unsigned int result = 0; - int odd; - - if (len <= 0) - goto out; - odd = 1 & (unsigned long)buff; - if (odd) { -#ifdef __LITTLE_ENDIAN - result += (*buff << 8); -#else - result = *buff; -#endif - len--; - buff++; - } - if (len >= 2) { - if (2 & (unsigned long)buff) { - result += *(unsigned short *)buff; - len -= 2; - buff += 2; - } - if (len >= 4) { - const unsigned char *end = buff + - ((unsigned int)len & ~3); - unsigned int carry = 0; - - do { - unsigned int w = *(unsigned int *)buff; - - buff += 4; - result += carry; - result += w; - carry = (w > result); - } while (buff < end); - result += carry; - result = (result & 0xffff) + (result >> 16); - } - if (len & 2) { - result += *(unsigned short *)buff; - buff += 2; - } - } - if (len & 1) -#ifdef __LITTLE_ENDIAN - result += *buff; -#else - result += (*buff << 8); -#endif - result = from32to16(result); - if (odd) - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); -out: - return result; -} - -__sum16 ip_fast_csum(const void *iph, unsigned int ihl); - -/* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -__sum16 ip_fast_csum(const void *iph, unsigned int ihl) -{ - return (__force __sum16)~do_csum(iph, ihl * 4); -} - -/* - * Fold a partial checksum - * This function code has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static inline __sum16 csum_fold(__wsum csum) -{ - u32 sum = (__force u32)csum; - - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - return (__force __sum16)~sum; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline u32 from64to32(u64 x) -{ - /* add up 32-bit and 32-bit for 32+c bit */ - x = (x & 0xffffffff) + (x >> 32); - /* add up carry.. */ - x = (x & 0xffffffff) + (x >> 32); - return (u32)x; -} - -__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - __u32 len, __u8 proto, __wsum sum); - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - __u32 len, __u8 proto, __wsum sum) -{ - unsigned long long s = (__force u32)sum; - - s += (__force u32)saddr; - s += (__force u32)daddr; -#ifdef __BIG_ENDIAN__ - s += proto + len; -#else - s += (proto + len) << 8; -#endif - return (__force __wsum)from64to32(s); -} - -/* - * This function has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); -} - -static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, - u8 proto, u16 *udp_pkt) -{ - u32 csum = 0; - u32 cnt = 0; - - /* udp hdr and data */ - for (; cnt < len; cnt += 2) - csum += udp_pkt[cnt >> 1]; - - return csum_tcpudp_magic(saddr, daddr, len, proto, csum); -} - -#define ETH_FCS_SIZE 4 - -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct udphdr)) - -#define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) -#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) -#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) -#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) - -static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; - -static void gen_eth_hdr_data(void) -{ - struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + - sizeof(struct ethhdr) + - sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + - sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; - - /* ethernet header */ - memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_IP); - - /* IP header */ - ip_hdr->version = IPVERSION; - ip_hdr->ihl = 0x5; /* 20 byte header */ - ip_hdr->tos = 0x0; - ip_hdr->tot_len = htons(IP_PKT_SIZE); - ip_hdr->id = 0; - ip_hdr->frag_off = 0; - ip_hdr->ttl = IPDEFTTL; - ip_hdr->protocol = IPPROTO_UDP; - ip_hdr->saddr = htonl(0x0a0a0a10); - ip_hdr->daddr = htonl(0x0a0a0a20); - - /* IP header checksum */ - ip_hdr->check = 0; - ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); - - /* UDP header */ - udp_hdr->source = htons(0x1000); - udp_hdr->dest = htons(0x1000); - udp_hdr->len = htons(UDP_PKT_SIZE); - - /* UDP data */ - memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, - UDP_PKT_DATA_SIZE); - - /* UDP header checksum */ - udp_hdr->check = 0; - udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, - IPPROTO_UDP, (u16 *)udp_hdr); -} - -static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) -{ - memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, - PKT_SIZE); -} - -static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) -{ - struct xsk_umem_info *umem; - struct xsk_umem_config cfg = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = opt_xsk_frame_size, - .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, - .flags = opt_umem_flags - }; - int ret; - - umem = calloc(1, sizeof(*umem)); - if (!umem) - exit_with_error(errno); - - ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - &cfg); - if (ret) - exit_with_error(-ret); - - umem->buffer = buffer; - return umem; -} - -static void xsk_populate_fill_ring(struct xsk_umem_info *umem) -{ - int ret, i; - u32 idx; - - ret = xsk_ring_prod__reserve(&umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) - exit_with_error(-ret); - for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = - i * opt_xsk_frame_size; - xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); -} - -static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, - bool rx, bool tx) -{ - struct xsk_socket_config cfg; - struct xsk_socket_info *xsk; - struct xsk_ring_cons *rxr; - struct xsk_ring_prod *txr; - int ret; - - xsk = calloc(1, sizeof(*xsk)); - if (!xsk) - exit_with_error(errno); - - xsk->umem = umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - if (opt_num_xsks > 1) - cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; - else - cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; - - rxr = rx ? &xsk->rx : NULL; - txr = tx ? &xsk->tx : NULL; - ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, - rxr, txr, &cfg); - if (ret) - exit_with_error(-ret); - - ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); - if (ret) - exit_with_error(-ret); - - return xsk; -} - -static struct option long_options[] = { - {"rxdrop", no_argument, 0, 'r'}, - {"txonly", no_argument, 0, 't'}, - {"l2fwd", no_argument, 0, 'l'}, - {"interface", required_argument, 0, 'i'}, - {"queue", required_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"interval", required_argument, 0, 'n'}, - {"zero-copy", no_argument, 0, 'z'}, - {"copy", no_argument, 0, 'c'}, - {"frame-size", required_argument, 0, 'f'}, - {"no-need-wakeup", no_argument, 0, 'm'}, - {"unaligned", no_argument, 0, 'u'}, - {"shared-umem", no_argument, 0, 'M'}, - {"force", no_argument, 0, 'F'}, - {"duration", required_argument, 0, 'd'}, - {"batch-size", required_argument, 0, 'b'}, - {"tx-pkt-count", required_argument, 0, 'C'}, - {"tx-pkt-size", required_argument, 0, 's'}, - {"tx-pkt-pattern", required_argument, 0, 'P'}, - {0, 0, 0, 0} -}; - -static void usage(const char *prog) -{ - const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -r, --rxdrop Discard all incoming packets (default)\n" - " -t, --txonly Only send packets\n" - " -l, --l2fwd MAC swap L2 forwarding\n" - " -i, --interface=n Run on interface n\n" - " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP skb-mod\n" - " -N, --xdp-native=n Enforce XDP native mode\n" - " -n, --interval=n Specify statistics update interval (default 1 sec).\n" - " -z, --zero-copy Force zero-copy mode.\n" - " -c, --copy Force copy mode.\n" - " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" - " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" - " -u, --unaligned Enable unaligned chunk placement\n" - " -M, --shared-umem Enable XDP_SHARED_UMEM\n" - " -F, --force Force loading the XDP prog\n" - " -d, --duration=n Duration in secs to run command.\n" - " Default: forever.\n" - " -b, --batch-size=n Batch size for sending or receiving\n" - " packets. Default: %d\n" - " -C, --tx-pkt-count=n Number of packets to send.\n" - " Default: Continuous packets.\n" - " -s, --tx-pkt-size=n Transmit packet size.\n" - " (Default: %d bytes)\n" - " Min size: %d, Max size %d.\n" - " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" - "\n"; - fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, - opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, - XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); - - exit(EXIT_FAILURE); -} - -static void parse_command_line(int argc, char **argv) -{ - int option_index, c; - - opterr = 0; - - for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'r': - opt_bench = BENCH_RXDROP; - break; - case 't': - opt_bench = BENCH_TXONLY; - break; - case 'l': - opt_bench = BENCH_L2FWD; - break; - case 'i': - opt_if = optarg; - break; - case 'q': - opt_queue = atoi(optarg); - break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'N': - /* default, set below */ - break; - case 'n': - opt_interval = atoi(optarg); - break; - case 'z': - opt_xdp_bind_flags |= XDP_ZEROCOPY; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'u': - opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; - opt_unaligned_chunks = 1; - opt_mmap_flags = MAP_HUGETLB; - break; - case 'F': - opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; - break; - case 'f': - opt_xsk_frame_size = atoi(optarg); - break; - case 'm': - opt_need_wakeup = false; - opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; - break; - case 'M': - opt_num_xsks = MAX_SOCKS; - break; - case 'd': - opt_duration = atoi(optarg); - opt_duration *= 1000000000; - break; - case 'b': - opt_batch_size = atoi(optarg); - break; - case 'C': - opt_pkt_count = atoi(optarg); - break; - case 's': - opt_pkt_size = atoi(optarg); - if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || - opt_pkt_size < MIN_PKT_SIZE) { - fprintf(stderr, - "ERROR: Invalid frame size %d\n", - opt_pkt_size); - usage(basename(argv[0])); - } - break; - case 'P': - opt_pkt_fill_pattern = strtol(optarg, NULL, 16); - break; - default: - usage(basename(argv[0])); - } - } - - if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - - opt_ifindex = if_nametoindex(opt_if); - if (!opt_ifindex) { - fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", - opt_if); - usage(basename(argv[0])); - } - - if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && - !opt_unaligned_chunks) { - fprintf(stderr, "--frame-size=%d is not a power of two\n", - opt_xsk_frame_size); - usage(basename(argv[0])); - } -} - -static void kick_tx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || - errno == EBUSY || errno == ENETDOWN) - return; - exit_with_error(errno); -} - -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, - struct pollfd *fds) -{ - struct xsk_umem_info *umem = xsk->umem; - u32 idx_cq = 0, idx_fq = 0; - unsigned int rcvd; - size_t ndescs; - - if (!xsk->outstanding_tx) - return; - - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); - - ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : - xsk->outstanding_tx; - - /* re-add completed Tx buffers */ - rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); - if (rcvd > 0) { - unsigned int i; - int ret; - - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) - ret = poll(fds, num_socks, opt_timeout); - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - } - - for (i = 0; i < rcvd; i++) - *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = - *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; - } -} - -static inline void complete_tx_only(struct xsk_socket_info *xsk, - int batch_size) -{ - unsigned int rcvd; - u32 idx; - - if (!xsk->outstanding_tx) - return; - - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); - - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); - if (rcvd > 0) { - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; - } -} - -static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) -{ - unsigned int rcvd, i; - u32 idx_rx = 0, idx_fq = 0; - int ret; - - rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) - ret = poll(fds, num_socks, opt_timeout); - return; - } - - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) - ret = poll(fds, num_socks, opt_timeout); - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - } - - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - u64 orig = xsk_umem__extract_addr(addr); - - addr = xsk_umem__add_offset_to_addr(addr); - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - - hex_dump(pkt, len, addr); - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; - } - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; -} - -static void rx_drop_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - int i, ret; - - for (i = 0; i < num_socks; i++) { - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); - fds[i].events = POLLIN; - } - - for (;;) { - if (opt_poll) { - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - } - - for (i = 0; i < num_socks; i++) - rx_drop(xsks[i], fds); - - if (benchmark_done) - break; - } -} - -static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) -{ - u32 idx; - unsigned int i; - - while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < - batch_size) { - complete_tx_only(xsk, batch_size); - } - - for (i = 0; i < batch_size; i++) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, - idx + i); - tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = PKT_SIZE; - } - - xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->outstanding_tx += batch_size; - frame_nb += batch_size; - frame_nb %= NUM_FRAMES; - complete_tx_only(xsk, batch_size); -} - -static inline int get_batch_size(int pkt_cnt) -{ - if (!opt_pkt_count) - return opt_batch_size; - - if (pkt_cnt + opt_batch_size <= opt_pkt_count) - return opt_batch_size; - - return opt_pkt_count - pkt_cnt; -} - -static void complete_tx_only_all(void) -{ - bool pending; - int i; - - do { - pending = false; - for (i = 0; i < num_socks; i++) { - if (xsks[i]->outstanding_tx) { - complete_tx_only(xsks[i], opt_batch_size); - pending = !!xsks[i]->outstanding_tx; - } - } - } while (pending); -} - -static void tx_only_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - u32 frame_nb[MAX_SOCKS] = {}; - int pkt_cnt = 0; - int i, ret; - - for (i = 0; i < num_socks; i++) { - fds[0].fd = xsk_socket__fd(xsks[i]->xsk); - fds[0].events = POLLOUT; - } - - while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { - int batch_size = get_batch_size(pkt_cnt); - - if (opt_poll) { - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - - if (!(fds[0].revents & POLLOUT)) - continue; - } - - for (i = 0; i < num_socks; i++) - tx_only(xsks[i], frame_nb[i], batch_size); - - pkt_cnt += batch_size; - - if (benchmark_done) - break; - } - - if (opt_pkt_count) - complete_tx_only_all(); -} - -static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) -{ - unsigned int rcvd, i; - u32 idx_rx = 0, idx_tx = 0; - int ret; - - complete_tx_l2fwd(xsk, fds); - - rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) - ret = poll(fds, num_socks, opt_timeout); - return; - } - - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - } - - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - u64 orig = addr; - - addr = xsk_umem__add_offset_to_addr(addr); - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - - swap_mac_addresses(pkt); - - hex_dump(pkt, len, addr); - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; - } - - xsk_ring_prod__submit(&xsk->tx, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); - - xsk->rx_npkts += rcvd; - xsk->outstanding_tx += rcvd; -} - -static void l2fwd_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - int i, ret; - - for (i = 0; i < num_socks; i++) { - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); - fds[i].events = POLLOUT | POLLIN; - } - - for (;;) { - if (opt_poll) { - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - } - - for (i = 0; i < num_socks; i++) - l2fwd(xsks[i], fds); - - if (benchmark_done) - break; - } -} - -static void load_xdp_program(char **argv, struct bpf_object **obj) -{ - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - char xdp_filename[256]; - int prog_fd; - - snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); - prog_load_attr.file = xdp_filename; - - if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd)) - exit(EXIT_FAILURE); - if (prog_fd < 0) { - fprintf(stderr, "ERROR: no program found: %s\n", - strerror(prog_fd)); - exit(EXIT_FAILURE); - } - - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { - fprintf(stderr, "ERROR: link set xdp fd failed\n"); - exit(EXIT_FAILURE); - } -} - -static void enter_xsks_into_map(struct bpf_object *obj) -{ - struct bpf_map *map; - int i, xsks_map; - - map = bpf_object__find_map_by_name(obj, "xsks_map"); - xsks_map = bpf_map__fd(map); - if (xsks_map < 0) { - fprintf(stderr, "ERROR: no xsks map found: %s\n", - strerror(xsks_map)); - exit(EXIT_FAILURE); - } - - for (i = 0; i < num_socks; i++) { - int fd = xsk_socket__fd(xsks[i]->xsk); - int key, ret; - - key = i; - ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); - if (ret) { - fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); - exit(EXIT_FAILURE); - } - } -} - -int main(int argc, char **argv) -{ - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - bool rx = false, tx = false; - struct xsk_umem_info *umem; - struct bpf_object *obj; - pthread_t pt; - int i, ret; - void *bufs; - - parse_command_line(argc, argv); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); - } - - if (opt_num_xsks > 1) - load_xdp_program(argv, &obj); - - /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ - bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); - if (bufs == MAP_FAILED) { - printf("ERROR: mmap failed\n"); - exit(EXIT_FAILURE); - } - - /* Create sockets... */ - umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); - if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) { - rx = true; - xsk_populate_fill_ring(umem); - } - if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY) - tx = true; - for (i = 0; i < opt_num_xsks; i++) - xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); - - if (opt_bench == BENCH_TXONLY) { - gen_eth_hdr_data(); - - for (i = 0; i < NUM_FRAMES; i++) - gen_eth_frame(umem, i * opt_xsk_frame_size); - } - - if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) - enter_xsks_into_map(obj); - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - signal(SIGABRT, int_exit); - - setlocale(LC_ALL, ""); - - ret = pthread_create(&pt, NULL, poller, NULL); - if (ret) - exit_with_error(ret); - - prev_time = get_nsecs(); - start_time = prev_time; - - if (opt_bench == BENCH_RXDROP) - rx_drop_all(); - else if (opt_bench == BENCH_TXONLY) - tx_only_all(); - else - l2fwd_all(); - - benchmark_done = true; - - pthread_join(pt, NULL); - - xdpsock_cleanup(); - - return 0; -} diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c index e2398d94e8da..37a657b25d58 100644 --- a/samples/configfs/configfs_sample.c +++ b/samples/configfs/configfs_sample.c @@ -1,25 +1,21 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * vim: noexpandtab ts=8 sts=0 sw=8: - * * configfs_example_macros.c - This file is a demonstration module * containing a number of configfs subsystems. It uses the helper * macros defined by configfs.h * * Based on sysfs: - * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #include <linux/init.h> +#include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> - #include <linux/configfs.h> - - /* * 01-childless * @@ -40,8 +36,8 @@ struct childless { static inline struct childless *to_childless(struct config_item *item) { - return item ? container_of(to_configfs_subsystem(to_config_group(item)), - struct childless, subsys) : NULL; + return container_of(to_configfs_subsystem(to_config_group(item)), + struct childless, subsys); } static ssize_t childless_showme_show(struct config_item *item, char *page) @@ -64,17 +60,11 @@ static ssize_t childless_storeme_store(struct config_item *item, const char *page, size_t count) { struct childless *childless = to_childless(item); - unsigned long tmp; - char *p = (char *) page; - - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; - - if (tmp > INT_MAX) - return -ERANGE; + int ret; - childless->storeme = tmp; + ret = kstrtoint(page, 10, &childless->storeme); + if (ret) + return ret; return count; } @@ -117,7 +107,6 @@ static struct childless childless_subsys = { }, }; - /* ----------------------------------------------------------------- */ /* @@ -136,7 +125,7 @@ struct simple_child { static inline struct simple_child *to_simple_child(struct config_item *item) { - return item ? container_of(item, struct simple_child, item) : NULL; + return container_of(item, struct simple_child, item); } static ssize_t simple_child_storeme_show(struct config_item *item, char *page) @@ -148,17 +137,11 @@ static ssize_t simple_child_storeme_store(struct config_item *item, const char *page, size_t count) { struct simple_child *simple_child = to_simple_child(item); - unsigned long tmp; - char *p = (char *) page; - - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; - - if (tmp > INT_MAX) - return -ERANGE; + int ret; - simple_child->storeme = tmp; + ret = kstrtoint(page, 10, &simple_child->storeme); + if (ret) + return ret; return count; } @@ -176,7 +159,7 @@ static void simple_child_release(struct config_item *item) } static struct configfs_item_operations simple_child_item_ops = { - .release = simple_child_release, + .release = simple_child_release, }; static const struct config_item_type simple_child_type = { @@ -185,15 +168,14 @@ static const struct config_item_type simple_child_type = { .ct_owner = THIS_MODULE, }; - struct simple_children { struct config_group group; }; static inline struct simple_children *to_simple_children(struct config_item *item) { - return item ? container_of(to_config_group(item), - struct simple_children, group) : NULL; + return container_of(to_config_group(item), + struct simple_children, group); } static struct config_item *simple_children_make_item(struct config_group *group, @@ -208,8 +190,6 @@ static struct config_item *simple_children_make_item(struct config_group *group, config_item_init_type_name(&simple_child->item, name, &simple_child_type); - simple_child->storeme = 0; - return &simple_child->item; } @@ -263,7 +243,6 @@ static struct configfs_subsystem simple_children_subsys = { }, }; - /* ----------------------------------------------------------------- */ /* @@ -350,9 +329,8 @@ static struct configfs_subsystem *example_subsys[] = { static int __init configfs_example_init(void) { - int ret; - int i; struct configfs_subsystem *subsys; + int ret, i; for (i = 0; example_subsys[i]; i++) { subsys = example_subsys[i]; @@ -361,9 +339,8 @@ static int __init configfs_example_init(void) mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); if (ret) { - printk(KERN_ERR "Error %d while registering subsystem %s\n", - ret, - subsys->su_group.cg_item.ci_namebuf); + pr_err("Error %d while registering subsystem %s\n", + ret, subsys->su_group.cg_item.ci_namebuf); goto out_unregister; } } diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore index d2b9c32accd4..0e26039f39b5 100644 --- a/samples/connector/.gitignore +++ b/samples/connector/.gitignore @@ -1 +1,2 @@ -ucon +# SPDX-License-Identifier: GPL-2.0-only +/ucon diff --git a/samples/connector/Makefile b/samples/connector/Makefile index b785cbde5ffa..d98a9e047c11 100644 --- a/samples/connector/Makefile +++ b/samples/connector/Makefile @@ -1,13 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o -# List of programs to build -hostprogs := ucon -always-y := $(hostprogs) +userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon -HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include - -all: modules - -modules clean: - $(MAKE) -C ../.. M=$(CURDIR) $@ +userccflags += -I usr/include diff --git a/samples/coresight/Makefile b/samples/coresight/Makefile new file mode 100644 index 000000000000..b3fce4af2347 --- /dev/null +++ b/samples/coresight/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight-cfg-sample.o +ccflags-y += -I$(srctree)/drivers/hwtracing/coresight diff --git a/samples/coresight/coresight-cfg-sample.c b/samples/coresight/coresight-cfg-sample.c new file mode 100644 index 000000000000..25485c80b5e3 --- /dev/null +++ b/samples/coresight/coresight-cfg-sample.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(C) 2020 Linaro Limited. All rights reserved. + * Author: Mike Leach <mike.leach@linaro.org> + */ + +#include "coresight-config.h" +#include "coresight-syscfg.h" + +/* create an alternate autofdo configuration */ + +/* we will provide 4 sets of preset parameter values */ +#define AFDO2_NR_PRESETS 4 +/* the total number of parameters in used features - strobing has 2 */ +#define AFDO2_NR_PARAM_SUM 2 + +static const char *afdo2_ref_names[] = { + "strobing", +}; + +/* + * set of presets leaves strobing window constant while varying period to allow + * experimentation with mark / space ratios for various workloads + */ +static u64 afdo2_presets[AFDO2_NR_PRESETS][AFDO2_NR_PARAM_SUM] = { + { 1000, 100 }, + { 1000, 1000 }, + { 1000, 5000 }, + { 1000, 10000 }, +}; + +struct cscfg_config_desc afdo2 = { + .name = "autofdo2", + .description = "Setup ETMs with strobing for autofdo\n" + "Supplied presets allow experimentation with mark-space ratio for various loads\n", + .nr_feat_refs = ARRAY_SIZE(afdo2_ref_names), + .feat_ref_names = afdo2_ref_names, + .nr_presets = AFDO2_NR_PRESETS, + .nr_total_params = AFDO2_NR_PARAM_SUM, + .presets = &afdo2_presets[0][0], +}; + +static struct cscfg_feature_desc *sample_feats[] = { + NULL +}; + +static struct cscfg_config_desc *sample_cfgs[] = { + &afdo2, + NULL +}; + +static struct cscfg_load_owner_info mod_owner = { + .type = CSCFG_OWNER_MODULE, + .owner_handle = THIS_MODULE, +}; + +/* module init and exit - just load and unload configs */ +static int __init cscfg_sample_init(void) +{ + return cscfg_load_config_sets(sample_cfgs, sample_feats, &mod_owner); +} + +static void __exit cscfg_sample_exit(void) +{ + cscfg_unload_config_sets(&mod_owner); +} + +module_init(cscfg_sample_init); +module_exit(cscfg_sample_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Mike Leach <mike.leach@linaro.org>"); +MODULE_DESCRIPTION("CoreSight Syscfg Example"); diff --git a/samples/fanotify/.gitignore b/samples/fanotify/.gitignore new file mode 100644 index 000000000000..d74593e8b2de --- /dev/null +++ b/samples/fanotify/.gitignore @@ -0,0 +1 @@ +fs-monitor diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile new file mode 100644 index 000000000000..e20db1bdde3b --- /dev/null +++ b/samples/fanotify/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += fs-monitor + +userccflags += -I usr/include -Wall + diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c new file mode 100644 index 000000000000..608db24c471e --- /dev/null +++ b/samples/fanotify/fs-monitor.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021, Collabora Ltd. + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <err.h> +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> +#include <sys/fanotify.h> +#include <sys/types.h> +#include <unistd.h> + +#ifndef FAN_FS_ERROR +#define FAN_FS_ERROR 0x00008000 +#define FAN_EVENT_INFO_TYPE_ERROR 5 + +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; +#endif + +#ifndef FILEID_INO32_GEN +#define FILEID_INO32_GEN 1 +#endif + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif + +static void print_fh(struct file_handle *fh) +{ + int i; + uint32_t *h = (uint32_t *) fh->f_handle; + + printf("\tfh: "); + for (i = 0; i < fh->handle_bytes; i++) + printf("%hhx", fh->f_handle[i]); + printf("\n"); + + printf("\tdecoded fh: "); + if (fh->handle_type == FILEID_INO32_GEN) + printf("inode=%u gen=%u\n", h[0], h[1]); + else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes) + printf("Type %d (Superblock error)\n", fh->handle_type); + else + printf("Type %d (Unknown)\n", fh->handle_type); + +} + +static void handle_notifications(char *buffer, int len) +{ + struct fanotify_event_metadata *event = + (struct fanotify_event_metadata *) buffer; + struct fanotify_event_info_header *info; + struct fanotify_event_info_error *err; + struct fanotify_event_info_fid *fid; + int off; + + for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) { + + if (event->mask != FAN_FS_ERROR) { + printf("unexpected FAN MARK: %llx\n", + (unsigned long long)event->mask); + goto next_event; + } + + if (event->fd != FAN_NOFD) { + printf("Unexpected fd (!= FAN_NOFD)\n"); + goto next_event; + } + + printf("FAN_FS_ERROR (len=%d)\n", event->event_len); + + for (off = sizeof(*event) ; off < event->event_len; + off += info->len) { + info = (struct fanotify_event_info_header *) + ((char *) event + off); + + switch (info->info_type) { + case FAN_EVENT_INFO_TYPE_ERROR: + err = (struct fanotify_event_info_error *) info; + + printf("\tGeneric Error Record: len=%d\n", + err->hdr.len); + printf("\terror: %d\n", err->error); + printf("\terror_count: %d\n", err->error_count); + break; + + case FAN_EVENT_INFO_TYPE_FID: + fid = (struct fanotify_event_info_fid *) info; + + printf("\tfsid: %x%x\n", + fid->fsid.val[0], fid->fsid.val[1]); + print_fh((struct file_handle *) &fid->handle); + break; + + default: + printf("\tUnknown info type=%d len=%d:\n", + info->info_type, info->len); + } + } +next_event: + printf("---\n\n"); + } +} + +int main(int argc, char **argv) +{ + int fd; + + char buffer[BUFSIZ]; + + if (argc < 2) { + printf("Missing path argument\n"); + return 1; + } + + fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY); + if (fd < 0) + errx(1, "fanotify_init"); + + if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, + FAN_FS_ERROR, AT_FDCWD, argv[1])) { + errx(1, "fanotify_mark"); + } + + while (1) { + int n = read(fd, buffer, BUFSIZ); + + if (n < 0) + errx(1, "read"); + + handle_notifications(buffer, n); + } + + return 0; +} diff --git a/samples/fprobe/Makefile b/samples/fprobe/Makefile new file mode 100644 index 000000000000..ecccbfa6e99b --- /dev/null +++ b/samples/fprobe/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_FPROBE) += fprobe_example.o diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c new file mode 100644 index 000000000000..e22da8573116 --- /dev/null +++ b/samples/fprobe/fprobe_example.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Here's a sample kernel module showing the use of fprobe to dump a + * stack trace and selected registers when kernel_clone() is called. + * + * For more information on theory of operation of kprobes, see + * Documentation/trace/kprobes.rst + * + * You will see the trace data in /var/log/messages and on the console + * whenever kernel_clone() is invoked to create a new process. + */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fprobe.h> +#include <linux/sched/debug.h> +#include <linux/slab.h> + +#define BACKTRACE_DEPTH 16 +#define MAX_SYMBOL_LEN 4096 +static struct fprobe sample_probe; +static unsigned long nhit; + +static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; +module_param_string(symbol, symbol, sizeof(symbol), 0644); +MODULE_PARM_DESC(symbol, "Probed symbol(s), given by comma separated symbols or a wildcard pattern."); + +static char nosymbol[MAX_SYMBOL_LEN] = ""; +module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644); +MODULE_PARM_DESC(nosymbol, "Not-probed symbols, given by a wildcard pattern."); + +static bool stackdump = true; +module_param(stackdump, bool, 0644); +MODULE_PARM_DESC(stackdump, "Enable stackdump."); + +static bool use_trace = false; +module_param(use_trace, bool, 0644); +MODULE_PARM_DESC(use_trace, "Use trace_printk instead of printk. This is only for debugging."); + +static void show_backtrace(void) +{ + unsigned long stacks[BACKTRACE_DEPTH]; + unsigned int len; + + len = stack_trace_save(stacks, BACKTRACE_DEPTH, 2); + stack_trace_print(stacks, len, 24); +} + +static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) +{ + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + else + pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + nhit++; + if (stackdump) + show_backtrace(); +} + +static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) +{ + unsigned long rip = instruction_pointer(regs); + + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + else + pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + nhit++; + if (stackdump) + show_backtrace(); +} + +static int __init fprobe_init(void) +{ + char *p, *symbuf = NULL; + const char **syms; + int ret, count, i; + + sample_probe.entry_handler = sample_entry_handler; + sample_probe.exit_handler = sample_exit_handler; + + if (strchr(symbol, '*')) { + /* filter based fprobe */ + ret = register_fprobe(&sample_probe, symbol, + nosymbol[0] == '\0' ? NULL : nosymbol); + goto out; + } else if (!strchr(symbol, ',')) { + symbuf = symbol; + ret = register_fprobe_syms(&sample_probe, (const char **)&symbuf, 1); + goto out; + } + + /* Comma separated symbols */ + symbuf = kstrdup(symbol, GFP_KERNEL); + if (!symbuf) + return -ENOMEM; + p = symbuf; + count = 1; + while ((p = strchr(++p, ',')) != NULL) + count++; + + pr_info("%d symbols found\n", count); + + syms = kcalloc(count, sizeof(char *), GFP_KERNEL); + if (!syms) { + kfree(symbuf); + return -ENOMEM; + } + + p = symbuf; + for (i = 0; i < count; i++) + syms[i] = strsep(&p, ","); + + ret = register_fprobe_syms(&sample_probe, syms, count); + kfree(syms); + kfree(symbuf); +out: + if (ret < 0) + pr_err("register_fprobe failed, returned %d\n", ret); + else + pr_info("Planted fprobe at %s\n", symbol); + + return ret; +} + +static void __exit fprobe_exit(void) +{ + unregister_fprobe(&sample_probe); + + pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n", + symbol, nhit, sample_probe.nmissed); +} + +module_init(fprobe_init) +module_exit(fprobe_exit) +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index 4ce896e10b2e..faf8cdb79c5f 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -3,6 +3,8 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi-modify.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index e04229d21475..39146fa83e20 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,6 +2,10 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/ftrace.h> +#include <asm/asm-offsets.h> + +extern void my_direct_func1(void); +extern void my_direct_func2(void); void my_direct_func1(void) { @@ -18,23 +22,77 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> + asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" " my_tramp1:" + ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" " call my_direct_func1\n" " leave\n" -" ret\n" +" .size my_tramp1, .-my_tramp1\n" + ASM_RET + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" " my_tramp2:" + ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" " call my_direct_func2\n" " leave\n" -" ret\n" + ASM_RET +" .size my_tramp2, .-my_tramp2\n" " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { (unsigned long)my_tramp1, diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c new file mode 100644 index 000000000000..65aa94d96f4e --- /dev/null +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/ftrace.h> +#include <asm/asm-offsets.h> + +extern void my_direct_func1(unsigned long ip); +extern void my_direct_func2(unsigned long ip); + +void my_direct_func1(unsigned long ip) +{ + trace_printk("my direct func1 ip %lx\n", ip); +} + +void my_direct_func2(unsigned long ip) +{ + trace_printk("my direct func2 ip %lx\n", ip); +} + +extern void my_tramp1(void *); +extern void my_tramp2(void *); + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func1\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func2\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +"\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +static unsigned long my_tramp = (unsigned long)my_tramp1; +static unsigned long tramps[2] = { + (unsigned long)my_tramp1, + (unsigned long)my_tramp2, +}; + +static struct ftrace_ops direct; + +static int simple_thread(void *arg) +{ + static int t; + int ret = 0; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2 * HZ); + + if (ret) + continue; + t ^= 1; + ret = modify_ftrace_direct_multi(&direct, tramps[t]); + if (!ret) + my_tramp = tramps[t]; + WARN_ON_ONCE(ret); + } + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init ftrace_direct_multi_init(void) +{ + int ret; + + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0); + + ret = register_ftrace_direct_multi(&direct, my_tramp); + + if (!ret) + simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); + return ret; +} + +static void __exit ftrace_direct_multi_exit(void) +{ + kthread_stop(simple_tsk); + unregister_ftrace_direct_multi(&direct, my_tramp); +} + +module_init(ftrace_direct_multi_init); +module_exit(ftrace_direct_multi_exit); + +MODULE_AUTHOR("Jiri Olsa"); +MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct_multi()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c new file mode 100644 index 000000000000..41ded7c615c7 --- /dev/null +++ b/samples/ftrace/ftrace-direct-multi.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/mm.h> /* for handle_mm_fault() */ +#include <linux/ftrace.h> +#include <linux/sched/stat.h> +#include <asm/asm-offsets.h> + +extern void my_direct_func(unsigned long ip); + +void my_direct_func(unsigned long ip) +{ + trace_printk("ip %lx\n", ip); +} + +extern void my_tramp(void *); + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +static struct ftrace_ops direct; + +static int __init ftrace_direct_multi_init(void) +{ + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0); + + return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp); +} + +static void __exit ftrace_direct_multi_exit(void) +{ + unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp); +} + +module_init(ftrace_direct_multi_init); +module_exit(ftrace_direct_multi_exit); + +MODULE_AUTHOR("Jiri Olsa"); +MODULE_DESCRIPTION("Example use case of using register_ftrace_direct_multi()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 27efa5f6ff52..6690468c5cc2 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -3,6 +3,10 @@ #include <linux/mm.h> /* for handle_mm_fault() */ #include <linux/ftrace.h> +#include <asm/asm-offsets.h> + +extern void my_direct_func(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -13,9 +17,16 @@ void my_direct_func(struct vm_area_struct *vma, extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> + asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" " my_tramp:" + ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" " pushq %rdi\n" @@ -26,10 +37,36 @@ asm ( " popq %rsi\n" " popq %rdi\n" " leave\n" -" ret\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" " .popsection\n" ); +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index a2e3063bd306..e8f1e440b9b8 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -3,6 +3,9 @@ #include <linux/sched.h> /* for wake_up_process() */ #include <linux/ftrace.h> +#include <asm/asm-offsets.h> + +extern void my_direct_func(struct task_struct *p); void my_direct_func(struct task_struct *p) { @@ -11,19 +14,52 @@ void my_direct_func(struct task_struct *p) extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> + asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" " my_tramp:" + ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" " pushq %rdi\n" " call my_direct_func\n" " popq %rdi\n" " leave\n" -" ret\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" " .popsection\n" ); +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c index d523450d73eb..6aba02a31c96 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -6,6 +6,7 @@ #include <linux/timer.h> #include <linux/err.h> #include <linux/jiffies.h> +#include <linux/workqueue.h> /* * Any file that uses trace points, must include the header. @@ -20,6 +21,16 @@ struct trace_array *tr; static void mytimer_handler(struct timer_list *unused); static struct task_struct *simple_tsk; +static void trace_work_fn(struct work_struct *work) +{ + /* + * Disable tracing for event "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", + false); +} +static DECLARE_WORK(trace_work, trace_work_fn); + /* * mytimer: Timer setup to disable tracing for event "sample_event". This * timer is only for the purposes of the sample module to demonstrate access of @@ -29,11 +40,7 @@ static DEFINE_TIMER(mytimer, mytimer_handler); static void mytimer_handler(struct timer_list *unused) { - /* - * Disable tracing for event "sample_event". - */ - trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", - false); + schedule_work(&trace_work); } static void simple_thread_func(int count) @@ -76,6 +83,7 @@ static int simple_thread(void *arg) simple_thread_func(count++); del_timer(&mytimer); + cancel_work_sync(&trace_work); /* * trace_array_put() decrements the reference counter associated with @@ -107,8 +115,12 @@ static int __init sample_trace_array_init(void) trace_printk_init_buffers(); simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); - if (IS_ERR(simple_tsk)) + if (IS_ERR(simple_tsk)) { + trace_array_put(tr); + trace_array_destroy(tr); return -1; + } + return 0; } diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore index 05e51a685242..5233ab63262e 100644 --- a/samples/hidraw/.gitignore +++ b/samples/hidraw/.gitignore @@ -1 +1,2 @@ -hid-example +# SPDX-License-Identifier: GPL-2.0-only +/hid-example diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile index 8bd25f77671f..594d989e5486 100644 --- a/samples/hidraw/Makefile +++ b/samples/hidraw/Makefile @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# List of programs to build -hostprogs := hid-example -always-y := $(hostprogs) +userprogs-always-y += hid-example -HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include - -all: hid-example +userccflags += -I usr/include diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c index 37a0ffcb4d63..0f73ace3c6c3 100644 --- a/samples/hidraw/hid-example.c +++ b/samples/hidraw/hid-example.c @@ -128,7 +128,7 @@ int main(int argc, char **argv) perror("HIDIOCGFEATURE"); } else { printf("ioctl HIDIOCGFEATURE returned: %d\n", res); - printf("Report data (not containing the report number):\n\t"); + printf("Report data:\n\t"); for (i = 0; i < res; i++) printf("%hhx ", buf[i]); puts("\n"); diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index c58504774118..418c46fe5ffc 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -23,7 +23,7 @@ struct perf_event * __percpu *sample_hbp; -static char ksym_name[KSYM_NAME_LEN] = "pid_max"; +static char ksym_name[KSYM_NAME_LEN] = "jiffies"; module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); @@ -41,11 +41,15 @@ static int __init hw_break_module_init(void) { int ret; struct perf_event_attr attr; + void *addr = __symbol_get(ksym_name); + + if (!addr) + return -ENXIO; hw_breakpoint_init(&attr); - attr.bp_addr = kallsyms_lookup_name(ksym_name); + attr.bp_addr = (unsigned long)addr; attr.bp_len = HW_BREAKPOINT_LEN_4; - attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + attr.bp_type = HW_BREAKPOINT_W; sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); if (IS_ERR((void __force *)sample_hbp)) { @@ -66,6 +70,7 @@ fail: static void __exit hw_break_module_exit(void) { unregister_wide_hw_breakpoint(sample_hbp); + symbol_put(ksym_name); printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); } diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c index c1c2fa0f62c2..82736e5a5e32 100644 --- a/samples/kdb/kdb_hello.c +++ b/samples/kdb/kdb_hello.c @@ -28,28 +28,26 @@ static int kdb_hello_cmd(int argc, const char **argv) return 0; } +static kdbtab_t hello_cmd = { + .name = "hello", + .func = kdb_hello_cmd, + .usage = "[string]", + .help = "Say Hello World or Hello [string]", +}; static int __init kdb_hello_cmd_init(void) { /* * Registration of a dynamically added kdb command is done with - * kdb_register() with the arguments being: - * 1: The name of the shell command - * 2: The function that processes the command - * 3: Description of the usage of any arguments - * 4: Descriptive text when you run help - * 5: Number of characters to complete the command - * 0 == type the whole command - * 1 == match both "g" and "go" for example + * kdb_register(). */ - kdb_register("hello", kdb_hello_cmd, "[string]", - "Say Hello World or Hello [string]", 0); + kdb_register(&hello_cmd); return 0; } static void __exit kdb_hello_cmd_exit(void) { - kdb_unregister("hello"); + kdb_unregister(&hello_cmd); } module_init(kdb_hello_cmd_init); diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index c406f03ee551..642d0748c169 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "bytestream-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -116,14 +116,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -132,14 +134,16 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index 78977fc4a23f..c61482ba94f4 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "int-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -109,14 +109,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -125,14 +127,16 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c index c507998a2617..e4087b2d3fc4 100644 --- a/samples/kfifo/record-example.c +++ b/samples/kfifo/record-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "record-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -123,14 +123,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -139,14 +141,16 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static const struct proc_ops fifo_proc_ops = { diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile new file mode 100644 index 000000000000..16b6132c540c --- /dev/null +++ b/samples/kmemleak/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c new file mode 100644 index 000000000000..7b476eb8285f --- /dev/null +++ b/samples/kmemleak/kmemleak-test.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * samples/kmemleak/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + */ + +#define pr_fmt(fmt) "kmemleak: " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/fdtable.h> + +#include <linux/kmemleak.h> + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); + +/* + * Some very simple testing. This function needs to be extended for + * proper testing. + */ +static int __init kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + pr_info("Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); + if (!elem) + return -ENOMEM; + INIT_LIST_HEAD(&elem->list); + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmalloc(129) = %p\n", + per_cpu(kmemleak_test_pointer, i)); + } + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. + */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_LICENSE("GPL"); diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c index 9e383fdbaa00..96678ed73216 100644 --- a/samples/kobject/kobject-example.c +++ b/samples/kobject/kobject-example.c @@ -28,7 +28,7 @@ static int bar; static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", foo); + return sysfs_emit(buf, "%d\n", foo); } static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -60,7 +60,7 @@ static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr, var = baz; else var = bar; - return sprintf(buf, "%d\n", var); + return sysfs_emit(buf, "%d\n", var); } static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr, diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c index c8010f126808..52f1acabd479 100644 --- a/samples/kobject/kset-example.c +++ b/samples/kobject/kset-example.c @@ -112,7 +112,7 @@ static void foo_release(struct kobject *kobj) static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", foo_obj->foo); + return sysfs_emit(buf, "%d\n", foo_obj->foo); } static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr, @@ -144,7 +144,7 @@ static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr, var = foo_obj->baz; else var = foo_obj->bar; - return sprintf(buf, "%d\n", var); + return sysfs_emit(buf, "%d\n", var); } static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr, diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index d693c23a85e8..fd346f58ddba 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -1,23 +1,23 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * NOTE: This example is works on x86 and powerpc. * Here's a sample kernel module showing the use of kprobes to dump a - * stack trace and selected registers when _do_fork() is called. + * stack trace and selected registers when kernel_clone() is called. * * For more information on theory of operation of kprobes, see - * Documentation/kprobes.txt + * Documentation/trace/kprobes.rst * * You will see the trace data in /var/log/messages and on the console - * whenever _do_fork() is invoked to create a new process. + * whenever kernel_clone() is invoked to create a new process. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/module.h> #include <linux/kprobes.h> -#define MAX_SYMBOL_LEN 64 -static char symbol[MAX_SYMBOL_LEN] = "_do_fork"; -module_param_string(symbol, symbol, sizeof(symbol), 0644); +static char symbol[KSYM_NAME_LEN] = "kernel_clone"; +module_param_string(symbol, symbol, KSYM_NAME_LEN, 0644); /* For each probe you need to allocate a kprobe structure */ static struct kprobe kp = { @@ -25,27 +25,34 @@ static struct kprobe kp = { }; /* kprobe pre_handler: called just before the probed instruction is executed */ -static int handler_pre(struct kprobe *p, struct pt_regs *regs) +static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) { #ifdef CONFIG_X86 - pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", p->symbol_name, p->addr, regs->ip, regs->flags); #endif #ifdef CONFIG_PPC - pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", p->symbol_name, p->addr, regs->nip, regs->msr); #endif #ifdef CONFIG_MIPS - pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status); #endif #ifdef CONFIG_ARM64 - pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx," - " pstate = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); #endif +#ifdef CONFIG_ARM + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr); +#endif +#ifdef CONFIG_RISCV + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n", + p->symbol_name, p->addr, regs->epc, regs->status); +#endif #ifdef CONFIG_S390 - pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", + pr_info("<%s> p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", p->symbol_name, p->addr, regs->psw.addr, regs->flags); #endif @@ -54,49 +61,44 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs) } /* kprobe post_handler: called after the probed instruction is executed */ -static void handler_post(struct kprobe *p, struct pt_regs *regs, +static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { #ifdef CONFIG_X86 - pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, flags = 0x%lx\n", p->symbol_name, p->addr, regs->flags); #endif #ifdef CONFIG_PPC - pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, msr = 0x%lx\n", p->symbol_name, p->addr, regs->msr); #endif #ifdef CONFIG_MIPS - pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_status); #endif #ifdef CONFIG_ARM64 - pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pstate); #endif +#ifdef CONFIG_ARM + pr_info("<%s> p->addr = 0x%p, cpsr = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->ARM_cpsr); +#endif +#ifdef CONFIG_RISCV + pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n", + p->symbol_name, p->addr, regs->status); +#endif #ifdef CONFIG_S390 - pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n", + pr_info("<%s> p->addr, 0x%p, flags = 0x%lx\n", p->symbol_name, p->addr, regs->flags); #endif } -/* - * fault_handler: this is called if an exception is generated for any - * instruction within the pre- or post-handler, or when Kprobes - * single-steps the probed instruction. - */ -static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) -{ - pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr); - /* Return 0 because we don't handle the fault. */ - return 0; -} - static int __init kprobe_init(void) { int ret; kp.pre_handler = handler_pre; kp.post_handler = handler_post; - kp.fault_handler = handler_fault; ret = register_kprobe(&kp); if (ret < 0) { diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 186315ca88b3..cbf16542d84e 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -8,10 +8,10 @@ * * usage: insmod kretprobe_example.ko func=<func_name> * - * If no func_name is specified, _do_fork is instrumented + * If no func_name is specified, kernel_clone is instrumented * * For more information on theory of operation of kretprobes, see - * Documentation/kprobes.txt + * Documentation/trace/kprobes.rst * * Build and insert the kernel module as done in the kprobe example. * You will see the trace data in /var/log/messages and on the console @@ -23,11 +23,10 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/ktime.h> -#include <linux/limits.h> #include <linux/sched.h> -static char func_name[NAME_MAX] = "_do_fork"; -module_param_string(func, func_name, NAME_MAX, S_IRUGO); +static char func_name[KSYM_NAME_LEN] = "kernel_clone"; +module_param_string(func, func_name, KSYM_NAME_LEN, 0644); MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" " function's execution time"); @@ -48,6 +47,7 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) data->entry_stamp = ktime_get(); return 0; } +NOKPROBE_SYMBOL(entry_handler); /* * Return-probe handler: Log the return value and duration. Duration may turn @@ -67,6 +67,7 @@ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) func_name, retval, (long long)delta); return 0; } +NOKPROBE_SYMBOL(ret_handler); static struct kretprobe my_kretprobe = { .handler = ret_handler, @@ -84,7 +85,7 @@ static int __init kretprobe_init(void) ret = register_kretprobe(&my_kretprobe); if (ret < 0) { pr_err("register_kretprobe failed, returned %d\n", ret); - return -1; + return ret; } pr_info("Planted return probe at %s: %p\n", my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); diff --git a/samples/landlock/.gitignore b/samples/landlock/.gitignore new file mode 100644 index 000000000000..f43668b2d318 --- /dev/null +++ b/samples/landlock/.gitignore @@ -0,0 +1 @@ +/sandboxer diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile new file mode 100644 index 000000000000..5d601e51c2eb --- /dev/null +++ b/samples/landlock/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: BSD-3-Clause + +userprogs-always-y := sandboxer + +userccflags += -I usr/include + +.PHONY: all clean + +all: + $(MAKE) -C ../.. samples/landlock/ + +clean: + $(MAKE) -C ../.. M=samples/landlock/ clean diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c new file mode 100644 index 000000000000..f29bb3c72230 --- /dev/null +++ b/samples/landlock/sandboxer.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Simple Landlock sandbox manager able to launch a process restricted by a + * user-defined filesystem access control policy. + * + * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> + * Copyright © 2020 ANSSI + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <linux/landlock.h> +#include <linux/prctl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef landlock_create_ruleset +static inline int +landlock_create_ruleset(const struct landlock_ruleset_attr *const attr, + const size_t size, const __u32 flags) +{ + return syscall(__NR_landlock_create_ruleset, attr, size, flags); +} +#endif + +#ifndef landlock_add_rule +static inline int landlock_add_rule(const int ruleset_fd, + const enum landlock_rule_type rule_type, + const void *const rule_attr, + const __u32 flags) +{ + return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type, rule_attr, + flags); +} +#endif + +#ifndef landlock_restrict_self +static inline int landlock_restrict_self(const int ruleset_fd, + const __u32 flags) +{ + return syscall(__NR_landlock_restrict_self, ruleset_fd, flags); +} +#endif + +#define ENV_FS_RO_NAME "LL_FS_RO" +#define ENV_FS_RW_NAME "LL_FS_RW" +#define ENV_PATH_TOKEN ":" + +static int parse_path(char *env_path, const char ***const path_list) +{ + int i, num_paths = 0; + + if (env_path) { + num_paths++; + for (i = 0; env_path[i]; i++) { + if (env_path[i] == ENV_PATH_TOKEN[0]) + num_paths++; + } + } + *path_list = malloc(num_paths * sizeof(**path_list)); + for (i = 0; i < num_paths; i++) + (*path_list)[i] = strsep(&env_path, ENV_PATH_TOKEN); + + return num_paths; +} + +/* clang-format off */ + +#define ACCESS_FILE ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_READ_FILE) + +/* clang-format on */ + +static int populate_ruleset(const char *const env_var, const int ruleset_fd, + const __u64 allowed_access) +{ + int num_paths, i, ret = 1; + char *env_path_name; + const char **path_list = NULL; + struct landlock_path_beneath_attr path_beneath = { + .parent_fd = -1, + }; + + env_path_name = getenv(env_var); + if (!env_path_name) { + /* Prevents users to forget a setting. */ + fprintf(stderr, "Missing environment variable %s\n", env_var); + return 1; + } + env_path_name = strdup(env_path_name); + unsetenv(env_var); + num_paths = parse_path(env_path_name, &path_list); + if (num_paths == 1 && path_list[0][0] == '\0') { + /* + * Allows to not use all possible restrictions (e.g. use + * LL_FS_RO without LL_FS_RW). + */ + ret = 0; + goto out_free_name; + } + + for (i = 0; i < num_paths; i++) { + struct stat statbuf; + + path_beneath.parent_fd = open(path_list[i], O_PATH | O_CLOEXEC); + if (path_beneath.parent_fd < 0) { + fprintf(stderr, "Failed to open \"%s\": %s\n", + path_list[i], strerror(errno)); + goto out_free_name; + } + if (fstat(path_beneath.parent_fd, &statbuf)) { + close(path_beneath.parent_fd); + goto out_free_name; + } + path_beneath.allowed_access = allowed_access; + if (!S_ISDIR(statbuf.st_mode)) + path_beneath.allowed_access &= ACCESS_FILE; + if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)) { + fprintf(stderr, + "Failed to update the ruleset with \"%s\": %s\n", + path_list[i], strerror(errno)); + close(path_beneath.parent_fd); + goto out_free_name; + } + close(path_beneath.parent_fd); + } + ret = 0; + +out_free_name: + free(path_list); + free(env_path_name); + return ret; +} + +/* clang-format off */ + +#define ACCESS_FS_ROUGHLY_READ ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_READ_DIR) + +#define ACCESS_FS_ROUGHLY_WRITE ( \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_REMOVE_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_FILE | \ + LANDLOCK_ACCESS_FS_MAKE_CHAR | \ + LANDLOCK_ACCESS_FS_MAKE_DIR | \ + LANDLOCK_ACCESS_FS_MAKE_REG | \ + LANDLOCK_ACCESS_FS_MAKE_SOCK | \ + LANDLOCK_ACCESS_FS_MAKE_FIFO | \ + LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + LANDLOCK_ACCESS_FS_MAKE_SYM | \ + LANDLOCK_ACCESS_FS_REFER) + +/* clang-format on */ + +#define LANDLOCK_ABI_LAST 2 + +int main(const int argc, char *const argv[], char *const *const envp) +{ + const char *cmd_path; + char *const *cmd_argv; + int ruleset_fd, abi; + __u64 access_fs_ro = ACCESS_FS_ROUGHLY_READ, + access_fs_rw = ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_WRITE; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = access_fs_rw, + }; + + if (argc < 2) { + fprintf(stderr, + "usage: %s=\"...\" %s=\"...\" %s <cmd> [args]...\n\n", + ENV_FS_RO_NAME, ENV_FS_RW_NAME, argv[0]); + fprintf(stderr, + "Launch a command in a restricted environment.\n\n"); + fprintf(stderr, "Environment variables containing paths, " + "each separated by a colon:\n"); + fprintf(stderr, + "* %s: list of paths allowed to be used in a read-only way.\n", + ENV_FS_RO_NAME); + fprintf(stderr, + "* %s: list of paths allowed to be used in a read-write way.\n", + ENV_FS_RW_NAME); + fprintf(stderr, + "\nexample:\n" + "%s=\"/bin:/lib:/usr:/proc:/etc:/dev/urandom\" " + "%s=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" " + "%s bash -i\n\n", + ENV_FS_RO_NAME, ENV_FS_RW_NAME, argv[0]); + fprintf(stderr, + "This sandboxer can use Landlock features " + "up to ABI version %d.\n", + LANDLOCK_ABI_LAST); + return 1; + } + + abi = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION); + if (abi < 0) { + const int err = errno; + + perror("Failed to check Landlock compatibility"); + switch (err) { + case ENOSYS: + fprintf(stderr, + "Hint: Landlock is not supported by the current kernel. " + "To support it, build the kernel with " + "CONFIG_SECURITY_LANDLOCK=y and prepend " + "\"landlock,\" to the content of CONFIG_LSM.\n"); + break; + case EOPNOTSUPP: + fprintf(stderr, + "Hint: Landlock is currently disabled. " + "It can be enabled in the kernel configuration by " + "prepending \"landlock,\" to the content of CONFIG_LSM, " + "or at boot time by setting the same content to the " + "\"lsm\" kernel parameter.\n"); + break; + } + return 1; + } + + /* Best-effort security. */ + switch (abi) { + case 1: + /* Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER; + + fprintf(stderr, + "Hint: You should update the running kernel " + "to leverage Landlock features " + "provided by ABI version %d (instead of %d).\n", + LANDLOCK_ABI_LAST, abi); + __attribute__((fallthrough)); + case LANDLOCK_ABI_LAST: + break; + default: + fprintf(stderr, + "Hint: You should update this sandboxer " + "to leverage Landlock features " + "provided by ABI version %d (instead of %d).\n", + abi, LANDLOCK_ABI_LAST); + } + access_fs_ro &= ruleset_attr.handled_access_fs; + access_fs_rw &= ruleset_attr.handled_access_fs; + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + if (ruleset_fd < 0) { + perror("Failed to create a ruleset"); + return 1; + } + if (populate_ruleset(ENV_FS_RO_NAME, ruleset_fd, access_fs_ro)) { + goto err_close_ruleset; + } + if (populate_ruleset(ENV_FS_RW_NAME, ruleset_fd, access_fs_rw)) { + goto err_close_ruleset; + } + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("Failed to restrict privileges"); + goto err_close_ruleset; + } + if (landlock_restrict_self(ruleset_fd, 0)) { + perror("Failed to enforce ruleset"); + goto err_close_ruleset; + } + close(ruleset_fd); + + cmd_path = argv[1]; + cmd_argv = argv + 1; + execvpe(cmd_path, cmd_argv, envp); + fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_path, + strerror(errno)); + fprintf(stderr, "Hint: access to the binary, the interpreter or " + "shared libraries may be denied.\n"); + return 1; + +err_close_ruleset: + close(ruleset_fd); + return 1; +} diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c index 918ce17b43fd..6701641bf12d 100644 --- a/samples/livepatch/livepatch-shadow-fix1.c +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -109,9 +109,9 @@ static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data) void *d = obj; int **shadow_leak = shadow_data; - kfree(*shadow_leak); pr_info("%s: dummy @ %p, prevented leak @ %p\n", __func__, d, *shadow_leak); + kfree(*shadow_leak); } static void livepatch_fix1_dummy_free(struct dummy *d) diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c index 29fe5cd42047..361046a4f10c 100644 --- a/samples/livepatch/livepatch-shadow-fix2.c +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -61,9 +61,9 @@ static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data) void *d = obj; int **shadow_leak = shadow_data; - kfree(*shadow_leak); pr_info("%s: dummy @ %p, prevented leak @ %p\n", __func__, d, *shadow_leak); + kfree(*shadow_leak); } static void livepatch_fix2_dummy_free(struct dummy *d) diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore index f356b81ca1ec..fe894bcb6a62 100644 --- a/samples/mei/.gitignore +++ b/samples/mei/.gitignore @@ -1 +1,2 @@ -mei-amt-version +# SPDX-License-Identifier: GPL-2.0-only +/mei-amt-version diff --git a/samples/mei/Makefile b/samples/mei/Makefile index f5b9d02be2cd..c54b8a0ab04e 100644 --- a/samples/mei/Makefile +++ b/samples/mei/Makefile @@ -1,10 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2012-2019, Intel Corporation. All rights reserved. +userprogs-always-y += mei-amt-version -hostprogs := mei-amt-version - -HOSTCFLAGS_mei-amt-version.o += -I$(objtree)/usr/include - -always-y := $(hostprogs) - -all: mei-amt-version +userccflags += -I usr/include diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c index 32234481ad7d..867debd3b912 100644 --- a/samples/mei/mei-amt-version.c +++ b/samples/mei/mei-amt-version.c @@ -154,31 +154,52 @@ err: static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer, ssize_t len, unsigned long timeout) { + struct timeval tv; + fd_set set; ssize_t rc; + tv.tv_sec = timeout / 1000; + tv.tv_usec = (timeout % 1000) * 1000000; + mei_msg(me, "call read length = %zd\n", len); + FD_ZERO(&set); + FD_SET(me->fd, &set); + rc = select(me->fd + 1, &set, NULL, NULL, &tv); + if (rc > 0 && FD_ISSET(me->fd, &set)) { + mei_msg(me, "have reply\n"); + } else if (rc == 0) { + rc = -1; + mei_err(me, "read failed on timeout\n"); + goto out; + } else { /* rc < 0 */ + rc = errno; + mei_err(me, "read failed on select with status %zd %s\n", + rc, strerror(errno)); + goto out; + } + rc = read(me->fd, buffer, len); if (rc < 0) { mei_err(me, "read failed with status %zd %s\n", rc, strerror(errno)); - mei_deinit(me); - } else { - mei_msg(me, "read succeeded with result %zd\n", rc); + goto out; } + + mei_msg(me, "read succeeded with result %zd\n", rc); + +out: + if (rc < 0) + mei_deinit(me); + return rc; } static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, ssize_t len, unsigned long timeout) { - struct timeval tv; ssize_t written; ssize_t rc; - fd_set set; - - tv.tv_sec = timeout / 1000; - tv.tv_usec = (timeout % 1000) * 1000000; mei_msg(me, "call write length = %zd\n", len); @@ -189,19 +210,7 @@ static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, written, strerror(errno)); goto out; } - - FD_ZERO(&set); - FD_SET(me->fd, &set); - rc = select(me->fd + 1 , &set, NULL, NULL, &tv); - if (rc > 0 && FD_ISSET(me->fd, &set)) { - mei_msg(me, "write success\n"); - } else if (rc == 0) { - mei_err(me, "write failed on timeout with status\n"); - goto out; - } else { /* rc < 0 */ - mei_err(me, "write failed on select with status %zd\n", rc); - goto out; - } + mei_msg(me, "write success\n"); rc = written; out: @@ -267,7 +276,7 @@ struct amt_host_if_msg_header { struct amt_host_if_resp_header { struct amt_host_if_msg_header header; uint32_t status; - unsigned char data[0]; + unsigned char data[]; } __attribute__((packed)); const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \ diff --git a/samples/mic/mpssd/.gitignore b/samples/mic/mpssd/.gitignore deleted file mode 100644 index 8b7c72f07c92..000000000000 --- a/samples/mic/mpssd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -mpssd diff --git a/samples/mic/mpssd/Makefile b/samples/mic/mpssd/Makefile deleted file mode 100644 index a7a6e0c70424..000000000000 --- a/samples/mic/mpssd/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) - -ifeq ($(ARCH),x86) - -PROGS := mpssd -CC = $(CROSS_COMPILE)gcc -CFLAGS := -I../../../usr/include -I../../../tools/include - -ifdef DEBUG -CFLAGS += -DDEBUG=$(DEBUG) -endif - -all: $(PROGS) -mpssd: mpssd.c sysfs.c - $(CC) $(CFLAGS) mpssd.c sysfs.c -o mpssd -lpthread - -install: - install mpssd /usr/sbin/mpssd - install micctrl /usr/sbin/micctrl - -clean: - rm -fr $(PROGS) - -endif -endif diff --git a/samples/mic/mpssd/micctrl b/samples/mic/mpssd/micctrl deleted file mode 100755 index 030a60b04046..000000000000 --- a/samples/mic/mpssd/micctrl +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# micctrl - Controls MIC boot/start/stop. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: micctrl -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -sysfs="/sys/class/mic" - -_status() -{ - f=$sysfs/$1 - echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`" -} - -status() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _status $1 - return $? - fi - for f in $sysfs/* - do - _status `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_reset() -{ - f=$sysfs/$1 - echo reset > $f/state -} - -reset() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _reset $1 - return $? - fi - for f in $sysfs/* - do - _reset `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_boot() -{ - f=$sysfs/$1 - echo "linux" > $f/bootmode - echo "mic/uos.img" > $f/firmware - echo "mic/$1.image" > $f/ramdisk - echo "boot" > $f/state -} - -boot() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _boot $1 - return $? - fi - for f in $sysfs/* - do - _boot `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_shutdown() -{ - f=$sysfs/$1 - echo shutdown > $f/state -} - -shutdown() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _shutdown $1 - return $? - fi - for f in $sysfs/* - do - _shutdown `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_wait() -{ - f=$sysfs/$1 - while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ] - do - sleep 1 - echo -e "Waiting for $1 to go offline" - done -} - -wait() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _wait $1 - return $? - fi - # Wait for the cards to go offline - for f in $sysfs/* - do - _wait `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -if [ ! -d "$sysfs" ]; then - echo -e $"Module unloaded " - exit 3 -fi - -case $1 in - -s) - status $2 - ;; - -r) - reset $2 - ;; - -b) - boot $2 - ;; - -S) - shutdown $2 - ;; - -w) - wait $2 - ;; - *) - echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpss b/samples/mic/mpssd/mpss deleted file mode 100755 index 248ac7313c71..000000000000 --- a/samples/mic/mpssd/mpss +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# mpss Start mpssd. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: mpss -# Required-Start: -# Required-Stop: -# Short-Description: MPSS stack control -# Description: MPSS stack control -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -exec=/usr/sbin/mpssd -sysfs="/sys/class/mic" -mic_modules="mic_host mic_x100_dma scif vop" - -start() -{ - [ -x $exec ] || exit 5 - - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then - echo -e $"MPSSD already running! " - success - echo - return 0 - fi - - echo -e $"Starting MPSS Stack" - echo -e $"Loading MIC drivers:" $mic_modules - - modprobe -a $mic_modules - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - - # Start the daemon - echo -n $"Starting MPSSD " - $exec - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - success - echo - - sleep 5 - - # Boot the cards - micctrl -b - - # Wait till ping works - for f in $sysfs/* - do - count=100 - ipaddr=`cat $f/cmdline` - ipaddr=${ipaddr#*address,} - ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1` - while [ $count -ge 0 ] - do - echo -e "Pinging "`basename $f`" " - ping -c 1 $ipaddr &> /dev/null - RETVAL=$? - if [ $RETVAL -eq 0 ]; then - success - break - fi - sleep 1 - count=`expr $count - 1` - done - [ $RETVAL -ne 0 ] && failure || success - echo - done - return $RETVAL -} - -stop() -{ - echo -e $"Shutting down MPSS Stack: " - - # Bail out if module is unloaded - if [ ! -d "$sysfs" ]; then - echo -n $"Module unloaded " - success - echo - return 0 - fi - - # Shut down the cards. - micctrl -S - - # Wait for the cards to go offline - for f in $sysfs/* - do - while [ "`cat $f/state`" != "ready" ] - do - sleep 1 - echo -e "Waiting for "`basename $f`" to become ready" - done - done - - # Display the status of the cards - micctrl -s - - # Kill MPSSD now - echo -n $"Killing MPSSD" - killall -9 mpssd 2>/dev/null - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -restart() -{ - stop - sleep 5 - start -} - -status() -{ - micctrl -s - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then - echo "mpssd is running" - else - echo "mpssd is stopped" - fi - return 0 -} - -unload() -{ - if [ ! -d "$sysfs" ]; then - echo -n $"No MIC_HOST Module: " - success - echo - return - fi - - stop - - sleep 5 - echo -n $"Removing MIC drivers:" $mic_modules - modprobe -r $mic_modules - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -case $1 in - start) - start - ;; - stop) - stop - ;; - restart) - restart - ;; - status) - status - ;; - unload) - unload - ;; - *) - echo $"Usage: $0 {start|stop|restart|status|unload}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c deleted file mode 100644 index a11bf6c5b53b..000000000000 --- a/samples/mic/mpssd/mpssd.c +++ /dev/null @@ -1,1815 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ - -#define _GNU_SOURCE - -#include <stdlib.h> -#include <fcntl.h> -#include <getopt.h> -#include <assert.h> -#include <unistd.h> -#include <stdbool.h> -#include <signal.h> -#include <poll.h> -#include <features.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/mman.h> -#include <sys/socket.h> -#include <linux/virtio_ring.h> -#include <linux/virtio_net.h> -#include <linux/virtio_console.h> -#include <linux/virtio_blk.h> -#include <linux/version.h> -#include "mpssd.h" -#include <linux/mic_ioctl.h> -#include <linux/mic_common.h> -#include <tools/endian.h> - -static void *init_mic(void *arg); - -static FILE *logfp; -static struct mic_info mic_list; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1 : __min2; }) - -/* align addr on a size boundary - adjust address up/down if needed */ -#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) -#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size) - -/* align addr on a size boundary - adjust address up if needed */ -#define _ALIGN(addr, size) _ALIGN_UP(addr, size) - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - -#define GSO_ENABLED 1 -#define MAX_GSO_SIZE (64 * 1024) -#define ETH_H_LEN 14 -#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64)) -#define MIC_DEVICE_PAGE_END 0x1000 - -#ifndef VIRTIO_NET_HDR_F_DATA_VALID -#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ -#endif - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_console_config cons_config; -} virtcons_dev_page = { - .dd = { - .type = VIRTIO_ID_CONSOLE, - .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig), - .feature_len = sizeof(virtcons_dev_page.host_features), - .config_len = sizeof(virtcons_dev_page.cons_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -}; - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_net_config net_config; -} virtnet_dev_page = { - .dd = { - .type = VIRTIO_ID_NET, - .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig), - .feature_len = sizeof(virtnet_dev_page.host_features), - .config_len = sizeof(virtnet_dev_page.net_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -#if GSO_ENABLED - .host_features = htole32( - 1 << VIRTIO_NET_F_CSUM | - 1 << VIRTIO_NET_F_GSO | - 1 << VIRTIO_NET_F_GUEST_TSO4 | - 1 << VIRTIO_NET_F_GUEST_TSO6 | - 1 << VIRTIO_NET_F_GUEST_ECN), -#else - .host_features = 0, -#endif -}; - -static const char *mic_config_dir = "/etc/mpss"; -static const char *virtblk_backend = "VIRTBLK_BACKEND"; -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[1]; - __u32 host_features, guest_acknowledgements; - struct virtio_blk_config blk_config; -} virtblk_dev_page = { - .dd = { - .type = VIRTIO_ID_BLOCK, - .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig), - .feature_len = sizeof(virtblk_dev_page.host_features), - .config_len = sizeof(virtblk_dev_page.blk_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .host_features = - htole32(1<<VIRTIO_BLK_F_SEG_MAX), - .blk_config = { - .seg_max = htole32(MIC_VRING_ENTRIES - 2), - .capacity = htole64(0), - } -}; - -static char *myname; - -static int -tap_configure(struct mic_info *mic, char *dev) -{ - pid_t pid; - char *ifargv[7]; - char ipaddr[IFNAMSIZ]; - int ret = 0; - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "link"; - ifargv[2] = "set"; - ifargv[3] = dev; - ifargv[4] = "up"; - ifargv[5] = NULL; - mpsslog("Configuring %s\n", dev); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id + 1); - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "addr"; - ifargv[2] = "add"; - ifargv[3] = ipaddr; - ifargv[4] = "dev"; - ifargv[5] = dev; - ifargv[6] = NULL; - mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - mpsslog("MIC name %s %s %d DONE!\n", - mic->name, __func__, __LINE__); - return 0; -} - -static int tun_alloc(struct mic_info *mic, char *dev) -{ - struct ifreq ifr; - int fd, err; -#if GSO_ENABLED - unsigned offload; -#endif - fd = open("/dev/net/tun", O_RDWR); - if (fd < 0) { - mpsslog("Could not open /dev/net/tun %s\n", strerror(errno)); - goto done; - } - - memset(&ifr, 0, sizeof(ifr)); - - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - if (*dev) - strncpy(ifr.ifr_name, dev, IFNAMSIZ); - - err = ioctl(fd, TUNSETIFF, (void *)&ifr); - if (err < 0) { - mpsslog("%s %s %d TUNSETIFF failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#if GSO_ENABLED - offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN; - - err = ioctl(fd, TUNSETOFFLOAD, offload); - if (err < 0) { - mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#endif - strcpy(dev, ifr.ifr_name); - mpsslog("Created TAP %s\n", dev); -done: - return fd; -} - -#define NET_FD_VIRTIO_NET 0 -#define NET_FD_TUN 1 -#define MAX_NET_FD 2 - -static void set_dp(struct mic_info *mic, int type, void *dp) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - mic->mic_console.console_dp = dp; - return; - case VIRTIO_ID_NET: - mic->mic_net.net_dp = dp; - return; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.block_dp = dp; - return; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); -} - -static void *get_dp(struct mic_info *mic, int type) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - return mic->mic_console.console_dp; - case VIRTIO_ID_NET: - return mic->mic_net.net_dp; - case VIRTIO_ID_BLOCK: - return mic->mic_virtblk.block_dp; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); - return NULL; -} - -static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type) -{ - struct mic_device_desc *d; - int i; - void *dp = get_dp(mic, type); - - for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE; - i += mic_total_desc_size(d)) { - d = dp + i; - - /* End of list */ - if (d->type == 0) - break; - - if (d->type == -1) - continue; - - mpsslog("%s %s d-> type %d d %p\n", - mic->name, __func__, d->type, d); - - if (d->type == (__u8)type) - return d; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - return NULL; -} - -/* See comments in vhost.c for explanation of next_desc() */ -static unsigned next_desc(struct vring_desc *desc) -{ - unsigned int next; - - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) - return -1U; - next = le16toh(desc->next); - return next; -} - -/* Sum up all the IOVEC length */ -static ssize_t -sum_iovec_len(struct mic_copy_desc *copy) -{ - ssize_t sum = 0; - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - sum += copy->iov[i].iov_len; - return sum; -} - -static inline void verify_out_len(struct mic_info *mic, - struct mic_copy_desc *copy) -{ - if (copy->out_len != sum_iovec_len(copy)) { - mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%zx\n", - mic->name, __func__, __LINE__, - copy->out_len, sum_iovec_len(copy)); - assert(copy->out_len == sum_iovec_len(copy)); - } -} - -/* Display an iovec */ -static void -disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, - const char *s, int line) -{ - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%zx\n", - mic->name, s, line, i, - copy->iov[i].iov_base, copy->iov[i].iov_len); -} - -static inline __u16 read_avail_idx(struct mic_vring *vr) -{ - return READ_ONCE(vr->info->avail_idx); -} - -static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, - struct mic_copy_desc *copy, ssize_t len) -{ - copy->vr_idx = tx ? 0 : 1; - copy->update_used = true; - if (type == VIRTIO_ID_NET) - copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr); - else - copy->iov[0].iov_len = len; -} - -/* Central API which triggers the copies */ -static int -mic_virtio_copy(struct mic_info *mic, int fd, - struct mic_vring *vr, struct mic_copy_desc *copy) -{ - int ret; - - ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy); - if (ret) { - mpsslog("%s %s %d errno %s ret %d\n", - mic->name, __func__, __LINE__, - strerror(errno), ret); - } - return ret; -} - -static inline unsigned _vring_size(unsigned int num, unsigned long align) -{ - return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) - + align - 1) & ~(align - 1)) - + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num; -} - -/* - * This initialization routine requires at least one - * vring i.e. vr0. vr1 is optional. - */ -static void * -init_vr(struct mic_info *mic, int fd, int type, - struct mic_vring *vr0, struct mic_vring *vr1, int num_vq) -{ - int vr_size; - char *va; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq, - PROT_READ, MAP_SHARED, fd, 0); - if (MAP_FAILED == va) { - mpsslog("%s %s %d mmap failed errno %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - goto done; - } - set_dp(mic, type, va); - vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END]; - vr0->info = vr0->va + - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN); - vring_init(&vr0->vr, - MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr0->va, vr0->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr0->info->magic), MIC_MAGIC + type); - assert(le32toh(vr0->info->magic) == MIC_MAGIC + type); - if (vr1) { - vr1->va = (struct mic_vring *) - &va[MIC_DEVICE_PAGE_END + vr_size]; - vr1->info = vr1->va + _vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN); - vring_init(&vr1->vr, - MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr1->va, vr1->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr1->info->magic), MIC_MAGIC + type + 1); - assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1); - } -done: - return va; -} - -static int -wait_for_card_driver(struct mic_info *mic, int fd, int type) -{ - struct pollfd pollfd; - int err; - struct mic_device_desc *desc = get_device_desc(mic, type); - __u8 prev_status; - - if (!desc) - return -ENODEV; - prev_status = desc->status; - pollfd.fd = fd; - mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n", - mic->name, __func__, type, desc->status); - - while (1) { - pollfd.events = POLLIN; - pollfd.revents = 0; - err = poll(&pollfd, 1, -1); - if (err < 0) { - mpsslog("%s %s poll failed %s\n", - mic->name, __func__, strerror(errno)); - continue; - } - - if (pollfd.revents) { - if (desc->status != prev_status) { - mpsslog("%s %s Waiting... desc-> type %d " - "status 0x%x\n", - mic->name, __func__, type, - desc->status); - prev_status = desc->status; - } - if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - mpsslog("%s %s poll.revents %d\n", - mic->name, __func__, pollfd.revents); - mpsslog("%s %s desc-> type %d status 0x%x\n", - mic->name, __func__, type, - desc->status); - break; - } - } - } - return 0; -} - -/* Spin till we have some descriptors */ -static void -spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) -{ - __u16 avail_idx = read_avail_idx(vr); - - while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) { -#ifdef DEBUG - mpsslog("%s %s waiting for desc avail %d info_avail %d\n", - mic->name, __func__, - le16toh(vr->vr.avail->idx), vr->info->avail_idx); -#endif - sched_yield(); - } -} - -static void * -virtio_net(void *arg) -{ - static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)]; - static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64))); - struct iovec vnet_iov[2][2] = { - { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) }, - { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } }, - { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) }, - { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } }, - }; - struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - char if_name[IFNAMSIZ]; - struct pollfd net_poll[MAX_NET_FD]; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - int err; - - snprintf(if_name, IFNAMSIZ, "mic%d", mic->id); - mic->mic_net.tap_fd = tun_alloc(mic, if_name); - if (mic->mic_net.tap_fd < 0) - goto done; - - if (tap_configure(mic, if_name)) - goto done; - mpsslog("MIC name %s id %d\n", mic->name, mic->id); - - net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd; - net_poll[NET_FD_VIRTIO_NET].events = POLLIN; - net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd; - net_poll[NET_FD_TUN].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET, &tx_vr, &rx_vr, - virtnet_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto done; - } - - copy.iovcnt = 2; - desc = get_device_desc(mic, VIRTIO_ID_NET); - - while (1) { - ssize_t len; - - net_poll[NET_FD_VIRTIO_NET].revents = 0; - net_poll[NET_FD_TUN].revents = 0; - - /* Start polling for data from tap and virtio net */ - err = poll(net_poll, 2, -1); - if (err < 0) { - mpsslog("%s poll failed %s\n", - __func__, strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - /* - * Check if there is data to be read from TUN and write to - * virtio net fd if there is. - */ - if (net_poll[NET_FD_TUN].revents & POLLIN) { - copy.iov = iov0; - len = readv(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len > 0) { - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *)vnet_hdr[0]; - - /* Disable checksums on the card since we are on - a reliable PCIe link */ - hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID; -#ifdef DEBUG - mpsslog("%s %s %d hdr->flags 0x%x ", mic->name, - __func__, __LINE__, hdr->flags); - mpsslog("copy.out_len %d hdr->gso_type 0x%x\n", - copy.out_len, hdr->gso_type); -#endif -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, ©, - len); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &tx_vr, - ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0[1].iov_len = MAX_NET_PKT_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", mic->name, - __func__, __LINE__, strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - /* - * Check if there is data to be read from virtio net and - * write to TUN if there is. - */ - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, ©, - MAX_NET_PKT_SIZE - + sizeof(struct virtio_net_hdr)); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &rx_vr, - ©); - if (!err) { -#ifdef DEBUG - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *) - vnet_hdr[1]; - - mpsslog("%s %s %d hdr->flags 0x%x, ", - mic->name, __func__, __LINE__, - hdr->flags); - mpsslog("out_len %d gso_type 0x%x\n", - copy.out_len, - hdr->gso_type); -#endif - /* Set the correct output iov_len */ - iov1[1].iov_len = copy.out_len - - sizeof(struct virtio_net_hdr); - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(copy)); -#endif - len = writev(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -done: - pthread_exit(NULL); -} - -/* virtio_console */ -#define VIRTIO_CONSOLE_FD 0 -#define MONITOR_FD (VIRTIO_CONSOLE_FD + 1) -#define MAX_CONSOLE_FD (MONITOR_FD + 1) /* must be the last one + 1 */ -#define MAX_BUFFER_SIZE PAGE_SIZE - -static void * -virtio_console(void *arg) -{ - static __u8 vcons_buf[2][PAGE_SIZE]; - struct iovec vcons_iov[2] = { - { .iov_base = vcons_buf[0], .iov_len = sizeof(vcons_buf[0]) }, - { .iov_base = vcons_buf[1], .iov_len = sizeof(vcons_buf[1]) }, - }; - struct iovec *iov0 = &vcons_iov[0], *iov1 = &vcons_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - int err; - struct pollfd console_poll[MAX_CONSOLE_FD]; - int pty_fd; - char *pts_name; - ssize_t len; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - - pty_fd = posix_openpt(O_RDWR); - if (pty_fd < 0) { - mpsslog("can't open a pseudoterminal master device: %s\n", - strerror(errno)); - goto _return; - } - pts_name = ptsname(pty_fd); - if (pts_name == NULL) { - mpsslog("can't get pts name\n"); - goto _close_pty; - } - printf("%s console message goes to %s\n", mic->name, pts_name); - mpsslog("%s console message goes to %s\n", mic->name, pts_name); - err = grantpt(pty_fd); - if (err < 0) { - mpsslog("can't grant access: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - err = unlockpt(pty_fd); - if (err < 0) { - mpsslog("can't unlock a pseudoterminal: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - console_poll[MONITOR_FD].fd = pty_fd; - console_poll[MONITOR_FD].events = POLLIN; - - console_poll[VIRTIO_CONSOLE_FD].fd = mic->mic_console.virtio_console_fd; - console_poll[VIRTIO_CONSOLE_FD].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE, &tx_vr, &rx_vr, - virtcons_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto _close_pty; - } - - copy.iovcnt = 1; - desc = get_device_desc(mic, VIRTIO_ID_CONSOLE); - - for (;;) { - console_poll[MONITOR_FD].revents = 0; - console_poll[VIRTIO_CONSOLE_FD].revents = 0; - err = poll(console_poll, MAX_CONSOLE_FD, -1); - if (err < 0) { - mpsslog("%s %d: poll failed: %s\n", __func__, __LINE__, - strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - - if (console_poll[MONITOR_FD].revents & POLLIN) { - copy.iov = iov0; - len = readv(pty_fd, copy.iov, copy.iovcnt); - if (len > 0) { -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_CONSOLE, 1, &tx_vr, - ©, len); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &tx_vr, ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(copy)); -#endif - /* Reinitialize IOV for next run */ - iov0->iov_len = PAGE_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", - mic->name, __func__, __LINE__, - strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - if (console_poll[VIRTIO_CONSOLE_FD].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_CONSOLE, 0, &rx_vr, - ©, PAGE_SIZE); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &rx_vr, ©); - if (!err) { - /* Set the correct output iov_len */ - iov1->iov_len = copy.out_len; - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(copy)); -#endif - len = writev(pty_fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (console_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -_close_pty: - close(pty_fd); -_return: - pthread_exit(NULL); -} - -static void -add_virtio_device(struct mic_info *mic, struct mic_device_desc *dd) -{ - char path[PATH_MAX]; - int fd, err; - - snprintf(path, PATH_MAX, "/dev/vop_virtio%d", mic->id); - fd = open(path, O_RDWR); - if (fd < 0) { - mpsslog("Could not open %s %s\n", path, strerror(errno)); - return; - } - - err = ioctl(fd, MIC_VIRTIO_ADD_DEVICE, dd); - if (err < 0) { - mpsslog("Could not add %d %s\n", dd->type, strerror(errno)); - close(fd); - return; - } - switch (dd->type) { - case VIRTIO_ID_NET: - mic->mic_net.virtio_net_fd = fd; - mpsslog("Added VIRTIO_ID_NET for %s\n", mic->name); - break; - case VIRTIO_ID_CONSOLE: - mic->mic_console.virtio_console_fd = fd; - mpsslog("Added VIRTIO_ID_CONSOLE for %s\n", mic->name); - break; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.virtio_block_fd = fd; - mpsslog("Added VIRTIO_ID_BLOCK for %s\n", mic->name); - break; - } -} - -static bool -set_backend_file(struct mic_info *mic) -{ - FILE *config; - char buff[PATH_MAX], *line, *evv, *p; - - snprintf(buff, PATH_MAX, "%s/mpssd%03d.conf", mic_config_dir, mic->id); - config = fopen(buff, "r"); - if (config == NULL) - return false; - do { /* look for "virtblk_backend=XXXX" */ - line = fgets(buff, PATH_MAX, config); - if (line == NULL) - break; - if (*line == '#') - continue; - p = strchr(line, '\n'); - if (p) - *p = '\0'; - } while (strncmp(line, virtblk_backend, strlen(virtblk_backend)) != 0); - fclose(config); - if (line == NULL) - return false; - evv = strchr(line, '='); - if (evv == NULL) - return false; - mic->mic_virtblk.backend_file = malloc(strlen(evv) + 1); - if (mic->mic_virtblk.backend_file == NULL) { - mpsslog("%s %d can't allocate memory\n", mic->name, mic->id); - return false; - } - strcpy(mic->mic_virtblk.backend_file, evv + 1); - return true; -} - -#define SECTOR_SIZE 512 -static bool -set_backend_size(struct mic_info *mic) -{ - mic->mic_virtblk.backend_size = lseek(mic->mic_virtblk.backend, 0, - SEEK_END); - if (mic->mic_virtblk.backend_size < 0) { - mpsslog("%s: can't seek: %s\n", - mic->name, mic->mic_virtblk.backend_file); - return false; - } - virtblk_dev_page.blk_config.capacity = - mic->mic_virtblk.backend_size / SECTOR_SIZE; - if ((mic->mic_virtblk.backend_size % SECTOR_SIZE) != 0) - virtblk_dev_page.blk_config.capacity++; - - virtblk_dev_page.blk_config.capacity = - htole64(virtblk_dev_page.blk_config.capacity); - - return true; -} - -static bool -open_backend(struct mic_info *mic) -{ - if (!set_backend_file(mic)) - goto _error_exit; - mic->mic_virtblk.backend = open(mic->mic_virtblk.backend_file, O_RDWR); - if (mic->mic_virtblk.backend < 0) { - mpsslog("%s: can't open: %s\n", mic->name, - mic->mic_virtblk.backend_file); - goto _error_free; - } - if (!set_backend_size(mic)) - goto _error_close; - mic->mic_virtblk.backend_addr = mmap(NULL, - mic->mic_virtblk.backend_size, - PROT_READ|PROT_WRITE, MAP_SHARED, - mic->mic_virtblk.backend, 0L); - if (mic->mic_virtblk.backend_addr == MAP_FAILED) { - mpsslog("%s: can't map: %s %s\n", - mic->name, mic->mic_virtblk.backend_file, - strerror(errno)); - goto _error_close; - } - return true; - - _error_close: - close(mic->mic_virtblk.backend); - _error_free: - free(mic->mic_virtblk.backend_file); - _error_exit: - return false; -} - -static void -close_backend(struct mic_info *mic) -{ - munmap(mic->mic_virtblk.backend_addr, mic->mic_virtblk.backend_size); - close(mic->mic_virtblk.backend); - free(mic->mic_virtblk.backend_file); -} - -static bool -start_virtblk(struct mic_info *mic, struct mic_vring *vring) -{ - if (((unsigned long)&virtblk_dev_page.blk_config % 8) != 0) { - mpsslog("%s: blk_config is not 8 byte aligned.\n", - mic->name); - return false; - } - add_virtio_device(mic, &virtblk_dev_page.dd); - if (MAP_FAILED == init_vr(mic, mic->mic_virtblk.virtio_block_fd, - VIRTIO_ID_BLOCK, vring, NULL, - virtblk_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - return false; - } - return true; -} - -static void -stop_virtblk(struct mic_info *mic) -{ - int vr_size, ret; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - ret = munmap(mic->mic_virtblk.block_dp, - MIC_DEVICE_PAGE_END + vr_size * virtblk_dev_page.dd.num_vq); - if (ret < 0) - mpsslog("%s munmap errno %d\n", mic->name, errno); - close(mic->mic_virtblk.virtio_block_fd); -} - -static __u8 -header_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(struct virtio_blk_outhdr)) { - mpsslog("%s() %d: length is not sizeof(virtio_blk_outhd)\n", - __func__, __LINE__); - return -EIO; - } - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) { - mpsslog("%s() %d: alone\n", - __func__, __LINE__); - return -EIO; - } - if (le16toh(desc->flags) & VRING_DESC_F_WRITE) { - mpsslog("%s() %d: not read\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -read_header(int fd, struct virtio_blk_outhdr *hdr, __u32 desc_idx) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_len = sizeof(*hdr); - iovec.iov_base = hdr; - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static int -transfer_blocks(int fd, struct iovec *iovec, __u32 iovcnt) -{ - struct mic_copy_desc copy; - - copy.iov = iovec; - copy.iovcnt = iovcnt; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static __u8 -status_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(__u8)) { - mpsslog("%s() %d: length is not sizeof(status)\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -write_status(int fd, __u8 *status) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_base = status; - iovec.iov_len = sizeof(*status); - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = true; /* Update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -#ifndef VIRTIO_BLK_T_GET_ID -#define VIRTIO_BLK_T_GET_ID 8 -#endif - -static void * -virtio_block(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int ret; - struct pollfd block_poll; - struct mic_vring vring; - __u16 avail_idx; - __u32 desc_idx; - struct vring_desc *desc; - struct iovec *iovec, *piov; - __u8 status; - __u32 buffer_desc_idx; - struct virtio_blk_outhdr hdr; - void *fos; - - for (;;) { /* forever */ - if (!open_backend(mic)) { /* No virtblk */ - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) - sleep(1); - continue; - } - - /* backend file is specified. */ - if (!start_virtblk(mic, &vring)) - goto _close_backend; - iovec = malloc(sizeof(*iovec) * - le32toh(virtblk_dev_page.blk_config.seg_max)); - if (!iovec) { - mpsslog("%s: can't alloc iovec: %s\n", - mic->name, strerror(ENOMEM)); - goto _stop_virtblk; - } - - block_poll.fd = mic->mic_virtblk.virtio_block_fd; - block_poll.events = POLLIN; - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) { - block_poll.revents = 0; - /* timeout in 1 sec to see signaled */ - ret = poll(&block_poll, 1, 1000); - if (ret < 0) { - mpsslog("%s %d: poll failed: %s\n", - __func__, __LINE__, - strerror(errno)); - continue; - } - - if (!(block_poll.revents & POLLIN)) { -#ifdef DEBUG - mpsslog("%s %d: block_poll.revents=0x%x\n", - __func__, __LINE__, block_poll.revents); -#endif - continue; - } - - /* POLLIN */ - while (vring.info->avail_idx != - le16toh(vring.vr.avail->idx)) { - /* read header element */ - avail_idx = - vring.info->avail_idx & - (vring.vr.num - 1); - desc_idx = le16toh( - vring.vr.avail->ring[avail_idx]); - desc = &vring.vr.desc[desc_idx]; -#ifdef DEBUG - mpsslog("%s() %d: avail_idx=%d ", - __func__, __LINE__, - vring.info->avail_idx); - mpsslog("vring.vr.num=%d desc=%p\n", - vring.vr.num, desc); -#endif - status = header_error_check(desc); - ret = read_header( - mic->mic_virtblk.virtio_block_fd, - &hdr, desc_idx); - if (ret < 0) { - mpsslog("%s() %d %s: ret=%d %s\n", - __func__, __LINE__, - mic->name, ret, - strerror(errno)); - break; - } - /* buffer element */ - piov = iovec; - status = 0; - fos = mic->mic_virtblk.backend_addr + - (hdr.sector * SECTOR_SIZE); - buffer_desc_idx = next_desc(desc); - desc_idx = buffer_desc_idx; - for (desc = &vring.vr.desc[buffer_desc_idx]; - desc->flags & VRING_DESC_F_NEXT; - desc_idx = next_desc(desc), - desc = &vring.vr.desc[desc_idx]) { - piov->iov_len = desc->len; - piov->iov_base = fos; - piov++; - fos += desc->len; - } - /* Returning NULLs for VIRTIO_BLK_T_GET_ID. */ - if (hdr.type & ~(VIRTIO_BLK_T_OUT | - VIRTIO_BLK_T_GET_ID)) { - /* - VIRTIO_BLK_T_IN - does not do - anything. Probably for documenting. - VIRTIO_BLK_T_SCSI_CMD - for - virtio_scsi. - VIRTIO_BLK_T_FLUSH - turned off in - config space. - VIRTIO_BLK_T_BARRIER - defined but not - used in anywhere. - */ - mpsslog("%s() %d: type %x ", - __func__, __LINE__, - hdr.type); - mpsslog("is not supported\n"); - status = -ENOTSUP; - - } else { - ret = transfer_blocks( - mic->mic_virtblk.virtio_block_fd, - iovec, - piov - iovec); - if (ret < 0 && - status != 0) - status = ret; - } - /* write status and update used pointer */ - if (status != 0) - status = status_error_check(desc); - ret = write_status( - mic->mic_virtblk.virtio_block_fd, - &status); -#ifdef DEBUG - mpsslog("%s() %d: write status=%d on desc=%p\n", - __func__, __LINE__, - status, desc); -#endif - } - } - free(iovec); -_stop_virtblk: - stop_virtblk(mic); -_close_backend: - close_backend(mic); - } /* forever */ - - pthread_exit(NULL); -} - -static void -reset(struct mic_info *mic) -{ -#define RESET_TIMEOUT 120 - int i = RESET_TIMEOUT; - setsysfs(mic->name, "state", "reset"); - while (i) { - char *state; - state = readsysfs(mic->name, "state"); - if (!state) - goto retry; - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - free(state); - break; - } - free(state); -retry: - sleep(1); - i--; - } -} - -static int -get_mic_shutdown_status(struct mic_info *mic, char *shutdown_status) -{ - if (!strcmp(shutdown_status, "nop")) - return MIC_NOP; - if (!strcmp(shutdown_status, "crashed")) - return MIC_CRASHED; - if (!strcmp(shutdown_status, "halted")) - return MIC_HALTED; - if (!strcmp(shutdown_status, "poweroff")) - return MIC_POWER_OFF; - if (!strcmp(shutdown_status, "restart")) - return MIC_RESTART; - mpsslog("%s: BUG invalid status %s\n", mic->name, shutdown_status); - /* Invalid state */ - assert(0); -}; - -static int get_mic_state(struct mic_info *mic) -{ - char *state = NULL; - enum mic_states mic_state; - - while (!state) { - state = readsysfs(mic->name, "state"); - sleep(1); - } - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - mic_state = MIC_READY; - } else if (!strcmp(state, "booting")) { - mic_state = MIC_BOOTING; - } else if (!strcmp(state, "online")) { - mic_state = MIC_ONLINE; - } else if (!strcmp(state, "shutting_down")) { - mic_state = MIC_SHUTTING_DOWN; - } else if (!strcmp(state, "reset_failed")) { - mic_state = MIC_RESET_FAILED; - } else if (!strcmp(state, "resetting")) { - mic_state = MIC_RESETTING; - } else { - mpsslog("%s: BUG invalid state %s\n", mic->name, state); - assert(0); - } - - free(state); - return mic_state; -}; - -static void mic_handle_shutdown(struct mic_info *mic) -{ -#define SHUTDOWN_TIMEOUT 60 - int i = SHUTDOWN_TIMEOUT; - char *shutdown_status; - while (i) { - shutdown_status = readsysfs(mic->name, "shutdown_status"); - if (!shutdown_status) { - sleep(1); - continue; - } - mpsslog("%s: %s %d shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - switch (get_mic_shutdown_status(mic, shutdown_status)) { - case MIC_RESTART: - mic->restart = 1; - case MIC_HALTED: - case MIC_POWER_OFF: - case MIC_CRASHED: - free(shutdown_status); - goto reset; - default: - break; - } - free(shutdown_status); - sleep(1); - i--; - } -reset: - if (!i) - mpsslog("%s: %s %d timing out waiting for shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - reset(mic); -} - -static int open_state_fd(struct mic_info *mic) -{ - char pathname[PATH_MAX]; - int fd; - - snprintf(pathname, PATH_MAX - 1, "%s/%s/%s", - MICSYSFSDIR, mic->name, "state"); - - fd = open(pathname, O_RDONLY); - if (fd < 0) - mpsslog("%s: opening file %s failed %s\n", - mic->name, pathname, strerror(errno)); - return fd; -} - -static int block_till_state_change(int fd, struct mic_info *mic) -{ - struct pollfd ufds[1]; - char value[PAGE_SIZE]; - int ret; - - ufds[0].fd = fd; - ufds[0].events = POLLERR | POLLPRI; - ret = poll(ufds, 1, -1); - if (ret < 0) { - mpsslog("%s: %s %d poll failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = lseek(fd, 0, SEEK_SET); - if (ret < 0) { - mpsslog("%s: %s %d Failed to seek to 0: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = read(fd, value, sizeof(value)); - if (ret < 0) { - mpsslog("%s: %s %d Failed to read sysfs entry: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - return 0; -} - -static void * -mic_config(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int fd, ret, stat = 0; - - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto exit; - } - - do { - ret = block_till_state_change(fd, mic); - if (ret < 0) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto close_exit; - } - - switch (get_mic_state(mic)) { - case MIC_SHUTTING_DOWN: - mic_handle_shutdown(mic); - break; - case MIC_READY: - case MIC_RESET_FAILED: - ret = kill(mic->pid, SIGTERM); - mpsslog("%s: %s %d kill pid %d ret %d\n", - mic->name, __func__, __LINE__, - mic->pid, ret); - if (!ret) { - ret = waitpid(mic->pid, &stat, - WIFSIGNALED(stat)); - mpsslog("%s: %s %d waitpid ret %d pid %d\n", - mic->name, __func__, __LINE__, - ret, mic->pid); - } - if (mic->boot_on_resume) { - setsysfs(mic->name, "state", "boot"); - mic->boot_on_resume = 0; - } - goto close_exit; - default: - break; - } - } while (1); - -close_exit: - close(fd); -exit: - init_mic(mic); - pthread_exit(NULL); -} - -static void -set_cmdline(struct mic_info *mic) -{ - char buffer[PATH_MAX]; - int len; - - len = snprintf(buffer, PATH_MAX, - "clocksource=tsc highres=off nohz=off "); - len += snprintf(buffer + len, PATH_MAX - len, - "cpufreq_on;corec6_off;pc3_off;pc6_off "); - len += snprintf(buffer + len, PATH_MAX - len, - "ifcfg=static;address,172.31.%d.1;netmask,255.255.255.0", - mic->id + 1); - - setsysfs(mic->name, "cmdline", buffer); - mpsslog("%s: Command line: \"%s\"\n", mic->name, buffer); - snprintf(buffer, PATH_MAX, "172.31.%d.1", mic->id + 1); - mpsslog("%s: IPADDR: \"%s\"\n", mic->name, buffer); -} - -static void -set_log_buf_info(struct mic_info *mic) -{ - int fd; - off_t len; - char system_map[] = "/lib/firmware/mic/System.map"; - char *map, *temp, log_buf[17] = {'\0'}; - - fd = open(system_map, O_RDONLY); - if (fd < 0) { - mpsslog("%s: Opening System.map failed: %d\n", - mic->name, errno); - return; - } - len = lseek(fd, 0, SEEK_END); - if (len < 0) { - mpsslog("%s: Reading System.map size failed: %d\n", - mic->name, errno); - close(fd); - return; - } - map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0); - if (map == MAP_FAILED) { - mpsslog("%s: mmap of System.map failed: %d\n", - mic->name, errno); - close(fd); - return; - } - temp = strstr(map, "__log_buf"); - if (!temp) { - mpsslog("%s: __log_buf not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_addr", log_buf); - mpsslog("%s: log_buf_addr: %s\n", mic->name, log_buf); - temp = strstr(map, "log_buf_len"); - if (!temp) { - mpsslog("%s: log_buf_len not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_len", log_buf); - mpsslog("%s: log_buf_len: %s\n", mic->name, log_buf); - munmap(map, len); - close(fd); -} - -static void -change_virtblk_backend(int x, siginfo_t *siginfo, void *p) -{ - struct mic_info *mic; - - for (mic = mic_list.next; mic != NULL; mic = mic->next) - mic->mic_virtblk.signaled = 1/* true */; -} - -static void -set_mic_boot_params(struct mic_info *mic) -{ - set_log_buf_info(mic); - set_cmdline(mic); -} - -static void * -init_mic(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - struct sigaction ignore = { - .sa_flags = 0, - .sa_handler = SIG_IGN - }; - struct sigaction act = { - .sa_flags = SA_SIGINFO, - .sa_sigaction = change_virtblk_backend, - }; - char buffer[PATH_MAX]; - int err, fd; - - /* - * Currently, one virtio block device is supported for each MIC card - * at a time. Any user (or test) can send a SIGUSR1 to the MIC daemon. - * The signal informs the virtio block backend about a change in the - * configuration file which specifies the virtio backend file name on - * the host. Virtio block backend then re-reads the configuration file - * and switches to the new block device. This signalling mechanism may - * not be required once multiple virtio block devices are supported by - * the MIC daemon. - */ - sigaction(SIGUSR1, &ignore, NULL); -retry: - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - goto retry; - } - - if (mic->restart) { - snprintf(buffer, PATH_MAX, "boot"); - setsysfs(mic->name, "state", buffer); - mpsslog("%s restarting mic %d\n", - mic->name, mic->restart); - mic->restart = 0; - } - - while (1) { - while (block_till_state_change(fd, mic)) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - continue; - } - - if (get_mic_state(mic) == MIC_BOOTING) - break; - } - - mic->pid = fork(); - switch (mic->pid) { - case 0: - add_virtio_device(mic, &virtcons_dev_page.dd); - add_virtio_device(mic, &virtnet_dev_page.dd); - err = pthread_create(&mic->mic_console.console_thread, NULL, - virtio_console, mic); - if (err) - mpsslog("%s virtcons pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_net.net_thread, NULL, - virtio_net, mic); - if (err) - mpsslog("%s virtnet pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_virtblk.block_thread, NULL, - virtio_block, mic); - if (err) - mpsslog("%s virtblk pthread_create failed %s\n", - mic->name, strerror(err)); - sigemptyset(&act.sa_mask); - err = sigaction(SIGUSR1, &act, NULL); - if (err) - mpsslog("%s sigaction SIGUSR1 failed %s\n", - mic->name, strerror(errno)); - while (1) - sleep(60); - case -1: - mpsslog("fork failed MIC name %s id %d errno %d\n", - mic->name, mic->id, errno); - break; - default: - err = pthread_create(&mic->config_thread, NULL, - mic_config, mic); - if (err) - mpsslog("%s mic_config pthread_create failed %s\n", - mic->name, strerror(err)); - } - - return NULL; -} - -static void -start_daemon(void) -{ - struct mic_info *mic; - int err; - - for (mic = mic_list.next; mic; mic = mic->next) { - set_mic_boot_params(mic); - err = pthread_create(&mic->init_thread, NULL, init_mic, mic); - if (err) - mpsslog("%s init_mic pthread_create failed %s\n", - mic->name, strerror(err)); - } - - while (1) - sleep(60); -} - -static int -init_mic_list(void) -{ - struct mic_info *mic = &mic_list; - struct dirent *file; - DIR *dp; - int cnt = 0; - - dp = opendir(MICSYSFSDIR); - if (!dp) - return 0; - - while ((file = readdir(dp)) != NULL) { - if (!strncmp(file->d_name, "mic", 3)) { - mic->next = calloc(1, sizeof(struct mic_info)); - if (mic->next) { - mic = mic->next; - mic->id = atoi(&file->d_name[3]); - mic->name = malloc(strlen(file->d_name) + 16); - if (mic->name) - strcpy(mic->name, file->d_name); - mpsslog("MIC name %s id %d\n", mic->name, - mic->id); - cnt++; - } - } - } - - closedir(dp); - return cnt; -} - -void -mpsslog(char *format, ...) -{ - va_list args; - char buffer[4096]; - char ts[52], *ts1; - time_t t; - - if (logfp == NULL) - return; - - va_start(args, format); - vsprintf(buffer, format, args); - va_end(args); - - time(&t); - ts1 = ctime_r(&t, ts); - ts1[strlen(ts1) - 1] = '\0'; - fprintf(logfp, "%s: %s", ts1, buffer); - - fflush(logfp); -} - -int -main(int argc, char *argv[]) -{ - int cnt; - pid_t pid; - - myname = argv[0]; - - logfp = fopen(LOGFILE_NAME, "a+"); - if (!logfp) { - fprintf(stderr, "cannot open logfile '%s'\n", LOGFILE_NAME); - exit(1); - } - pid = fork(); - switch (pid) { - case 0: - break; - case -1: - exit(2); - default: - exit(0); - } - - mpsslog("MIC Daemon start\n"); - - cnt = init_mic_list(); - if (cnt == 0) { - mpsslog("MIC module not loaded\n"); - exit(3); - } - mpsslog("MIC found %d devices\n", cnt); - - start_daemon(); - - exit(0); -} diff --git a/samples/mic/mpssd/mpssd.h b/samples/mic/mpssd/mpssd.h deleted file mode 100644 index 5f98bdafe653..000000000000 --- a/samples/mic/mpssd/mpssd.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ -#ifndef _MPSSD_H_ -#define _MPSSD_H_ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <fcntl.h> -#include <unistd.h> -#include <dirent.h> -#include <libgen.h> -#include <pthread.h> -#include <stdarg.h> -#include <time.h> -#include <errno.h> -#include <sys/dir.h> -#include <sys/ioctl.h> -#include <sys/poll.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/stat.h> -#include <sys/mman.h> -#include <sys/utsname.h> -#include <sys/wait.h> -#include <netinet/in.h> -#include <arpa/inet.h> -#include <netdb.h> -#include <signal.h> -#include <limits.h> -#include <syslog.h> -#include <getopt.h> -#include <net/if.h> -#include <linux/if_tun.h> -#include <linux/virtio_ids.h> - -#define MICSYSFSDIR "/sys/class/mic" -#define LOGFILE_NAME "/var/log/mpssd" -#define PAGE_SIZE 4096 - -struct mic_console_info { - pthread_t console_thread; - int virtio_console_fd; - void *console_dp; -}; - -struct mic_net_info { - pthread_t net_thread; - int virtio_net_fd; - int tap_fd; - void *net_dp; -}; - -struct mic_virtblk_info { - pthread_t block_thread; - int virtio_block_fd; - void *block_dp; - volatile sig_atomic_t signaled; - char *backend_file; - int backend; - void *backend_addr; - long backend_size; -}; - -struct mic_info { - int id; - char *name; - pthread_t config_thread; - pthread_t init_thread; - pid_t pid; - struct mic_console_info mic_console; - struct mic_net_info mic_net; - struct mic_virtblk_info mic_virtblk; - int restart; - int boot_on_resume; - struct mic_info *next; -}; - -__attribute__((format(printf, 1, 2))) -void mpsslog(char *format, ...); -char *readsysfs(char *dir, char *entry); -int setsysfs(char *dir, char *entry, char *value); -#endif diff --git a/samples/mic/mpssd/sysfs.c b/samples/mic/mpssd/sysfs.c deleted file mode 100644 index 3fb08eb7ed9d..000000000000 --- a/samples/mic/mpssd/sysfs.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ - -#include "mpssd.h" - -#define PAGE_SIZE 4096 - -char * -readsysfs(char *dir, char *entry) -{ - char filename[PATH_MAX]; - char value[PAGE_SIZE]; - char *string = NULL; - int fd; - int len; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, - "%s/%s/%s", MICSYSFSDIR, dir, entry); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - return NULL; - } - - len = read(fd, value, sizeof(value)); - if (len < 0) { - mpsslog("Failed to read sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto readsys_ret; - } - if (len == 0) - goto readsys_ret; - - value[len - 1] = '\0'; - - string = malloc(strlen(value) + 1); - if (string) - strcpy(string, value); - -readsys_ret: - close(fd); - return string; -} - -int -setsysfs(char *dir, char *entry, char *value) -{ - char filename[PATH_MAX]; - char *oldvalue; - int fd, ret = 0; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, "%s/%s/%s", - MICSYSFSDIR, dir, entry); - - oldvalue = readsysfs(dir, entry); - - fd = open(filename, O_RDWR); - if (fd < 0) { - ret = errno; - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto done; - } - - if (!oldvalue || strcmp(value, oldvalue)) { - if (write(fd, value, strlen(value)) < 0) { - ret = errno; - mpsslog("Failed to write new sysfs entry '%s': %s\n", - filename, strerror(errno)); - } - } - close(fd); -done: - if (oldvalue) - free(oldvalue); - return ret; -} diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore new file mode 100644 index 000000000000..6a718eec71f4 --- /dev/null +++ b/samples/nitro_enclaves/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +/ne_ioctl_sample diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile new file mode 100644 index 000000000000..a3ec78fefb52 --- /dev/null +++ b/samples/nitro_enclaves/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample +# usage. + +.PHONY: all clean + +CFLAGS += -Wall + +all: + $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread + +clean: + rm -f ne_ioctl_sample diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c new file mode 100644 index 000000000000..765b131c7319 --- /dev/null +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +/** + * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE) + * kernel driver. + * + * Usage + * ----- + * + * Load the nitro_enclaves module, setting also the enclave CPU pool. The + * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its + * siblings have to remain available for the primary / parent VM, so they + * cannot be included in the enclave CPU pool. + * + * See the cpu list section from the kernel documentation. + * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists + * + * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko + * lsmod + * + * The CPU pool can be set at runtime, after the kernel module is loaded. + * + * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus + * + * NUMA and CPU siblings information can be found using: + * + * lscpu + * /proc/cpuinfo + * + * Check the online / offline CPU list. The CPUs from the pool should be + * offlined. + * + * lscpu + * + * Check dmesg for any warnings / errors through the NE driver lifetime / usage. + * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern. + * + * dmesg + * + * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as + * the enclave CPUs. + * + * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html + * + * By default, the allocation of hugetlb pages are distributed on all possible + * NUMA nodes. Use the following configuration files to set the number of huge + * pages from a NUMA node: + * + * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages + * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages + * + * or, if not on a system with multiple NUMA nodes, can also set the number + * of 2 MiB / 1 GiB huge pages using + * + * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages + * + * In this example 256 hugepages of 2 MiB are used. + * + * Build and run the NE sample. + * + * make -C samples/nitro_enclaves clean + * make -C samples/nitro_enclaves + * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image> + * + * Unload the nitro_enclaves module. + * + * rmmod nitro_enclaves + * lsmod + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <poll.h> +#include <pthread.h> +#include <string.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/mman.h> +#include <linux/nitro_enclaves.h> +#include <linux/vm_sockets.h> + +/** + * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface. + */ +#define NE_DEV_NAME "/dev/nitro_enclaves" + +/** + * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event. + */ +#define NE_POLL_WAIT_TIME (60) +/** + * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event. + */ +#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000) + +/** + * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive. + */ +#define NE_SLEEP_TIME (300) + +/** + * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave. + */ +#define NE_DEFAULT_NR_VCPUS (2) + +/** + * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB. + */ +#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024) + +/** + * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for + * an enclave. + */ +#define NE_DEFAULT_NR_MEM_REGIONS (256) + +/** + * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_CID (3) +/** + * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000) +/** + * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7) + +/** + * struct ne_user_mem_region - User space memory region set for an enclave. + * @userspace_addr: Address of the user space memory region. + * @memory_size: Size of the user space memory region. + */ +struct ne_user_mem_region { + void *userspace_addr; + size_t memory_size; +}; + +/** + * ne_create_vm() - Create a slot for the enclave VM. + * @ne_dev_fd: The file descriptor of the NE misc device. + * @slot_uid: The generated slot uid for the enclave. + * @enclave_fd : The generated file descriptor for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) +{ + int rc = -EINVAL; + *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid); + + if (*enclave_fd < 0) { + rc = *enclave_fd; + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in create VM, no CPUs available in the NE CPU pool\n"); + + break; + } + + default: + printf("Error in create VM [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_poll_enclave_fd() - Thread function for polling the enclave fd. + * @data: Argument provided for the polling function. + * + * Context: Process context. + * Return: + * * NULL on success / failure. + */ +void *ne_poll_enclave_fd(void *data) +{ + int enclave_fd = *(int *)data; + struct pollfd fds[1] = {}; + int i = 0; + int rc = -EINVAL; + + printf("Running from poll thread, enclave fd %d\n", enclave_fd); + + fds[0].fd = enclave_fd; + fds[0].events = POLLIN | POLLERR | POLLHUP; + + /* Keep on polling until the current process is terminated. */ + while (1) { + printf("[iter %d] Polling ...\n", i); + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + return NULL; + } + + i++; + + if (!rc) { + printf("Poll: %d seconds elapsed\n", + i * NE_POLL_WAIT_TIME); + + continue; + } + + printf("Poll received value 0x%x\n", fds[0].revents); + + if (fds[0].revents & POLLHUP) { + printf("Received POLLHUP\n"); + + return NULL; + } + + if (fds[0].revents & POLLNVAL) { + printf("Received POLLNVAL\n"); + + return NULL; + } + } + + return NULL; +} + +/** + * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave. + * @ne_user_mem_region: User space memory region allocated using hugetlbfs. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region) +{ + /** + * Check available hugetlb encodings for different huge page sizes in + * include/uapi/linux/mman.h. + */ + ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_HUGETLB | MAP_HUGE_2MB, -1, 0); + if (ne_user_mem_region->userspace_addr == MAP_FAILED) { + printf("Error in mmap memory [%m]\n"); + + return -1; + } + + return 0; +} + +/** + * ne_load_enclave_image() - Place the enclave image in the enclave memory. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_regions: User space memory regions allocated for the enclave. + * @enclave_image_path : The file path of the enclave image. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[], + char *enclave_image_path) +{ + unsigned char *enclave_image = NULL; + int enclave_image_fd = -1; + size_t enclave_image_size = 0; + size_t enclave_memory_size = 0; + unsigned long i = 0; + size_t image_written_bytes = 0; + struct ne_image_load_info image_load_info = { + .flags = NE_EIF_IMAGE, + }; + struct stat image_stat_buf = {}; + int rc = -EINVAL; + size_t temp_image_offset = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + enclave_memory_size += ne_user_mem_regions[i].memory_size; + + rc = stat(enclave_image_path, &image_stat_buf); + if (rc < 0) { + printf("Error in get image stat info [%m]\n"); + + return rc; + } + + enclave_image_size = image_stat_buf.st_size; + + if (enclave_memory_size < enclave_image_size) { + printf("The enclave memory is smaller than the enclave image size\n"); + + return -ENOMEM; + } + + rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in get image load info, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in get image load info, provided invalid flag\n"); + + break; + } + + default: + printf("Error in get image load info [%m]\n"); + } + + return rc; + } + + printf("Enclave image offset in enclave memory is %lld\n", + image_load_info.memory_offset); + + enclave_image_fd = open(enclave_image_path, O_RDONLY); + if (enclave_image_fd < 0) { + printf("Error in open enclave image file [%m]\n"); + + return enclave_image_fd; + } + + enclave_image = mmap(NULL, enclave_image_size, PROT_READ, + MAP_PRIVATE, enclave_image_fd, 0); + if (enclave_image == MAP_FAILED) { + printf("Error in mmap enclave image [%m]\n"); + + return -1; + } + + temp_image_offset = image_load_info.memory_offset; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + size_t bytes_to_write = 0; + size_t memory_offset = 0; + size_t memory_size = ne_user_mem_regions[i].memory_size; + size_t remaining_bytes = 0; + void *userspace_addr = ne_user_mem_regions[i].userspace_addr; + + if (temp_image_offset >= memory_size) { + temp_image_offset -= memory_size; + + continue; + } else if (temp_image_offset != 0) { + memory_offset = temp_image_offset; + memory_size -= temp_image_offset; + temp_image_offset = 0; + } + + remaining_bytes = enclave_image_size - image_written_bytes; + bytes_to_write = memory_size < remaining_bytes ? + memory_size : remaining_bytes; + + memcpy(userspace_addr + memory_offset, + enclave_image + image_written_bytes, bytes_to_write); + + image_written_bytes += bytes_to_write; + + if (image_written_bytes == enclave_image_size) + break; + } + + munmap(enclave_image, enclave_image_size); + + close(enclave_image_fd); + + return 0; +} + +/** + * ne_set_user_mem_region() - Set a user space memory region for the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_region : User space memory region to be set for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region) +{ + struct ne_user_memory_region mem_region = { + .flags = NE_DEFAULT_MEMORY_REGION, + .memory_size = ne_user_mem_region.memory_size, + .userspace_addr = (__u64)ne_user_mem_region.userspace_addr, + }; + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in set user memory region, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_SIZE: { + printf("Error in set user memory region, mem size not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_ADDR: { + printf("Error in set user memory region, invalid user space address\n"); + + break; + } + + case NE_ERR_UNALIGNED_MEM_REGION_ADDR: { + printf("Error in set user memory region, unaligned user space address\n"); + + break; + } + + case NE_ERR_MEM_REGION_ALREADY_USED: { + printf("Error in set user memory region, memory region already used\n"); + + break; + } + + case NE_ERR_MEM_NOT_HUGE_PAGE: { + printf("Error in set user memory region, not backed by huge pages\n"); + + break; + } + + case NE_ERR_MEM_DIFFERENT_NUMA_NODE: { + printf("Error in set user memory region, different NUMA node than CPUs\n"); + + break; + } + + case NE_ERR_MEM_MAX_REGIONS: { + printf("Error in set user memory region, max memory regions reached\n"); + + break; + } + + case NE_ERR_INVALID_PAGE_SIZE: { + printf("Error in set user memory region, has page not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in set user memory region, provided invalid flag\n"); + + break; + } + + default: + printf("Error in set user memory region [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_free_mem_regions() - Unmap all the user space memory regions that were set + * aside for the enclave. + * @ne_user_mem_regions: The user space memory regions associated with an enclave. + * + * Context: Process context. + */ +static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[]) +{ + unsigned int i = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + munmap(ne_user_mem_regions[i].userspace_addr, + ne_user_mem_regions[i].memory_size); +} + +/** + * ne_add_vcpu() - Add a vCPU to the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @vcpu_id: vCPU id to be set for the enclave, either provided or + * auto-generated (if provided vCPU id is 0). + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id); + if (rc < 0) { + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in add vcpu, no CPUs available in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_ALREADY_USED: { + printf("Error in add vcpu, the provided vCPU is already used\n"); + + break; + } + + case NE_ERR_VCPU_NOT_IN_CPU_POOL: { + printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_INVALID_CPU_CORE: { + printf("Error in add vcpu, the core id of the provided vCPU is invalid\n"); + + break; + } + + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in add vcpu, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_VCPU: { + printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n"); + + break; + } + + default: + printf("Error in add vcpu [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_start_enclave() - Start the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in start enclave, enclave not in init state\n"); + + break; + } + + case NE_ERR_NO_MEM_REGIONS_ADDED: { + printf("Error in start enclave, no memory regions have been added\n"); + + break; + } + + case NE_ERR_NO_VCPUS_ADDED: { + printf("Error in start enclave, no vCPUs have been added\n"); + + break; + } + + case NE_ERR_FULL_CORES_NOT_USED: { + printf("Error in start enclave, enclave has no full cores set\n"); + + break; + } + + case NE_ERR_ENCLAVE_MEM_MIN_SIZE: { + printf("Error in start enclave, enclave memory is less than min size\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in start enclave, provided invalid flag\n"); + + break; + } + + case NE_ERR_INVALID_ENCLAVE_CID: { + printf("Error in start enclave, provided invalid enclave CID\n"); + + break; + } + + default: + printf("Error in start enclave [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat + * from it, on a newly created vsock channel, + * to check it has booted. + * @enclave_fd : The file descriptor associated with the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave_check_booted(int enclave_fd) +{ + struct sockaddr_vm client_vsock_addr = {}; + int client_vsock_fd = -1; + socklen_t client_vsock_len = sizeof(client_vsock_addr); + struct ne_enclave_start_info enclave_start_info = {}; + struct pollfd fds[1] = {}; + int rc = -EINVAL; + unsigned char recv_buf = 0; + struct sockaddr_vm server_vsock_addr = { + .svm_family = AF_VSOCK, + .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID, + .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT, + }; + int server_vsock_fd = -1; + + server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (server_vsock_fd < 0) { + rc = server_vsock_fd; + + printf("Error in socket [%m]\n"); + + return rc; + } + + rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr, + sizeof(server_vsock_addr)); + if (rc < 0) { + printf("Error in bind [%m]\n"); + + goto out; + } + + rc = listen(server_vsock_fd, 1); + if (rc < 0) { + printf("Error in listen [%m]\n"); + + goto out; + } + + rc = ne_start_enclave(enclave_fd, &enclave_start_info); + if (rc < 0) + goto out; + + printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid); + + fds[0].fd = server_vsock_fd; + fds[0].events = POLLIN; + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + goto out; + } + + if (!rc) { + printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME); + + rc = -ETIMEDOUT; + + goto out; + } + + if ((fds[0].revents & POLLIN) == 0) { + printf("Poll received value %d\n", fds[0].revents); + + rc = -EINVAL; + + goto out; + } + + rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr, + &client_vsock_len); + if (rc < 0) { + printf("Error in accept [%m]\n"); + + goto out; + } + + client_vsock_fd = rc; + + /* + * Read the heartbeat value that the init process in the enclave sends + * after vsock connect. + */ + rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in read [%m]\n"); + + goto out; + } + + if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) { + printf("Read %d instead of %d\n", recv_buf, + NE_IMAGE_LOAD_HEARTBEAT_VALUE); + + goto out; + } + + /* Write the heartbeat value back. */ + rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in write [%m]\n"); + + goto out; + } + + rc = 0; + +out: + close(server_vsock_fd); + + return rc; +} + +int main(int argc, char *argv[]) +{ + int enclave_fd = -1; + unsigned int i = 0; + int ne_dev_fd = -1; + struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {}; + unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {}; + int rc = -EINVAL; + pthread_t thread_id = 0; + unsigned long slot_uid = 0; + + if (argc != 2) { + printf("Usage: %s <path_to_enclave_image>\n", argv[0]); + + exit(EXIT_FAILURE); + } + + if (strlen(argv[1]) >= PATH_MAX) { + printf("The size of the path to enclave image is higher than max path\n"); + + exit(EXIT_FAILURE); + } + + ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC); + if (ne_dev_fd < 0) { + printf("Error in open NE device [%m]\n"); + + exit(EXIT_FAILURE); + } + + printf("Creating enclave slot ...\n"); + + rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd); + + close(ne_dev_fd); + + if (rc < 0) + exit(EXIT_FAILURE); + + printf("Enclave fd %d\n", enclave_fd); + + rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd); + if (rc < 0) { + printf("Error in thread create [%m]\n"); + + close(enclave_fd); + + exit(EXIT_FAILURE); + } + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE; + + rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in alloc userspace memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]); + if (rc < 0) + goto release_enclave_fd; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in set memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + printf("Enclave memory regions were added\n"); + + for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) { + /* + * The vCPU is chosen from the enclave vCPU pool, if the value + * of the vcpu_id is 0. + */ + ne_vcpus[i] = 0; + rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]); + if (rc < 0) { + printf("Error in add vcpu, iter %d\n", i); + + goto release_enclave_fd; + } + + printf("Added vCPU %d to the enclave\n", ne_vcpus[i]); + } + + printf("Enclave vCPUs were added\n"); + + rc = ne_start_enclave_check_booted(enclave_fd); + if (rc < 0) { + printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc); + + goto release_enclave_fd; + } + + printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME); + + sleep(NE_SLEEP_TIME); + + close(enclave_fd); + + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_SUCCESS); + +release_enclave_fd: + close(enclave_fd); + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_FAILURE); +} diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore index be52b3ba6e4b..d4cfa3176b1b 100644 --- a/samples/pidfd/.gitignore +++ b/samples/pidfd/.gitignore @@ -1 +1,2 @@ -pidfd-metadata +# SPDX-License-Identifier: GPL-2.0-only +/pidfd-metadata diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile index ee2979849d92..9754e2d81f70 100644 --- a/samples/pidfd/Makefile +++ b/samples/pidfd/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 +usertprogs-always-y += pidfd-metadata -hostprogs := pidfd-metadata -always-y := $(hostprogs) -HOSTCFLAGS_pidfd-metadata.o += -I$(objtree)/usr/include -all: pidfd-metadata +userccflags += -I usr/include diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index 3f6483e8b2df..f4adeed5f5f0 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== @@ -28,10 +28,28 @@ across the sample scripts. Usage example is printed on errors:: -b : ($BURST) HW level bursting of SKBs -v : ($VERBOSE) verbose -x : ($DEBUG) debug + -6 : ($IP6) IPv6 + -w : ($DELAY) Tx Delay value (ns) + -a : ($APPEND) Script will not reset generator's state, but will append its config The global variable being set is also listed. E.g. the required interface/device parameter "-i" sets variable $DEV. +"-a" parameter may be used to create different flows simultaneously. +In this mode script will keep the existing config, will append its settings. +In this mode you'll have to manually run traffic with "pg_ctrl start". + +For example you may use: + + source ./samples/pktgen/functions.sh + pg_ctrl reset + # add first device + ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f0 -m 34:80:0d:a3:fc:c9 -t 8 + # add second device + ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f1 -m 34:80:0d:a3:fc:c9 -t 8 + # run joint traffic on two devs + pg_ctrl start + Common functions ---------------- The functions.sh file provides; Three different shell functions for diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh index dae06d5b38fa..933194257a24 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -108,7 +108,12 @@ function pgset() { fi } -[[ $EUID -eq 0 ]] && trap 'pg_ctrl "reset"' EXIT +if [[ -z "$APPEND" ]]; then + if [[ $EUID -eq 0 ]]; then + # Cleanup pktgen setup on exit if thats not "append mode" + trap 'pg_ctrl "reset"' EXIT + fi +fi ## -- General shell tricks -- @@ -118,7 +123,7 @@ function root_check_run_with_sudo() { if [ "$EUID" -ne 0 ]; then if [ -x $0 ]; then # Directly executable use sudo info "Not root, running with sudo" - sudo "$0" "$@" + sudo -E "$0" "$@" exit $? fi err 4 "cannot perform sudo run of $0" diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh index ff0ed474fee9..81906f199454 100644 --- a/samples/pktgen/parameters.sh +++ b/samples/pktgen/parameters.sh @@ -11,6 +11,7 @@ function usage() { echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed" echo " -m : (\$DST_MAC) destination MAC-addr" echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed" + echo " -k : (\$UDP_CSUM) enable UDP tx checksum" echo " -t : (\$THREADS) threads to start" echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)" echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB" @@ -19,12 +20,14 @@ function usage() { echo " -v : (\$VERBOSE) verbose" echo " -x : (\$DEBUG) debug" echo " -6 : (\$IP6) IPv6" + echo " -w : (\$DELAY) Tx Delay value (ns)" + echo " -a : (\$APPEND) Script will not reset generator's state, but will append its config" echo "" } ## --- Parse command line arguments / parameters --- ## echo "Commandline options:" -while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do +while getopts "s:i:d:m:p:f:t:c:n:b:w:vxh6ak" option; do case $option in i) # interface export DEV=$OPTARG @@ -66,6 +69,10 @@ while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do export BURST=$OPTARG info "SKB bursting: BURST=$BURST" ;; + w) + export DELAY=$OPTARG + info "DELAY=$DELAY" + ;; v) export VERBOSE=yes info "Verbose mode: VERBOSE=$VERBOSE" @@ -78,6 +85,14 @@ while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do export IP6=6 info "IP6: IP6=$IP6" ;; + a) + export APPEND=yes + info "Append mode: APPEND=$APPEND" + ;; + k) + export UDP_CSUM=yes + info "UDP tx checksum: UDP_CSUM=$UDP_CSUM" + ;; h|?|*) usage; err 2 "[ERROR] Unknown parameters!!!" @@ -100,6 +115,9 @@ if [ -z "$THREADS" ]; then export THREADS=1 fi +# default DELAY +[ -z "$DELAY" ] && export DELAY=0 # Zero means max speed + export L_THREAD=$(( THREADS + F_THREAD - 1 )) if [ -z "$DEV" ]; then diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh index 1b6204125d2d..99ec0688b044 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh @@ -50,9 +50,6 @@ if [ -n "$DST_PORT" ]; then validate_ports $UDP_DST_MIN $UDP_DST_MAX fi -# Base Config -DELAY="0" # Zero means max speed - # General cleanup everything since last run pg_ctrl "reset" @@ -92,14 +89,21 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "burst $BURST" done +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + # start_run echo "Running... ctrl^C to stop" >&2 pg_ctrl "start" echo "Done" >&2 -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +print_result diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh index e607cb369b20..04b0dd0c36d6 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh @@ -33,9 +33,6 @@ if [ -n "$DST_PORT" ]; then validate_ports $UDP_DST_MIN $UDP_DST_MAX fi -# Base Config -DELAY="0" # Zero means max speed - # General cleanup everything since last run pg_ctrl "reset" @@ -72,14 +69,21 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "xmit_mode queue_xmit" done +# Run if user hits control-c +function print_result { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + # start_run echo "Running... ctrl^C to stop" >&2 pg_ctrl "start" echo "Done" >&2 -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +print_result diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh index a4e250b45dce..09a92ea963f9 100755 --- a/samples/pktgen/pktgen_sample01_simple.sh +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -31,20 +31,17 @@ if [ -n "$DST_PORT" ]; then validate_ports $UDP_DST_MIN $UDP_DST_MAX fi -# Base Config -DELAY="0" # Zero means max speed - # Flow variation random source port between min and max UDP_SRC_MIN=9 UDP_SRC_MAX=109 # General cleanup everything since last run # (especially important if other threads were configured by other scripts) -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Add remove all other devices and add_device $DEV to thread 0 thread=0 -pg_thread $thread "rem_device_all" +[ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $DEV # How many packets to send (zero means indefinitely) @@ -75,16 +72,29 @@ if [ -n "$DST_PORT" ]; then pg_set $DEV "udp_dst_max $UDP_DST_MAX" fi +[ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Setup random UDP port src range pg_set $DEV "flag UDPSRC_RND" pg_set $DEV "udp_src_min $UDP_SRC_MIN" pg_set $DEV "udp_src_max $UDP_SRC_MAX" -# start_run -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" -echo "Done" >&2 +# Run if user hits control-c +function print_result() { + # Print results + echo "Result device: $DEV" + cat /proc/net/pktgen/$DEV +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + +if [ -z "$APPEND" ]; then + # start_run + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 -# Print results -echo "Result device: $DEV" -cat /proc/net/pktgen/$DEV + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi
\ No newline at end of file diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh index cb2495fcdc60..7fa41c84c32f 100755 --- a/samples/pktgen/pktgen_sample02_multiqueue.sh +++ b/samples/pktgen/pktgen_sample02_multiqueue.sh @@ -17,7 +17,6 @@ source ${basedir}/parameters.sh [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely # Base Config -DELAY="0" # Zero means max speed [ -z "$CLONE_SKB" ] && CLONE_SKB="0" # Flow variation random source port between min and max @@ -39,7 +38,7 @@ if [ -n "$DST_PORT" ]; then fi # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do @@ -48,7 +47,7 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Notice config queue to map to cpu (mirrors smp_processor_id()) @@ -76,20 +75,33 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "udp_dst_max $UDP_DST_MAX" fi + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Setup random UDP port src range pg_set $dev "flag UDPSRC_RND" pg_set $dev "udp_src_min $UDP_SRC_MIN" pg_set $dev "udp_src_max $UDP_SRC_MAX" done -# start_run -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" -echo "Done" >&2 +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +if [ -z "$APPEND" ]; then + # start_run + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh index fff50765a5aa..8bf2fdffba16 100755 --- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -42,18 +42,15 @@ if [ -n "$DST_PORT" ]; then validate_ports $UDP_DST_MIN $UDP_DST_MAX fi -# Base Config -DELAY="0" # Zero means max speed - # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -76,6 +73,8 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "udp_dst_max $UDP_DST_MAX" fi + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Setup burst, for easy testing -b 0 disable bursting # (internally in pktgen default and minimum burst=1) if [[ ${BURST} -ne 0 ]]; then @@ -86,7 +85,7 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do done # Run if user hits control-c -function control_c() { +function print_result() { # Print results for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} @@ -95,7 +94,13 @@ function control_c() { done } # trap keyboard interrupt (Ctrl-C) -trap control_c SIGINT +trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh index 2cd6b701400d..cff51f861506 100755 --- a/samples/pktgen/pktgen_sample04_many_flows.sh +++ b/samples/pktgen/pktgen_sample04_many_flows.sh @@ -13,13 +13,15 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh # Set some default params, if they didn't get set -[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely if [ -n "$DEST_IP" ]; then - validate_addr $DEST_IP - read -r DST_MIN DST_MAX <<< $(parse_addr $DEST_IP) + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) fi if [ -n "$DST_PORT" ]; then read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) @@ -34,9 +36,6 @@ fi [ -z "$FLOWS" ] && FLOWS="8000" [ -z "$FLOWLEN" ] && FLOWLEN="10" -# Base Config -DELAY="0" # Zero means max speed - if [[ -n "$BURST" ]]; then err 1 "Bursting not supported for this mode" fi @@ -45,14 +44,14 @@ fi read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15) # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -65,8 +64,8 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Single destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst_min $DST_MIN" - pg_set $dev "dst_max $DST_MAX" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" if [ -n "$DST_PORT" ]; then # Single destination port or random port range @@ -75,6 +74,8 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "udp_dst_max $UDP_DST_MAX" fi + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Randomize source IP-addresses pg_set $dev "flag IPSRC_RND" pg_set $dev "src_min $SRC_MIN" @@ -107,7 +108,11 @@ function print_result() { # trap keyboard interrupt (Ctrl-C) trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" -print_result + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh index 4cb6252ade39..3578d0aa4ac5 100755 --- a/samples/pktgen/pktgen_sample05_flow_per_thread.sh +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh @@ -17,32 +17,31 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh # Set some default params, if they didn't get set -[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$BURST" ] && BURST=32 [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely if [ -n "$DEST_IP" ]; then - validate_addr $DEST_IP - read -r DST_MIN DST_MAX <<< $(parse_addr $DEST_IP) + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) fi if [ -n "$DST_PORT" ]; then read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) validate_ports $UDP_DST_MIN $UDP_DST_MAX fi -# Base Config -DELAY="0" # Zero means max speed - # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -55,8 +54,8 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Single destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst_min $DST_MIN" - pg_set $dev "dst_max $DST_MAX" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" if [ -n "$DST_PORT" ]; then # Single destination port or random port range @@ -65,6 +64,8 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "udp_dst_max $UDP_DST_MAX" fi + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Setup source IP-addresses based on thread number pg_set $dev "src_min 198.18.$((thread+1)).1" pg_set $dev "src_max 198.18.$((thread+1)).1" @@ -91,7 +92,11 @@ function print_result() { # trap keyboard interrupt (Ctrl-C) trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" -print_result + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh index 728106060a02..264cc5db9c49 100755 --- a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -15,7 +15,6 @@ root_check_run_with_sudo "$@" source ${basedir}/parameters.sh # Base Config -DELAY="0" # Zero means max speed [ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely [ -z "$CLONE_SKB" ] && CLONE_SKB="0" @@ -45,7 +44,7 @@ if [ -n "$DST_PORT" ]; then fi # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((i = 0; i < $THREADS; i++)); do @@ -59,7 +58,7 @@ for ((i = 0; i < $THREADS; i++)); do info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # select queue and bind the queue and $dev in 1:1 relationship @@ -93,21 +92,34 @@ for ((i = 0; i < $THREADS; i++)); do pg_set $dev "udp_dst_max $UDP_DST_MAX" fi + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + # Setup random UDP port src range pg_set $dev "flag UDPSRC_RND" pg_set $dev "udp_src_min $UDP_SRC_MIN" pg_set $dev "udp_src_max $UDP_SRC_MAX" done -# start_run -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" -echo "Done" >&2 +# Run if user hits control-c +function print_result() { + # Print results + for ((i = 0; i < $THREADS; i++)); do + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT -# Print results -for ((i = 0; i < $THREADS; i++)); do - thread=${cpu_array[$((i+F_THREAD))]} - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +# start_run +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c index c9e7276c3d83..c045e3d24326 100644 --- a/samples/qmi/qmi_sample_client.c +++ b/samples/qmi/qmi_sample_client.c @@ -42,7 +42,7 @@ struct test_name_type_v01 { char name[TEST_MAX_NAME_SIZE_V01]; }; -static struct qmi_elem_info test_name_type_v01_ei[] = { +static const struct qmi_elem_info test_name_type_v01_ei[] = { { .data_type = QMI_DATA_LEN, .elem_len = 1, @@ -71,7 +71,7 @@ struct test_ping_req_msg_v01 { struct test_name_type_v01 client_name; }; -static struct qmi_elem_info test_ping_req_msg_v01_ei[] = { +static const struct qmi_elem_info test_ping_req_msg_v01_ei[] = { { .data_type = QMI_UNSIGNED_1_BYTE, .elem_len = 4, @@ -113,7 +113,7 @@ struct test_ping_resp_msg_v01 { struct test_name_type_v01 service_name; }; -static struct qmi_elem_info test_ping_resp_msg_v01_ei[] = { +static const struct qmi_elem_info test_ping_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, @@ -172,7 +172,7 @@ struct test_data_req_msg_v01 { struct test_name_type_v01 client_name; }; -static struct qmi_elem_info test_data_req_msg_v01_ei[] = { +static const struct qmi_elem_info test_data_req_msg_v01_ei[] = { { .data_type = QMI_DATA_LEN, .elem_len = 1, @@ -224,7 +224,7 @@ struct test_data_resp_msg_v01 { struct test_name_type_v01 service_name; }; -static struct qmi_elem_info test_data_resp_msg_v01_ei[] = { +static const struct qmi_elem_info test_data_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, @@ -429,7 +429,7 @@ static const struct file_operations data_fops = { .write = data_write, }; -static struct qmi_msg_handler qmi_sample_handlers[] = { +static const struct qmi_msg_handler qmi_sample_handlers[] = { { .type = QMI_RESPONSE, .msg_id = TEST_PING_REQ_MSG_ID_V01, @@ -571,7 +571,7 @@ static void qmi_sample_del_server(struct qmi_handle *qmi, static struct qmi_handle lookup_client; -static struct qmi_ops lookup_ops = { +static const struct qmi_ops lookup_ops = { .new_server = qmi_sample_new_server, .del_server = qmi_sample_del_server, }; diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig new file mode 100644 index 000000000000..841e0906e943 --- /dev/null +++ b/samples/rust/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 + +menuconfig SAMPLES_RUST + bool "Rust samples" + depends on RUST + help + You can build sample Rust kernel code here. + + If unsure, say N. + +if SAMPLES_RUST + +config SAMPLE_RUST_MINIMAL + tristate "Minimal" + help + This option builds the Rust minimal module sample. + + To compile this as a module, choose M here: + the module will be called rust_minimal. + + If unsure, say N. + +config SAMPLE_RUST_HOSTPROGS + bool "Host programs" + help + This option builds the Rust host program samples. + + If unsure, say N. + +endif # SAMPLES_RUST diff --git a/samples/rust/Makefile b/samples/rust/Makefile new file mode 100644 index 000000000000..1daba5f8658a --- /dev/null +++ b/samples/rust/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o + +subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS) += hostprogs diff --git a/samples/rust/hostprogs/.gitignore b/samples/rust/hostprogs/.gitignore new file mode 100644 index 000000000000..a6c173da5048 --- /dev/null +++ b/samples/rust/hostprogs/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +single diff --git a/samples/rust/hostprogs/Makefile b/samples/rust/hostprogs/Makefile new file mode 100644 index 000000000000..8ddcbd7416db --- /dev/null +++ b/samples/rust/hostprogs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +hostprogs-always-y := single + +single-rust := y diff --git a/samples/rust/hostprogs/a.rs b/samples/rust/hostprogs/a.rs new file mode 100644 index 000000000000..f7a4a3d0f4e0 --- /dev/null +++ b/samples/rust/hostprogs/a.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `a`. + +pub(crate) fn f(x: i32) { + println!("The number is {}.", x); +} diff --git a/samples/rust/hostprogs/b.rs b/samples/rust/hostprogs/b.rs new file mode 100644 index 000000000000..c1675890648f --- /dev/null +++ b/samples/rust/hostprogs/b.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `b`. + +pub(crate) const CONSTANT: i32 = 42; diff --git a/samples/rust/hostprogs/single.rs b/samples/rust/hostprogs/single.rs new file mode 100644 index 000000000000..8c48a119339a --- /dev/null +++ b/samples/rust/hostprogs/single.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample. + +mod a; +mod b; + +fn main() { + println!("Hello world!"); + + a::f(b::CONSTANT); +} diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs new file mode 100644 index 000000000000..54ad17685742 --- /dev/null +++ b/samples/rust/rust_minimal.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust minimal sample. + +use kernel::prelude::*; + +module! { + type: RustMinimal, + name: b"rust_minimal", + author: b"Rust for Linux Contributors", + description: b"Rust minimal sample", + license: b"GPL", +} + +struct RustMinimal { + numbers: Vec<i32>, +} + +impl kernel::Module for RustMinimal { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Rust minimal sample (init)\n"); + pr_info!("Am I built-in? {}\n", !cfg!(MODULE)); + + let mut numbers = Vec::new(); + numbers.try_push(72)?; + numbers.try_push(108)?; + numbers.try_push(200)?; + + Ok(RustMinimal { numbers }) + } +} + +impl Drop for RustMinimal { + fn drop(&mut self) { + pr_info!("My numbers are {:?}\n", self.numbers); + pr_info!("Rust minimal sample (exit)\n"); + } +} diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index d1e2e817d556..a6df0da77c5d 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,4 +1,5 @@ -bpf-direct -bpf-fancy -dropper -user-trap +# SPDX-License-Identifier: GPL-2.0-only +/bpf-direct +/bpf-fancy +/dropper +/user-trap diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 89279e8b87df..c85ae0ed8342 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,44 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -hostprogs := bpf-fancy dropper bpf-direct user-trap +userprogs-always-y += bpf-fancy dropper bpf-direct user-trap -HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include -HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include bpf-fancy-objs := bpf-fancy.o bpf-helper.o -HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include -HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include -dropper-objs := dropper.o - -HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include -bpf-direct-objs := bpf-direct.o - -HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include -HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include -user-trap-objs := user-trap.o - -# Try to match the kernel target. -ifndef CONFIG_64BIT - -# s390 has -m31 flag to build 31 bit binaries -ifndef CONFIG_S390 -MFLAG = -m32 -else -MFLAG = -m31 -endif - -HOSTCFLAGS_bpf-direct.o += $(MFLAG) -HOSTCFLAGS_dropper.o += $(MFLAG) -HOSTCFLAGS_bpf-helper.o += $(MFLAG) -HOSTCFLAGS_bpf-fancy.o += $(MFLAG) -HOSTCFLAGS_user-trap.o += $(MFLAG) -HOSTLDLIBS_bpf-direct += $(MFLAG) -HOSTLDLIBS_bpf-fancy += $(MFLAG) -HOSTLDLIBS_dropper += $(MFLAG) -HOSTLDLIBS_user-trap += $(MFLAG) -endif -always-y := $(hostprogs) -endif +userccflags += -I usr/include diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 0cc9816fe8e8..417e48a4c4df 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -62,9 +62,9 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define EXPAND(...) __VA_ARGS__ /* Ensure that we load the logically correct offset. */ -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) #else #error "Unknown endianness" @@ -85,10 +85,10 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #elif __BITS_PER_LONG == 64 /* Ensure that we load the logically correct offset. */ -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define ENDIAN(_lo, _hi) _lo, _hi #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define ENDIAN(_lo, _hi) _hi, _lo #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) #endif diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c index cc0648eb389e..4bca4b70f665 100644 --- a/samples/seccomp/dropper.c +++ b/samples/seccomp/dropper.c @@ -25,7 +25,7 @@ #include <sys/prctl.h> #include <unistd.h> -static int install_filter(int nr, int arch, int error) +static int install_filter(int arch, int nr, int error) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, @@ -42,6 +42,10 @@ static int install_filter(int nr, int arch, int error) .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; + if (error == -1) { + struct sock_filter kill = BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL); + filter[4] = kill; + } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { perror("prctl(NO_NEW_PRIVS)"); return 1; @@ -57,9 +61,10 @@ int main(int argc, char **argv) { if (argc < 5) { fprintf(stderr, "Usage:\n" - "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n" + "dropper <arch> <syscall_nr> <errno> <prog> [<args>]\n" "Hint: AUDIT_ARCH_I386: 0x%X\n" " AUDIT_ARCH_X86_64: 0x%X\n" + " errno == -1 means SECCOMP_RET_KILL\n" "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); return 1; } diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore index c5c45d7ec0df..cd9ff7b95383 100644 --- a/samples/timers/.gitignore +++ b/samples/timers/.gitignore @@ -1 +1,2 @@ -hpet_example +# SPDX-License-Identifier: GPL-2.0-only +/hpet_example diff --git a/samples/timers/Makefile b/samples/timers/Makefile index f9fa07460802..e6836cdea4e2 100644 --- a/samples/timers/Makefile +++ b/samples/timers/Makefile @@ -1,16 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +userprogs-always-y += hpet_example -ifeq ($(ARCH),x86) -CC := $(CROSS_COMPILE)gcc -PROGS := hpet_example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - -endif -endif +userccflags += -I usr/include diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile index b78344e7bbed..b3808bb4cf8b 100644 --- a/samples/trace_events/Makefile +++ b/samples/trace_events/Makefile @@ -11,5 +11,7 @@ # Here trace-events-sample.c does the CREATE_TRACE_POINTS. # CFLAGS_trace-events-sample.o := -I$(src) +CFLAGS_trace_custom_sched.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_custom_sched.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 1a72b7d95cdc..608c4ae3b08a 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -19,8 +19,10 @@ static const char *random_strings[] = { "One ring to rule them all" }; -static void simple_thread_func(int cnt) +static void do_simple_thread_func(int cnt, const char *fmt, ...) { + unsigned long bitmask[1] = {0xdeadbeefUL}; + va_list va; int array[6]; int len = cnt % 5; int i; @@ -32,9 +34,13 @@ static void simple_thread_func(int cnt) array[i] = i + 1; array[i] = 0; + va_start(va, fmt); + /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], - current->cpus_ptr); + current->cpus_ptr, fmt, &va); + + va_end(va); trace_foo_with_template_simple("HELLO", cnt); @@ -43,6 +49,13 @@ static void simple_thread_func(int cnt) trace_foo_with_template_cond("prints other times", cnt); trace_foo_with_template_print("I have to be different", cnt); + + trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask); +} + +static void simple_thread_func(int cnt) +{ + do_simple_thread_func(cnt, "iter=%d", cnt); } static int simple_thread(void *arg) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 80b4a70315b6..1a92226202fc 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -141,6 +141,54 @@ * In most cases, the __assign_str() macro will take the same * parameters as the __string() macro had to declare the string. * + * __vstring: This is similar to __string() but instead of taking a + * dynamic length, it takes a variable list va_list 'va' variable. + * Some event callers already have a message from parameters saved + * in a va_list. Passing in the format and the va_list variable + * will save just enough on the ring buffer for that string. + * Note, the va variable used is a pointer to a va_list, not + * to the va_list directly. + * + * (va_list *va) + * + * __vstring(foo, fmt, va) is similar to: vsnprintf(foo, fmt, va) + * + * To assign the string, use the helper macro __assign_vstr(). + * + * __assign_vstr(foo, fmt, va); + * + * In most cases, the __assign_vstr() macro will take the same + * parameters as the __vstring() macro had to declare the string. + * Use __get_str() to retrieve the __vstring() just like it would for + * __string(). + * + * __string_len: This is a helper to a __dynamic_array, but it understands + * that the array has characters in it, and with the combined + * use of __assign_str_len(), it will allocate 'len' + 1 bytes + * in the ring buffer and add a '\0' to the string. This is + * useful if the string being saved has no terminating '\0' byte. + * It requires that the length of the string is known as it acts + * like a memcpy(). + * + * Declared with: + * + * __string_len(foo, bar, len) + * + * To assign this string, use the helper macro __assign_str_len(). + * + * __assign_str_len(foo, bar, len); + * + * Then len + 1 is allocated to the ring buffer, and a nul terminating + * byte is added. This is similar to: + * + * memcpy(__get_str(foo), bar, len); + * __get_str(foo)[len] = 0; + * + * The advantage of using this over __dynamic_array, is that it + * takes care of allocating the extra byte on the ring buffer + * for the '\0' terminating byte, and __get_str(foo) can be used + * in the TP_printk(). + * * __bitmask: This is another kind of __dynamic_array, but it expects * an array of longs, and the number of bits to parse. It takes * two parameters (name, nr_bits), where name is the name of the @@ -229,9 +277,10 @@ TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO); TRACE_EVENT(foo_bar, TP_PROTO(const char *foo, int bar, const int *lst, - const char *string, const struct cpumask *mask), + const char *string, const struct cpumask *mask, + const char *fmt, va_list *va), - TP_ARGS(foo, bar, lst, string, mask), + TP_ARGS(foo, bar, lst, string, mask, fmt, va), TP_STRUCT__entry( __array( char, foo, 10 ) @@ -239,6 +288,7 @@ TRACE_EVENT(foo_bar, __dynamic_array(int, list, __length_of(lst)) __string( str, string ) __bitmask( cpus, num_possible_cpus() ) + __vstring( vstr, fmt, va ) ), TP_fast_assign( @@ -247,10 +297,11 @@ TRACE_EVENT(foo_bar, memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); __assign_str(str, string); + __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); ), - TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -294,7 +345,7 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus)) + __get_str(str), __get_bitmask(cpus), __get_str(vstr)) ); /* @@ -416,7 +467,7 @@ TRACE_EVENT_FN(foo_bar_with_fn, * Note, TRACE_EVENT() itself is simply defined as: * * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ - * DEFINE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ + * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ * DEFINE_EVENT(name, name, proto, args) * * The DEFINE_EVENT() also can be declared with conditions and reg functions: @@ -479,6 +530,39 @@ DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, TP_ARGS(foo, bar), TP_printk("bar %s %d", __get_str(foo), __entry->bar)); +/* + * There are yet another __rel_loc dynamic data attribute. If you + * use __rel_dynamic_array() and __rel_string() etc. macros, you + * can use this attribute. There is no difference from the viewpoint + * of functionality with/without 'rel' but the encoding is a bit + * different. This is expected to be used with user-space event, + * there is no reason that the kernel event use this, but only for + * testing. + */ + +TRACE_EVENT(foo_rel_loc, + + TP_PROTO(const char *foo, int bar, unsigned long *mask), + + TP_ARGS(foo, bar, mask), + + TP_STRUCT__entry( + __rel_string( foo, foo ) + __field( int, bar ) + __rel_bitmask( bitmask, + BITS_PER_BYTE * sizeof(unsigned long) ) + ), + + TP_fast_assign( + __assign_rel_str(foo, foo); + __entry->bar = bar; + __assign_rel_bitmask(bitmask, mask, + BITS_PER_BYTE * sizeof(unsigned long)); + ), + + TP_printk("foo_rel_loc %s, %d, %s", __get_rel_str(foo), __entry->bar, + __get_rel_bitmask(bitmask)) +); #endif /***** NOTICE! The #if protection ends here. *****/ diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c new file mode 100644 index 000000000000..b99d9ab7db85 --- /dev/null +++ b/samples/trace_events/trace_custom_sched.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event tracer + * + * Copyright (C) 2022 Google Inc, Steven Rostedt <rostedt@goodmis.org> + */ + +#define pr_fmt(fmt) fmt + +#include <linux/trace_events.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/sched.h> + +/* + * Must include the event header that the custom event will attach to, + * from the C file, and not in the custom header file. + */ +#include <trace/events/sched.h> + +/* Declare CREATE_CUSTOM_TRACE_EVENTS before including custom header */ +#define CREATE_CUSTOM_TRACE_EVENTS + +#include "trace_custom_sched.h" + +/* + * As the trace events are not exported to modules, the use of + * for_each_kernel_tracepoint() is needed to find the trace event + * to attach to. The fct() function below, is a callback that + * will be called for every event. + * + * Helper functions are created by the TRACE_CUSTOM_EVENT() macro + * update the event. Those are of the form: + * + * trace_custom_event_<event>_update() + * + * Where <event> is the event to attach. + */ +static void fct(struct tracepoint *tp, void *priv) +{ + trace_custom_event_sched_switch_update(tp); + trace_custom_event_sched_waking_update(tp); +} + +static int __init trace_sched_init(void) +{ + for_each_kernel_tracepoint(fct, NULL); + return 0; +} + +static void __exit trace_sched_exit(void) +{ +} + +module_init(trace_sched_init); +module_exit(trace_sched_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Custom scheduling events"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace_custom_sched.h b/samples/trace_events/trace_custom_sched.h new file mode 100644 index 000000000000..951388334a3f --- /dev/null +++ b/samples/trace_events/trace_custom_sched.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Like the headers that use TRACE_EVENT(), the TRACE_CUSTOM_EVENT() + * needs a header that allows for multiple inclusions. + * + * Test for a unique name (here we have _TRACE_CUSTOM_SCHED_H), + * also allowing to continue if TRACE_CUSTOM_MULTI_READ is defined. + */ +#if !defined(_TRACE_CUSTOM_SCHED_H) || defined(TRACE_CUSTOM_MULTI_READ) +#define _TRACE_CUSTOM_SCHED_H + +/* Include linux/trace_events.h for initial defines of TRACE_CUSTOM_EVENT() */ +#include <linux/trace_events.h> + +/* + * TRACE_CUSTOM_EVENT() is just like TRACE_EVENT(). The first parameter + * is the event name of an existing event where the TRACE_EVENT has been included + * in the C file before including this file. + */ +TRACE_CUSTOM_EVENT(sched_switch, + + /* + * The TP_PROTO() and TP_ARGS must match the trace event + * that the custom event is using. + */ + TP_PROTO(bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state), + + TP_ARGS(preempt, prev, next, prev_state), + + /* + * The next fields are where the customization happens. + * The TP_STRUCT__entry() defines what will be recorded + * in the ring buffer when the custom event triggers. + * + * The rest is just like the TRACE_EVENT() macro except that + * it uses the custom entry. + */ + TP_STRUCT__entry( + __field( unsigned short, prev_prio ) + __field( unsigned short, next_prio ) + __field( pid_t, next_pid ) + ), + + TP_fast_assign( + __entry->prev_prio = prev->prio; + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("prev_prio=%d next_pid=%d next_prio=%d", + __entry->prev_prio, __entry->next_pid, __entry->next_prio) +) + + +TRACE_CUSTOM_EVENT(sched_waking, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( unsigned short, prio ) + ), + + TP_fast_assign( + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("pid=%d prio=%d", __entry->pid, __entry->prio) +) +#endif +/* + * Just like the headers that create TRACE_EVENTs, the below must + * be outside the protection of the above #if block. + */ + +/* + * It is required that the Makefile includes: + * CFLAGS_<c_file>.o := -I$(src) + */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . + +/* + * It is requred that the TRACE_INCLUDE_FILE be the same + * as this file without the ".h". + */ +#define TRACE_INCLUDE_FILE trace_custom_sched +#include <trace/define_custom_trace.h> diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore new file mode 100644 index 000000000000..0e0a5a929f5d --- /dev/null +++ b/samples/uhid/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/uhid-example diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile index 5f44ea40d6d5..0aa424ec4899 100644 --- a/samples/uhid/Makefile +++ b/samples/uhid/Makefile @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -# List of programs to build -hostprogs := uhid-example +userprogs-always-y += uhid-example -# Tell kbuild to always build the programs -always-y := $(hostprogs) - -HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include +userccflags += -I usr/include diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c index b72d645ce828..015cb06a241e 100644 --- a/samples/uhid/uhid-example.c +++ b/samples/uhid/uhid-example.c @@ -165,7 +165,7 @@ static int uhid_write(int fd, const struct uhid_event *ev) fprintf(stderr, "Cannot write to uhid: %m\n"); return -errno; } else if (ret != sizeof(*ev)) { - fprintf(stderr, "Wrong size written to uhid: %ld != %lu\n", + fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } else { @@ -236,7 +236,7 @@ static int event(int fd) fprintf(stderr, "Cannot read uhid-cdev: %m\n"); return -errno; } else if (ret != sizeof(ev)) { - fprintf(stderr, "Invalid size read from uhid-dev: %ld != %lu\n", + fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } diff --git a/samples/user_events/Makefile b/samples/user_events/Makefile new file mode 100644 index 000000000000..7252b589db57 --- /dev/null +++ b/samples/user_events/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall -I../../usr/include + +example: example.o +example.o: example.c diff --git a/samples/user_events/example.c b/samples/user_events/example.c new file mode 100644 index 000000000000..d06dc24156ec --- /dev/null +++ b/samples/user_events/example.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <asm/bitsperlong.h> +#include <endian.h> +#include <linux/user_events.h> + +#if __BITS_PER_LONG == 64 +#define endian_swap(x) htole64(x) +#else +#define endian_swap(x) htole32(x) +#endif + +/* Assumes debugfs is mounted */ +const char *data_file = "/sys/kernel/debug/tracing/user_events_data"; +const char *status_file = "/sys/kernel/debug/tracing/user_events_status"; + +static int event_status(long **status) +{ + int fd = open(status_file, O_RDONLY); + + *status = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, + MAP_SHARED, fd, 0); + + close(fd); + + if (*status == MAP_FAILED) + return -1; + + return 0; +} + +static int event_reg(int fd, const char *command, long *index, long *mask, + int *write) +{ + struct user_reg reg = {0}; + + reg.size = sizeof(reg); + reg.name_args = (__u64)command; + + if (ioctl(fd, DIAG_IOCSREG, ®) == -1) + return -1; + + *index = reg.status_bit / __BITS_PER_LONG; + *mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG)); + *write = reg.write_index; + + return 0; +} + +int main(int argc, char **argv) +{ + int data_fd, write; + long index, mask; + long *status_page; + struct iovec io[2]; + __u32 count = 0; + + if (event_status(&status_page) == -1) + return errno; + + data_fd = open(data_file, O_RDWR); + + if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1) + return errno; + + /* Setup iovec */ + io[0].iov_base = &write; + io[0].iov_len = sizeof(write); + io[1].iov_base = &count; + io[1].iov_len = sizeof(count); + +ask: + printf("Press enter to check status...\n"); + getchar(); + + /* Check if anyone is listening */ + if (status_page[index] & mask) { + /* Yep, trace out our data */ + writev(data_fd, (const struct iovec *)io, 2); + + /* Increase the count */ + count++; + + printf("Something was attached, wrote data\n"); + } + + goto ask; + + return 0; +} diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c index f6a551bd57ef..a61f94db18d9 100644 --- a/samples/v4l/v4l2-pci-skeleton.c +++ b/samples/v4l/v4l2-pci-skeleton.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source * for use with other PCI drivers. @@ -6,19 +7,6 @@ * input 0 and an HDMI connector as input 1. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. - * - * This program is free software; you may redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. */ #include <linux/types.h> @@ -766,7 +754,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_enable_device(pdev); if (ret) return ret; - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (ret) { dev_err(&pdev->dev, "no suitable DMA available.\n"); goto disable_pci; @@ -879,7 +867,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) vdev->tvnorms = SKEL_TVNORMS; video_set_drvdata(vdev, skel); - ret = video_register_device(vdev, VFL_TYPE_GRABBER, -1); + ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret) goto free_hdl; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 3cc5e5921682..117a8d799f71 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -21,7 +21,6 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -100,36 +99,46 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); #define MBOCHS_TYPE_2 "medium" #define MBOCHS_TYPE_3 "large" -static const struct mbochs_type { - const char *name; +static struct mbochs_type { + struct mdev_type type; u32 mbytes; u32 max_x; u32 max_y; } mbochs_types[] = { { - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, + .type.sysfs_name = MBOCHS_TYPE_1, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, .max_x = 800, .max_y = 600, }, { - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, + .type.sysfs_name = MBOCHS_TYPE_2, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, .max_x = 1920, .max_y = 1440, }, { - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, + .type.sysfs_name = MBOCHS_TYPE_3, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, .max_x = 0, .max_y = 0, }, }; +static struct mdev_type *mbochs_mdev_types[] = { + &mbochs_types[0].type, + &mbochs_types[1].type, + &mbochs_types[2].type, +}; static dev_t mbochs_devt; static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; -static int mbochs_used_mbytes; +static struct mdev_parent mbochs_parent; +static atomic_t mbochs_avail_mbytes; +static const struct vfio_device_ops mbochs_dev_ops; struct vfio_region_info_ext { struct vfio_region_info base; @@ -160,6 +169,7 @@ struct mbochs_dmabuf { /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u64 bar_mask[3]; u32 memory_bar_mask; @@ -205,16 +215,6 @@ static struct page *__mbochs_get_page(struct mdev_state *mdev_state, static struct page *mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff); -static const struct mbochs_type *mbochs_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mbochs_types); i++) - if (strcmp(mbochs_types[i].name, kobj->name) == 0) - return mbochs_types + i; - return NULL; -} - static void mbochs_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -435,11 +435,9 @@ static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, memcpy(buf, mdev_state->edid_blob + offset, count); } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct page *pg; loff_t poff; char *map; @@ -488,7 +486,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, put_page(pg); } else { - dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n", + dev_dbg(mdev_state->vdev.dev, "%s: %s @0x%llx (unhandled)\n", __func__, is_write ? "WR" : "RD", pos); ret = -1; goto accessfailed; @@ -503,9 +501,8 @@ accessfailed: return ret; } -static int mbochs_reset(struct mdev_device *mdev) +static int mbochs_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 size64k = mdev_state->memsize / (64 * 1024); int i; @@ -516,24 +513,25 @@ static int mbochs_reset(struct mdev_device *mdev) return 0; } -static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) +static int mbochs_init_dev(struct vfio_device *vdev) { - const struct mbochs_type *type = mbochs_find_type(kobj); - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; - - if (!type) - type = &mbochs_types[0]; - if (type->mbytes + mbochs_used_mbytes > max_mbytes) - return -ENOMEM; - - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + struct mbochs_type *type = + container_of(mdev->type, struct mbochs_type, type); + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); + int ret = -ENOMEM; + + do { + if (avail_mbytes < type->mbytes) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, + avail_mbytes - type->mbytes)); mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) - goto err_mem; + if (!mdev_state->vconfig) + goto err_avail; mdev_state->memsize = type->mbytes * 1024 * 1024; mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; @@ -541,14 +539,10 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) sizeof(struct page *), GFP_KERNEL); if (!mdev_state->pages) - goto err_mem; - - dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, - kobj->name, type->mbytes, mdev_state->pagecount); + goto err_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); INIT_LIST_HEAD(&mdev_state->dmabufs); mdev_state->next_id = 1; @@ -558,32 +552,64 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); mbochs_create_config_space(mdev_state); - mbochs_reset(mdev); + mbochs_reset(mdev_state); - mbochs_used_mbytes += type->mbytes; + dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, + type->type.pretty_name, type->mbytes, mdev_state->pagecount); return 0; -err_mem: +err_vconfig: kfree(mdev_state->vconfig); - kfree(mdev_state); - return -ENOMEM; +err_avail: + atomic_add(type->mbytes, &mbochs_avail_mbytes); + return ret; } -static int mbochs_remove(struct mdev_device *mdev) +static int mbochs_probe(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state; + int ret = -ENOMEM; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mbochs_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); - mbochs_used_mbytes -= mdev_state->type->mbytes; - mdev_set_drvdata(mdev, NULL); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mbochs_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); - kfree(mdev_state); - return 0; + vfio_free_device(vdev); } -static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, +static void mbochs_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); +} + +static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -593,7 +619,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -605,7 +631,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -617,7 +643,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -640,9 +666,11 @@ read_err: return -EFAULT; } -static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mbochs_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -655,7 +683,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -667,7 +695,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -679,7 +707,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -765,9 +793,10 @@ static const struct vm_operations_struct mbochs_region_vm_ops = { .fault = mbochs_region_vm_fault, }; -static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mbochs_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -846,7 +875,7 @@ static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at, if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount, 0, dmabuf->mode.size, GFP_KERNEL) < 0) goto err2; - if (!dma_map_sg(at->dev, sg->sgl, sg->nents, direction)) + if (dma_map_sgtable(at->dev, sg, direction, 0)) goto err3; return sg; @@ -868,6 +897,7 @@ static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at, dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + dma_unmap_sgtable(at->dev, sg, direction, 0); sg_free_table(sg); kfree(sg); } @@ -973,7 +1003,7 @@ mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) { struct mdev_state *mdev_state = dmabuf->mdev_state; - struct device *dev = mdev_dev(mdev_state->mdev); + struct device *dev = mdev_state->vdev.dev; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct dma_buf *buf; @@ -1001,15 +1031,10 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } -static int mbochs_get_region_info(struct mdev_device *mdev, +static int mbochs_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info_ext *ext) { struct vfio_region_info *region_info = &ext->base; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1057,15 +1082,13 @@ static int mbochs_get_region_info(struct mdev_device *mdev, return 0; } -static int mbochs_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mbochs_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mbochs_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mbochs_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = MBOCHS_NUM_REGIONS; @@ -1073,11 +1096,9 @@ static int mbochs_get_device_info(struct mdev_device *mdev, return 0; } -static int mbochs_query_gfx_plane(struct mdev_device *mdev, +static int mbochs_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct mbochs_dmabuf *dmabuf; struct mbochs_mode mode; int ret; @@ -1131,18 +1152,16 @@ static int mbochs_query_gfx_plane(struct mdev_device *mdev, done: if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && mdev_state->active_id != plane->dmabuf_id) { - dev_dbg(dev, "%s: primary: %d => %d\n", __func__, - mdev_state->active_id, plane->dmabuf_id); + dev_dbg(mdev_state->vdev.dev, "%s: primary: %d => %d\n", + __func__, mdev_state->active_id, plane->dmabuf_id); mdev_state->active_id = plane->dmabuf_id; } mutex_unlock(&mdev_state->ops_lock); return 0; } -static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, - u32 id) +static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); struct mbochs_dmabuf *dmabuf; mutex_lock(&mdev_state->ops_lock); @@ -1164,9 +1183,11 @@ static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, return dma_buf_fd(dmabuf->buf, 0); } -static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, - unsigned long arg) +static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz, outsz; @@ -1183,7 +1204,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mbochs_get_device_info(mdev, &info); + ret = mbochs_get_device_info(&info); if (ret) return ret; @@ -1207,7 +1228,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (outsz > sizeof(info)) return -EINVAL; - ret = mbochs_get_region_info(mdev, &info); + ret = mbochs_get_region_info(mdev_state, &info); if (ret) return ret; @@ -1230,7 +1251,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= VFIO_PCI_NUM_IRQS)) return -EINVAL; - ret = mbochs_get_irq_info(mdev, &info); + ret = mbochs_get_irq_info(&info); if (ret) return ret; @@ -1253,7 +1274,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mbochs_query_gfx_plane(mdev, &plane); + ret = mbochs_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -1270,29 +1291,22 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (get_user(dmabuf_id, (__u32 __user *)arg)) return -EFAULT; - return mbochs_get_gfx_dmabuf(mdev, dmabuf_id); + return mbochs_get_gfx_dmabuf(mdev_state, dmabuf_id); } case VFIO_DEVICE_SET_IRQS: return -EINVAL; case VFIO_DEVICE_RESET: - return mbochs_reset(mdev); + return mbochs_reset(mdev_state); } return -ENOTTY; } -static int mbochs_open(struct mdev_device *mdev) -{ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - return 0; -} - -static void mbochs_close(struct mdev_device *mdev) +static void mbochs_close_device(struct vfio_device *vdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); struct mbochs_dmabuf *dmabuf, *tmp; mutex_lock(&mdev_state->ops_lock); @@ -1309,15 +1323,13 @@ static void mbochs_close(struct mdev_device *mdev) mbochs_put_pages(mdev_state); mutex_unlock(&mdev_state->ops_lock); - module_put(THIS_MODULE); } static ssize_t memory_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); } @@ -1333,87 +1345,50 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t mbochs_show_description(struct mdev_type *mtype, char *buf) { - return sprintf(buf, "%s\n", kobj->name); -} -MDEV_TYPE_ATTR_RO(name); - -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) -{ - const struct mbochs_type *type = mbochs_find_type(kobj); + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); } -MDEV_TYPE_ATTR_RO(description); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static unsigned int mbochs_get_available(struct mdev_type *mtype) { - const struct mbochs_type *type = mbochs_find_type(kobj); - int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; - - return sprintf(buf, "%d\n", count); -} -MDEV_TYPE_ATTR_RO(available_instances); + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); + return atomic_read(&mbochs_avail_mbytes) / type->mbytes; } -MDEV_TYPE_ATTR_RO(device_api); - -static struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, - &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, - &mdev_type_attr_available_instances.attr, - NULL, -}; - -static struct attribute_group mdev_type_group1 = { - .name = MBOCHS_TYPE_1, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = MBOCHS_TYPE_2, - .attrs = mdev_types_attrs, -}; -static struct attribute_group mdev_type_group3 = { - .name = MBOCHS_TYPE_3, - .attrs = mdev_types_attrs, +static const struct vfio_device_ops mbochs_dev_ops = { + .close_device = mbochs_close_device, + .init = mbochs_init_dev, + .release = mbochs_release_dev, + .read = mbochs_read, + .write = mbochs_write, + .ioctl = mbochs_ioctl, + .mmap = mbochs_mmap, }; -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - -static const struct mdev_parent_ops mdev_fops = { - .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, - .supported_type_groups = mdev_type_groups, - .create = mbochs_create, - .remove = mbochs_remove, - .open = mbochs_open, - .release = mbochs_close, - .read = mbochs_read, - .write = mbochs_write, - .ioctl = mbochs_ioctl, - .mmap = mbochs_mmap, +static struct mdev_driver mbochs_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .driver = { + .name = "mbochs", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mbochs_probe, + .remove = mbochs_remove, + .get_available = mbochs_get_available, + .show_description = mbochs_show_description, }; static const struct file_operations vd_fops = { @@ -1429,6 +1404,8 @@ static int __init mbochs_dev_init(void) { int ret = 0; + atomic_set(&mbochs_avail_mbytes, max_mbytes); + ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); if (ret < 0) { pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); @@ -1438,11 +1415,15 @@ static int __init mbochs_dev_init(void) cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); + ret = mdev_register_driver(&mbochs_driver); + if (ret) + goto err_cdev; + mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME); if (IS_ERR(mbochs_class)) { pr_err("Error: failed to register mbochs_dev class\n"); ret = PTR_ERR(mbochs_class); - goto failed1; + goto err_driver; } mbochs_dev.class = mbochs_class; mbochs_dev.release = mbochs_device_release; @@ -1450,19 +1431,23 @@ static int __init mbochs_dev_init(void) ret = device_register(&mbochs_dev); if (ret) - goto failed2; + goto err_class; - ret = mdev_register_device(&mbochs_dev, &mdev_fops); + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, + mbochs_mdev_types, + ARRAY_SIZE(mbochs_mdev_types)); if (ret) - goto failed3; + goto err_device; return 0; -failed3: +err_device: device_unregister(&mbochs_dev); -failed2: +err_class: class_destroy(mbochs_class); -failed1: +err_driver: + mdev_unregister_driver(&mbochs_driver); +err_cdev: cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); return ret; @@ -1471,14 +1456,16 @@ failed1: static void __exit mbochs_dev_exit(void) { mbochs_dev.bus = NULL; - mdev_unregister_device(&mbochs_dev); + mdev_unregister_parent(&mbochs_parent); device_unregister(&mbochs_dev); + mdev_unregister_driver(&mbochs_driver); cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); class_destroy(mbochs_class); mbochs_class = NULL; } +MODULE_IMPORT_NS(DMA_BUF); module_init(mbochs_dev_init) module_exit(mbochs_dev_exit) diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h index eb26421b6429..961c55ec3ffd 100644 --- a/samples/vfio-mdev/mdpy-defs.h +++ b/samples/vfio-mdev/mdpy-defs.h @@ -9,7 +9,7 @@ */ /* pci ids */ -#define MDPY_PCI_VENDOR_ID 0x1b36 /* redhat */ +#define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT #define MDPY_PCI_DEVICE_ID 0x000f #define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET #define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c index 21dbf63d6e41..9ec93d90e8a5 100644 --- a/samples/vfio-mdev/mdpy-fb.c +++ b/samples/vfio-mdev/mdpy-fb.c @@ -117,22 +117,27 @@ static int mdpy_fb_probe(struct pci_dev *pdev, if (format != DRM_FORMAT_XRGB8888) { pci_err(pdev, "format mismatch (0x%x != 0x%x)\n", format, DRM_FORMAT_XRGB8888); - return -EINVAL; + ret = -EINVAL; + goto err_release_regions; } if (width < 100 || width > 10000) { pci_err(pdev, "width (%d) out of range\n", width); - return -EINVAL; + ret = -EINVAL; + goto err_release_regions; } if (height < 100 || height > 10000) { pci_err(pdev, "height (%d) out of range\n", height); - return -EINVAL; + ret = -EINVAL; + goto err_release_regions; } pci_info(pdev, "mdpy found: %dx%d framebuffer\n", width, height); info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev); - if (!info) + if (!info) { + ret = -ENOMEM; goto err_release_regions; + } pci_set_drvdata(pdev, info); par = info->par; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index cc86bf6566e4..946e8cfde6fd 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -17,7 +17,6 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -43,36 +42,34 @@ MODULE_LICENSE("GPL v2"); -static int max_devices = 4; -module_param_named(count, max_devices, int, 0444); -MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); - - #define MDPY_TYPE_1 "vga" #define MDPY_TYPE_2 "xga" #define MDPY_TYPE_3 "hd" -static const struct mdpy_type { - const char *name; +static struct mdpy_type { + struct mdev_type type; u32 format; u32 bytepp; u32 width; u32 height; } mdpy_types[] = { { - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, + .type.sysfs_name = MDPY_TYPE_1, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 640, .height = 480, }, { - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, + .type.sysfs_name = MDPY_TYPE_2, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1024, .height = 768, }, { - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, + .type.sysfs_name = MDPY_TYPE_3, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1920, @@ -80,14 +77,22 @@ static const struct mdpy_type { }, }; +static struct mdev_type *mdpy_mdev_types[] = { + &mdpy_types[0].type, + &mdpy_types[1].type, + &mdpy_types[2].type, +}; + static dev_t mdpy_devt; static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; -static u32 mdpy_count; +static struct mdev_parent mdpy_parent; +static const struct vfio_device_ops mdpy_dev_ops; /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u32 bar_mask; struct mutex ops_lock; @@ -99,16 +104,6 @@ struct mdev_state { void *memblk; }; -static const struct mdpy_type *mdpy_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mdpy_types); i++) - if (strcmp(mdpy_types[i].name, kobj->name) == 0) - return mdpy_types + i; - return NULL; -} - static void mdpy_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -172,11 +167,9 @@ static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, } } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); int ret = 0; mutex_lock(&mdev_state->ops_lock); @@ -197,8 +190,9 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, memcpy(buf, mdev_state->memblk, count); } else { - dev_info(dev, "%s: %s @0x%llx (unhandled)\n", - __func__, is_write ? "WR" : "RD", pos); + dev_info(mdev_state->vdev.dev, + "%s: %s @0x%llx (unhandled)\n", __func__, + is_write ? "WR" : "RD", pos); ret = -1; goto accessfailed; } @@ -212,9 +206,8 @@ accessfailed: return ret; } -static int mdpy_reset(struct mdev_device *mdev) +static int mdpy_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 stride, i; /* initialize with gray gradient */ @@ -226,71 +219,88 @@ static int mdpy_reset(struct mdev_device *mdev) return 0; } -static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) +static int mdpy_init_dev(struct vfio_device *vdev) { - const struct mdpy_type *type = mdpy_find_type(kobj); - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + const struct mdpy_type *type = + container_of(mdev->type, struct mdpy_type, type); u32 fbsize; - - if (mdpy_count >= max_devices) - return -ENOMEM; - - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; + int ret = -ENOMEM; mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; - } + if (!mdev_state->vconfig) + return ret; - if (!type) - type = &mdpy_types[0]; fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); - if (!mdev_state->memblk) { - kfree(mdev_state->vconfig); - kfree(mdev_state); - return -ENOMEM; - } - dev_info(dev, "%s: %s (%dx%d)\n", - __func__, kobj->name, type->width, type->height); + if (!mdev_state->memblk) + goto out_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); - - mdev_state->type = type; + mdev_state->type = type; mdev_state->memsize = fbsize; mdpy_create_config_space(mdev_state); - mdpy_reset(mdev); + mdpy_reset(mdev_state); - mdpy_count++; + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, + type->width, type->height); return 0; + +out_vconfig: + kfree(mdev_state->vconfig); + return ret; } -static int mdpy_remove(struct mdev_device *mdev) +static int mdpy_probe(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mdpy_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; - dev_info(dev, "%s\n", __func__); +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mdpy_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); - mdev_set_drvdata(mdev, NULL); vfree(mdev_state->memblk); kfree(mdev_state->vconfig); - kfree(mdev_state); + vfio_free_device(vdev); +} - mdpy_count--; - return 0; +static void mdpy_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + + dev_info(&mdev->dev, "%s\n", __func__); + + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); } -static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -300,8 +310,8 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), - *ppos, false); + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); if (ret <= 0) goto read_err; @@ -312,7 +322,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -324,7 +334,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -347,9 +357,11 @@ read_err: return -EFAULT; } -static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mdpy_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -362,7 +374,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -374,7 +386,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -386,7 +398,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -404,9 +416,10 @@ write_err: return -EFAULT; } -static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -417,21 +430,13 @@ static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; - return remap_vmalloc_range_partial(vma, vma->vm_start, - mdev_state->memblk, - vma->vm_end - vma->vm_start); + return remap_vmalloc_range(vma, mdev_state->memblk, 0); } -static int mdpy_get_region_info(struct mdev_device *mdev, +static int mdpy_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - if (region_info->index >= VFIO_PCI_NUM_REGIONS && region_info->index != MDPY_DISPLAY_REGION) return -EINVAL; @@ -460,15 +465,13 @@ static int mdpy_get_region_info(struct mdev_device *mdev, return 0; } -static int mdpy_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mdpy_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mdpy_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mdpy_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -476,11 +479,9 @@ static int mdpy_get_device_info(struct mdev_device *mdev, return 0; } -static int mdpy_query_gfx_plane(struct mdev_device *mdev, +static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_REGION)) @@ -509,14 +510,13 @@ static int mdpy_query_gfx_plane(struct mdev_device *mdev, return 0; } -static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -531,7 +531,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_device_info(mdev, &info); + ret = mdpy_get_device_info(&info); if (ret) return ret; @@ -556,7 +556,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_region_info(mdev, &info, &cap_type_id, + ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -580,7 +580,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mdpy_get_irq_info(mdev, &info); + ret = mdpy_get_irq_info(&info); if (ret) return ret; @@ -603,7 +603,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mdpy_query_gfx_plane(mdev, &plane); + ret = mdpy_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -617,30 +617,16 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, return -EINVAL; case VFIO_DEVICE_RESET: - return mdpy_reset(mdev); + return mdpy_reset(mdev_state); } return -ENOTTY; } -static int mdpy_open(struct mdev_device *mdev) -{ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - return 0; -} - -static void mdpy_close(struct mdev_device *mdev) -{ - module_put(THIS_MODULE); -} - static ssize_t resolution_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%dx%d\n", mdev_state->type->width, @@ -658,85 +644,40 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) { - return sprintf(buf, "%s\n", kobj->name); -} -MDEV_TYPE_ATTR_RO(name); - -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) -{ - const struct mdpy_type *type = mdpy_find_type(kobj); + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "virtual display, %dx%d framebuffer\n", - type ? type->width : 0, - type ? type->height : 0); -} -MDEV_TYPE_ATTR_RO(description); - -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) -{ - return sprintf(buf, "%d\n", max_devices - mdpy_count); + type->width, type->height); } -MDEV_TYPE_ATTR_RO(available_instances); - -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} -MDEV_TYPE_ATTR_RO(device_api); - -static struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, - &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, - &mdev_type_attr_available_instances.attr, - NULL, -}; -static struct attribute_group mdev_type_group1 = { - .name = MDPY_TYPE_1, - .attrs = mdev_types_attrs, +static const struct vfio_device_ops mdpy_dev_ops = { + .init = mdpy_init_dev, + .release = mdpy_release_dev, + .read = mdpy_read, + .write = mdpy_write, + .ioctl = mdpy_ioctl, + .mmap = mdpy_mmap, }; -static struct attribute_group mdev_type_group2 = { - .name = MDPY_TYPE_2, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group3 = { - .name = MDPY_TYPE_3, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - -static const struct mdev_parent_ops mdev_fops = { - .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, - .supported_type_groups = mdev_type_groups, - .create = mdpy_create, - .remove = mdpy_remove, - .open = mdpy_open, - .release = mdpy_close, - .read = mdpy_read, - .write = mdpy_write, - .ioctl = mdpy_ioctl, - .mmap = mdpy_mmap, +static struct mdev_driver mdpy_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .max_instances = 4, + .driver = { + .name = "mdpy", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mdpy_probe, + .remove = mdpy_remove, + .show_description = mdpy_show_description, }; static const struct file_operations vd_fops = { @@ -761,11 +702,15 @@ static int __init mdpy_dev_init(void) cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); + ret = mdev_register_driver(&mdpy_driver); + if (ret) + goto err_cdev; + mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME); if (IS_ERR(mdpy_class)) { pr_err("Error: failed to register mdpy_dev class\n"); ret = PTR_ERR(mdpy_class); - goto failed1; + goto err_driver; } mdpy_dev.class = mdpy_class; mdpy_dev.release = mdpy_device_release; @@ -773,19 +718,23 @@ static int __init mdpy_dev_init(void) ret = device_register(&mdpy_dev); if (ret) - goto failed2; + goto err_class; - ret = mdev_register_device(&mdpy_dev, &mdev_fops); + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, + mdpy_mdev_types, + ARRAY_SIZE(mdpy_mdev_types)); if (ret) - goto failed3; + goto err_device; return 0; -failed3: +err_device: device_unregister(&mdpy_dev); -failed2: +err_class: class_destroy(mdpy_class); -failed1: +err_driver: + mdev_unregister_driver(&mdpy_driver); +err_cdev: cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); return ret; @@ -794,14 +743,18 @@ failed1: static void __exit mdpy_dev_exit(void) { mdpy_dev.bus = NULL; - mdev_unregister_device(&mdpy_dev); + mdev_unregister_parent(&mdpy_parent); device_unregister(&mdpy_dev); + mdev_unregister_driver(&mdpy_driver); cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); class_destroy(mdpy_class); mdpy_class = NULL; } +module_param_named(count, mdpy_driver.max_instances, int, 0444); +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); + module_init(mdpy_dev_init) module_exit(mdpy_dev_exit) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ce84a300a4da..e72085fc1376 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/poll.h> @@ -20,7 +19,6 @@ #include <linux/cdev.h> #include <linux/sched.h> #include <linux/wait.h> -#include <linux/uuid.h> #include <linux/vfio.h> #include <linux/iommu.h> #include <linux/sysfs.h> @@ -74,6 +72,7 @@ static struct mtty_dev { struct cdev vd_cdev; struct idr vd_idr; struct device dev; + struct mdev_parent parent; } mtty_dev; struct mdev_region_info { @@ -127,6 +126,7 @@ struct serial_port { /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; int irq_fd; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; @@ -143,13 +143,29 @@ struct mdev_state { int nr_ports; }; -static struct mutex mdev_list_lock; -static struct list_head mdev_devices_list; +static struct mtty_type { + struct mdev_type type; + int nr_ports; +} mtty_types[2] = { + { .nr_ports = 1, .type.sysfs_name = "1", + .type.pretty_name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", + .type.pretty_name = "Dual port serial" }, +}; + +static struct mdev_type *mtty_mdev_types[] = { + &mtty_types[0].type, + &mtty_types[1].type, +}; + +static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { .owner = THIS_MODULE, }; +static const struct vfio_device_ops mtty_dev_ops; + /* function prototypes */ static int mtty_trigger_interrupt(struct mdev_state *mdev_state); @@ -631,23 +647,16 @@ static void mdev_read_base(struct mdev_state *mdev_state) } } -static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, +static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state; unsigned int index; loff_t offset; int ret = 0; - if (!mdev || !buf) + if (!buf) return -EINVAL; - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) { - pr_err("%s mdev_state not found\n", __func__); - return -EINVAL; - } - mutex_lock(&mdev_state->ops_lock); index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos); @@ -708,97 +717,96 @@ accessfailed: return ret; } -static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +static int mtty_init_dev(struct vfio_device *vdev) { - struct mdev_state *mdev_state; - char name[MTTY_STRING_LEN]; - int nr_ports = 0, i; - - if (!mdev) - return -EINVAL; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(mdev_parent_dev(mdev)), i + 1); - if (!strcmp(kobj->name, name)) { - nr_ports = i + 1; - break; - } - } - - if (!nr_ports) - return -EINVAL; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + struct mtty_type *type = + container_of(mdev->type, struct mtty_type, type); + int avail_ports = atomic_read(&mdev_avail_ports); + int ret; - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; + do { + if (avail_ports < type->nr_ports) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mdev_avail_ports, + &avail_ports, + avail_ports - type->nr_ports)); - mdev_state->nr_ports = nr_ports; + mdev_state->nr_ports = type->nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; mutex_init(&mdev_state->rxtx_lock); - mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) { + ret = -ENOMEM; + goto err_nr_ports; } mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); - mtty_create_config_space(mdev_state); - - mutex_lock(&mdev_list_lock); - list_add(&mdev_state->next, &mdev_devices_list); - mutex_unlock(&mdev_list_lock); - return 0; + +err_nr_ports: + atomic_add(type->nr_ports, &mdev_avail_ports); + return ret; } -static int mtty_remove(struct mdev_device *mdev) +static int mtty_probe(struct mdev_device *mdev) { - struct mdev_state *mds, *tmp_mds; - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - int ret = -EINVAL; - - mutex_lock(&mdev_list_lock); - list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { - if (mdev_state == mds) { - list_del(&mdev_state->next); - mdev_set_drvdata(mdev, NULL); - kfree(mdev_state->vconfig); - kfree(mdev_state); - ret = 0; - break; - } - } - mutex_unlock(&mdev_list_lock); + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mtty_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; +err_put_vdev: + vfio_put_device(&mdev_state->vdev); return ret; } -static int mtty_reset(struct mdev_device *mdev) +static void mtty_release_dev(struct vfio_device *vdev) { - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); - if (!mdev) - return -EINVAL; + atomic_add(mdev_state->nr_ports, &mdev_avail_ports); + kfree(mdev_state->vconfig); + vfio_free_device(vdev); +} - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; +static void mtty_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); +} + +static int mtty_reset(struct mdev_state *mdev_state) +{ pr_info("%s: called\n", __func__); return 0; } -static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mtty_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -808,7 +816,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -820,7 +828,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -832,7 +840,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -855,9 +863,11 @@ read_err: return -EFAULT; } -static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -870,7 +880,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -882,7 +892,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -894,7 +904,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -912,19 +922,11 @@ write_err: return -EFAULT; } -static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, +static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned int count, void *data) { int ret = 0; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; mutex_lock(&mdev_state->ops_lock); switch (index) { @@ -1040,21 +1042,13 @@ static int mtty_trigger_interrupt(struct mdev_state *mdev_state) return ret; } -static int mtty_get_region_info(struct mdev_device *mdev, +static int mtty_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { unsigned int size = 0; - struct mdev_state *mdev_state; u32 bar_index; - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - bar_index = region_info->index; if (bar_index >= VFIO_PCI_NUM_REGIONS) return -EINVAL; @@ -1089,8 +1083,7 @@ static int mtty_get_region_info(struct mdev_device *mdev, return 0; } -static int mtty_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { switch (irq_info->index) { case VFIO_PCI_INTX_IRQ_INDEX: @@ -1114,8 +1107,7 @@ static int mtty_get_irq_info(struct mdev_device *mdev, return 0; } -static int mtty_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mtty_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -1124,19 +1116,13 @@ static int mtty_get_device_info(struct mdev_device *mdev, return 0; } -static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -ENODEV; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1151,7 +1137,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_device_info(mdev, &info); + ret = mtty_get_device_info(&info); if (ret) return ret; @@ -1176,7 +1162,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_region_info(mdev, &info, &cap_type_id, + ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -1200,7 +1186,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mtty_get_irq_info(mdev, &info); + ret = mtty_get_irq_info(&info); if (ret) return ret; @@ -1234,61 +1220,23 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, return PTR_ERR(data); } - ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, + ret = mtty_set_irqs(mdev_state, hdr.flags, hdr.index, hdr.start, hdr.count, data); kfree(ptr); return ret; } case VFIO_DEVICE_RESET: - return mtty_reset(mdev); + return mtty_reset(mdev_state); } return -ENOTTY; } -static int mtty_open(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); - return 0; -} - -static void mtty_close(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); -} - -static ssize_t -sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "This is phy device\n"); -} - -static DEVICE_ATTR_RO(sample_mtty_dev); - -static struct attribute *mtty_dev_attrs[] = { - &dev_attr_sample_mtty_dev.attr, - NULL, -}; - -static const struct attribute_group mtty_dev_group = { - .name = "mtty_dev", - .attrs = mtty_dev_attrs, -}; - -static const struct attribute_group *mtty_dev_groups[] = { - &mtty_dev_group, - NULL, -}; - static ssize_t sample_mdev_dev_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (mdev_from_dev(dev)) - return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); - - return sprintf(buf, "\n"); + return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); } static DEVICE_ATTR_RO(sample_mdev_dev); @@ -1308,97 +1256,33 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static unsigned int mtty_get_available(struct mdev_type *mtype) { - char name[MTTY_STRING_LEN]; - int i; - const char *name_str[2] = {"Single port serial", "Dual port serial"}; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) - return sprintf(buf, "%s\n", name_str[i]); - } - - return -EINVAL; + return atomic_read(&mdev_avail_ports) / type->nr_ports; } -static MDEV_TYPE_ATTR_RO(name); - -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) -{ - char name[MTTY_STRING_LEN]; - int i; - struct mdev_state *mds; - int ports = 0, used = 0; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) { - ports = i + 1; - break; - } - } - - if (!ports) - return -EINVAL; - - list_for_each_entry(mds, &mdev_devices_list, next) - used += mds->nr_ports; - - return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); -} - -static MDEV_TYPE_ATTR_RO(available_instances); - - -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} - -static MDEV_TYPE_ATTR_RO(device_api); - -static struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, - &mdev_type_attr_available_instances.attr, - NULL, +static const struct vfio_device_ops mtty_dev_ops = { + .name = "vfio-mtty", + .init = mtty_init_dev, + .release = mtty_release_dev, + .read = mtty_read, + .write = mtty_write, + .ioctl = mtty_ioctl, }; -static struct attribute_group mdev_type_group1 = { - .name = "1", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = "2", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - NULL, -}; - -static const struct mdev_parent_ops mdev_fops = { - .owner = THIS_MODULE, - .dev_attr_groups = mtty_dev_groups, - .mdev_attr_groups = mdev_dev_groups, - .supported_type_groups = mdev_type_groups, - .create = mtty_create, - .remove = mtty_remove, - .open = mtty_open, - .release = mtty_close, - .read = mtty_read, - .write = mtty_write, - .ioctl = mtty_ioctl, +static struct mdev_driver mtty_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .driver = { + .name = "mtty", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mtty_probe, + .remove = mtty_remove, + .get_available = mtty_get_available, }; static void mtty_device_release(struct device *dev) @@ -1429,12 +1313,16 @@ static int __init mtty_dev_init(void) pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); + ret = mdev_register_driver(&mtty_driver); + if (ret) + goto err_cdev; + mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); if (IS_ERR(mtty_dev.vd_class)) { pr_err("Error: failed to register mtty_dev class\n"); ret = PTR_ERR(mtty_dev.vd_class); - goto failed1; + goto err_driver; } mtty_dev.dev.class = mtty_dev.vd_class; @@ -1443,38 +1331,35 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto failed2; + goto err_class; - ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); + ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, + &mtty_driver, mtty_mdev_types, + ARRAY_SIZE(mtty_mdev_types)); if (ret) - goto failed3; - - mutex_init(&mdev_list_lock); - INIT_LIST_HEAD(&mdev_devices_list); - - goto all_done; - -failed3: + goto err_device; + return 0; +err_device: device_unregister(&mtty_dev.dev); -failed2: +err_class: class_destroy(mtty_dev.vd_class); - -failed1: +err_driver: + mdev_unregister_driver(&mtty_driver); +err_cdev: cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); - -all_done: return ret; } static void __exit mtty_dev_exit(void) { mtty_dev.dev.bus = NULL; - mdev_unregister_device(&mtty_dev.dev); + mdev_unregister_parent(&mtty_dev.parent); device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); + mdev_unregister_driver(&mtty_driver); cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); class_destroy(mtty_dev.vd_class); diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore index 0806eb0be62d..79212d91285b 100644 --- a/samples/vfs/.gitignore +++ b/samples/vfs/.gitignore @@ -1,2 +1,3 @@ -test-fsmount -test-statx +# SPDX-License-Identifier: GPL-2.0-only +/test-fsmount +/test-statx diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index 65acdde5c117..6377a678134a 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,10 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -# List of programs to build -hostprogs := \ - test-fsmount \ - test-statx +userprogs-always-y += test-fsmount test-statx -always-y := $(hostprogs) - -HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include -HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include +userccflags += -I usr/include diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index a3d68159fb51..49c7a46cee07 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -23,6 +23,8 @@ #include <linux/fcntl.h> #define statx foo #define statx_timestamp foo_timestamp +struct statx; +struct statx_timestamp; #include <sys/stat.h> #undef statx #undef statx_timestamp @@ -216,7 +218,7 @@ int main(int argc, char **argv) struct statx stx; int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; - unsigned int mask = STATX_ALL; + unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; for (argv++; *argv; argv++) { if (strcmp(*argv, "-F") == 0) { diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore new file mode 100644 index 000000000000..823b351d3db9 --- /dev/null +++ b/samples/watch_queue/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/watch_test diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile new file mode 100644 index 000000000000..c0db3a6bc524 --- /dev/null +++ b/samples/watch_queue/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += watch_test + +userccflags += -I usr/include diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c new file mode 100644 index 000000000000..8c6cb57d5cfc --- /dev/null +++ b/samples/watch_queue/watch_test.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Use watch_queue API to watch for notifications. + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define _GNU_SOURCE +#include <stdbool.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <limits.h> +#include <linux/watch_queue.h> +#include <linux/unistd.h> +#include <linux/keyctl.h> + +#ifndef KEYCTL_WATCH_KEY +#define KEYCTL_WATCH_KEY -1 +#endif +#ifndef __NR_keyctl +#define __NR_keyctl -1 +#endif + +#define BUF_SIZE 256 + +static long keyctl_watch_key(int key, int watch_fd, int watch_id) +{ + return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); +} + +static const char *key_subtypes[256] = { + [NOTIFY_KEY_INSTANTIATED] = "instantiated", + [NOTIFY_KEY_UPDATED] = "updated", + [NOTIFY_KEY_LINKED] = "linked", + [NOTIFY_KEY_UNLINKED] = "unlinked", + [NOTIFY_KEY_CLEARED] = "cleared", + [NOTIFY_KEY_REVOKED] = "revoked", + [NOTIFY_KEY_INVALIDATED] = "invalidated", + [NOTIFY_KEY_SETATTR] = "setattr", +}; + +static void saw_key_change(struct watch_notification *n, size_t len) +{ + struct key_notification *k = (struct key_notification *)n; + + if (len != sizeof(struct key_notification)) { + fprintf(stderr, "Incorrect key message length\n"); + return; + } + + printf("KEY %08x change=%u[%s] aux=%u\n", + k->key_id, n->subtype, key_subtypes[n->subtype], k->aux); +} + +/* + * Consume and display events. + */ +static void consumer(int fd) +{ + unsigned char buffer[433], *p, *end; + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + ssize_t buf_len; + + for (;;) { + buf_len = read(fd, buffer, sizeof(buffer)); + if (buf_len == -1) { + perror("read"); + exit(1); + } + + if (buf_len == 0) { + printf("-- END --\n"); + return; + } + + if (buf_len > sizeof(buffer)) { + fprintf(stderr, "Read buffer overrun: %zd\n", buf_len); + return; + } + + printf("read() = %zd\n", buf_len); + + p = buffer; + end = buffer + buf_len; + while (p < end) { + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + if (largest < sizeof(struct watch_notification)) { + fprintf(stderr, "Short message header: %zu\n", largest); + return; + } + memcpy(&n, p, largest); + + printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n", + p - buffer, n.n.type, n.n.subtype, n.n.info); + + len = n.n.info & WATCH_INFO_LENGTH; + if (len < sizeof(n.n) || len > largest) { + fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest); + exit(1); + } + + switch (n.n.type) { + case WATCH_TYPE_META: + switch (n.n.subtype) { + case WATCH_META_REMOVAL_NOTIFICATION: + printf("REMOVAL of watchpoint %08x\n", + (n.n.info & WATCH_INFO_ID) >> + WATCH_INFO_ID__SHIFT); + break; + case WATCH_META_LOSS_NOTIFICATION: + printf("-- LOSS --\n"); + break; + default: + printf("other meta record\n"); + break; + } + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n, len); + break; + default: + printf("other type\n"); + break; + } + + p += len; + } + } +} + +static struct watch_notification_filter filter = { + .nr_filters = 1, + .filters = { + [0] = { + .type = WATCH_TYPE_KEY_NOTIFY, + .subtype_filter[0] = UINT_MAX, + }, + }, +}; + +int main(int argc, char **argv) +{ + int pipefd[2], fd; + + if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) { + perror("pipe2"); + exit(1); + } + fd = pipefd[0]; + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) { + perror("watch_queue(size)"); + exit(1); + } + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) { + perror("watch_queue(filter)"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) { + perror("keyctl"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) { + perror("keyctl"); + exit(1); + } + + consumer(fd); + exit(0); +} diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore index ff0ebb540333..a70a0150ed9f 100644 --- a/samples/watchdog/.gitignore +++ b/samples/watchdog/.gitignore @@ -1 +1,2 @@ -watchdog-simple +# SPDX-License-Identifier: GPL-2.0-only +/watchdog-simple diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile index a9430fa60253..ab39d23dc96b 100644 --- a/samples/watchdog/Makefile +++ b/samples/watchdog/Makefile @@ -1,9 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 -CC := $(CROSS_COMPILE)gcc -PROGS := watchdog-simple - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - +userprogs-always-y += watchdog-simple |