diff options
Diffstat (limited to 'tools/testing/selftests')
510 files changed, 30482 insertions, 3670 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bc3299a20338..c852eb40c4f7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -35,9 +35,11 @@ TARGETS += memory-hotplug TARGETS += mincore TARGETS += mount TARGETS += mount_setattr +TARGETS += move_mount_set_group TARGETS += mqueue TARGETS += nci TARGETS += net +TARGETS += net/af_unix TARGETS += net/forwarding TARGETS += net/mptcp TARGETS += netfilter @@ -49,6 +51,7 @@ TARGETS += proc TARGETS += pstore TARGETS += ptrace TARGETS += openat2 +TARGETS += rlimits TARGETS += rseq TARGETS += rtc TARGETS += seccomp diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore index d66f76d2a650..b67395903b9b 100644 --- a/tools/testing/selftests/arm64/fp/.gitignore +++ b/tools/testing/selftests/arm64/fp/.gitignore @@ -1,5 +1,7 @@ fpsimd-test +rdvl-sve sve-probe-vls sve-ptrace sve-test +vec-syscfg vlset diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile index a57009d3a0dc..f2abdd6ba12e 100644 --- a/tools/testing/selftests/arm64/fp/Makefile +++ b/tools/testing/selftests/arm64/fp/Makefile @@ -1,17 +1,22 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -I../../../../../usr/include/ -TEST_GEN_PROGS := sve-ptrace sve-probe-vls -TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress sve-test sve-stress vlset +TEST_GEN_PROGS := sve-ptrace sve-probe-vls vec-syscfg +TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress \ + rdvl-sve \ + sve-test sve-stress \ + vlset all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED) fpsimd-test: fpsimd-test.o $(CC) -nostdlib $^ -o $@ +rdvl-sve: rdvl-sve.o rdvl.o sve-ptrace: sve-ptrace.o sve-ptrace-asm.o -sve-probe-vls: sve-probe-vls.o +sve-probe-vls: sve-probe-vls.o rdvl.o sve-test: sve-test.o $(CC) -nostdlib $^ -o $@ +vec-syscfg: vec-syscfg.o rdvl.o vlset: vlset.o include ../../lib.mk diff --git a/tools/testing/selftests/arm64/fp/TODO b/tools/testing/selftests/arm64/fp/TODO new file mode 100644 index 000000000000..b6b7ebfcf362 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/TODO @@ -0,0 +1,4 @@ +- Test unsupported values in the ABIs. +- More coverage for ptrace (eg, vector length conversions). +- Coverage for signals. +- Test PR_SVE_VL_INHERITY after a double fork. diff --git a/tools/testing/selftests/arm64/fp/rdvl-sve.c b/tools/testing/selftests/arm64/fp/rdvl-sve.c new file mode 100644 index 000000000000..7f8a13a18f5d --- /dev/null +++ b/tools/testing/selftests/arm64/fp/rdvl-sve.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <stdio.h> + +#include "rdvl.h" + +int main(void) +{ + int vl = rdvl_sve(); + + printf("%d\n", vl); + + return 0; +} diff --git a/tools/testing/selftests/arm64/fp/rdvl.S b/tools/testing/selftests/arm64/fp/rdvl.S new file mode 100644 index 000000000000..c916c1c9defd --- /dev/null +++ b/tools/testing/selftests/arm64/fp/rdvl.S @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2021 ARM Limited. + +.arch_extension sve + +.globl rdvl_sve +rdvl_sve: + hint 34 // BTI C + rdvl x0, #1 + ret diff --git a/tools/testing/selftests/arm64/fp/rdvl.h b/tools/testing/selftests/arm64/fp/rdvl.h new file mode 100644 index 000000000000..7c9d953fc9e7 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/rdvl.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef RDVL_H +#define RDVL_H + +int rdvl_sve(void); + +#endif diff --git a/tools/testing/selftests/arm64/fp/sve-probe-vls.c b/tools/testing/selftests/arm64/fp/sve-probe-vls.c index b29cbc642c57..a24eca7a4ecb 100644 --- a/tools/testing/selftests/arm64/fp/sve-probe-vls.c +++ b/tools/testing/selftests/arm64/fp/sve-probe-vls.c @@ -13,6 +13,7 @@ #include <asm/sigcontext.h> #include "../../kselftest.h" +#include "rdvl.h" int main(int argc, char **argv) { @@ -25,7 +26,7 @@ int main(int argc, char **argv) ksft_set_plan(2); if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) - ksft_exit_skip("SVE not available"); + ksft_exit_skip("SVE not available\n"); /* * Enumerate up to SVE_VQ_MAX vector lengths @@ -38,6 +39,10 @@ int main(int argc, char **argv) vl &= PR_SVE_VL_LEN_MASK; + if (rdvl_sve() != vl) + ksft_exit_fail_msg("PR_SVE_SET_VL reports %d, RDVL %d\n", + vl, rdvl_sve()); + if (!sve_vl_valid(vl)) ksft_exit_fail_msg("VL %d invalid\n", vl); vq = sve_vq_from_vl(vl); diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c new file mode 100644 index 000000000000..c02071dcb563 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -0,0 +1,593 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 ARM Limited. + * Original author: Mark Brown <broonie@kernel.org> + */ +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/auxv.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <asm/sigcontext.h> +#include <asm/hwcap.h> + +#include "../../kselftest.h" +#include "rdvl.h" + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +#define ARCH_MIN_VL SVE_VL_MIN + +struct vec_data { + const char *name; + unsigned long hwcap_type; + unsigned long hwcap; + const char *rdvl_binary; + int (*rdvl)(void); + + int prctl_get; + int prctl_set; + const char *default_vl_file; + + int default_vl; + int min_vl; + int max_vl; +}; + + +static struct vec_data vec_data[] = { + { + .name = "SVE", + .hwcap_type = AT_HWCAP, + .hwcap = HWCAP_SVE, + .rdvl = rdvl_sve, + .rdvl_binary = "./rdvl-sve", + .prctl_get = PR_SVE_GET_VL, + .prctl_set = PR_SVE_SET_VL, + .default_vl_file = "/proc/sys/abi/sve_default_vector_length", + }, +}; + +static int stdio_read_integer(FILE *f, const char *what, int *val) +{ + int n = 0; + int ret; + + ret = fscanf(f, "%d%*1[\n]%n", val, &n); + if (ret < 1 || n < 1) { + ksft_print_msg("failed to parse integer from %s\n", what); + return -1; + } + + return 0; +} + +/* Start a new process and return the vector length it sees */ +static int get_child_rdvl(struct vec_data *data) +{ + FILE *out; + int pipefd[2]; + pid_t pid, child; + int read_vl, ret; + + ret = pipe(pipefd); + if (ret == -1) { + ksft_print_msg("pipe() failed: %d (%s)\n", + errno, strerror(errno)); + return -1; + } + + fflush(stdout); + + child = fork(); + if (child == -1) { + ksft_print_msg("fork() failed: %d (%s)\n", + errno, strerror(errno)); + close(pipefd[0]); + close(pipefd[1]); + return -1; + } + + /* Child: put vector length on the pipe */ + if (child == 0) { + /* + * Replace stdout with the pipe, errors to stderr from + * here as kselftest prints to stdout. + */ + ret = dup2(pipefd[1], 1); + if (ret == -1) { + fprintf(stderr, "dup2() %d\n", errno); + exit(EXIT_FAILURE); + } + + /* exec() a new binary which puts the VL on stdout */ + ret = execl(data->rdvl_binary, data->rdvl_binary, NULL); + fprintf(stderr, "execl(%s) failed: %d\n", + data->rdvl_binary, errno, strerror(errno)); + + exit(EXIT_FAILURE); + } + + close(pipefd[1]); + + /* Parent; wait for the exit status from the child & verify it */ + do { + pid = wait(&ret); + if (pid == -1) { + ksft_print_msg("wait() failed: %d (%s)\n", + errno, strerror(errno)); + close(pipefd[0]); + return -1; + } + } while (pid != child); + + assert(pid == child); + + if (!WIFEXITED(ret)) { + ksft_print_msg("child exited abnormally\n"); + close(pipefd[0]); + return -1; + } + + if (WEXITSTATUS(ret) != 0) { + ksft_print_msg("child returned error %d\n", + WEXITSTATUS(ret)); + close(pipefd[0]); + return -1; + } + + out = fdopen(pipefd[0], "r"); + if (!out) { + ksft_print_msg("failed to open child stdout\n"); + close(pipefd[0]); + return -1; + } + + ret = stdio_read_integer(out, "child", &read_vl); + fclose(out); + if (ret != 0) + return ret; + + return read_vl; +} + +static int file_read_integer(const char *name, int *val) +{ + FILE *f; + int ret; + + f = fopen(name, "r"); + if (!f) { + ksft_test_result_fail("Unable to open %s: %d (%s)\n", + name, errno, + strerror(errno)); + return -1; + } + + ret = stdio_read_integer(f, name, val); + fclose(f); + + return ret; +} + +static int file_write_integer(const char *name, int val) +{ + FILE *f; + int ret; + + f = fopen(name, "w"); + if (!f) { + ksft_test_result_fail("Unable to open %s: %d (%s)\n", + name, errno, + strerror(errno)); + return -1; + } + + fprintf(f, "%d", val); + fclose(f); + if (ret < 0) { + ksft_test_result_fail("Error writing %d to %s\n", + val, name); + return -1; + } + + return 0; +} + +/* + * Verify that we can read the default VL via proc, checking that it + * is set in a freshly spawned child. + */ +static void proc_read_default(struct vec_data *data) +{ + int default_vl, child_vl, ret; + + ret = file_read_integer(data->default_vl_file, &default_vl); + if (ret != 0) + return; + + /* Is this the actual default seen by new processes? */ + child_vl = get_child_rdvl(data); + if (child_vl != default_vl) { + ksft_test_result_fail("%s is %d but child VL is %d\n", + data->default_vl_file, + default_vl, child_vl); + return; + } + + ksft_test_result_pass("%s default vector length %d\n", data->name, + default_vl); + data->default_vl = default_vl; +} + +/* Verify that we can write a minimum value and have it take effect */ +static void proc_write_min(struct vec_data *data) +{ + int ret, new_default, child_vl; + + if (geteuid() != 0) { + ksft_test_result_skip("Need to be root to write to /proc\n"); + return; + } + + ret = file_write_integer(data->default_vl_file, ARCH_MIN_VL); + if (ret != 0) + return; + + /* What was the new value? */ + ret = file_read_integer(data->default_vl_file, &new_default); + if (ret != 0) + return; + + /* Did it take effect in a new process? */ + child_vl = get_child_rdvl(data); + if (child_vl != new_default) { + ksft_test_result_fail("%s is %d but child VL is %d\n", + data->default_vl_file, + new_default, child_vl); + return; + } + + ksft_test_result_pass("%s minimum vector length %d\n", data->name, + new_default); + data->min_vl = new_default; + + file_write_integer(data->default_vl_file, data->default_vl); +} + +/* Verify that we can write a maximum value and have it take effect */ +static void proc_write_max(struct vec_data *data) +{ + int ret, new_default, child_vl; + + if (geteuid() != 0) { + ksft_test_result_skip("Need to be root to write to /proc\n"); + return; + } + + /* -1 is accepted by the /proc interface as the maximum VL */ + ret = file_write_integer(data->default_vl_file, -1); + if (ret != 0) + return; + + /* What was the new value? */ + ret = file_read_integer(data->default_vl_file, &new_default); + if (ret != 0) + return; + + /* Did it take effect in a new process? */ + child_vl = get_child_rdvl(data); + if (child_vl != new_default) { + ksft_test_result_fail("%s is %d but child VL is %d\n", + data->default_vl_file, + new_default, child_vl); + return; + } + + ksft_test_result_pass("%s maximum vector length %d\n", data->name, + new_default); + data->max_vl = new_default; + + file_write_integer(data->default_vl_file, data->default_vl); +} + +/* Can we read back a VL from prctl? */ +static void prctl_get(struct vec_data *data) +{ + int ret; + + ret = prctl(data->prctl_get); + if (ret == -1) { + ksft_test_result_fail("%s prctl() read failed: %d (%s)\n", + data->name, errno, strerror(errno)); + return; + } + + /* Mask out any flags */ + ret &= PR_SVE_VL_LEN_MASK; + + /* Is that what we can read back directly? */ + if (ret == data->rdvl()) + ksft_test_result_pass("%s current VL is %d\n", + data->name, ret); + else + ksft_test_result_fail("%s prctl() VL %d but RDVL is %d\n", + data->name, ret, data->rdvl()); +} + +/* Does the prctl let us set the VL we already have? */ +static void prctl_set_same(struct vec_data *data) +{ + int cur_vl = data->rdvl(); + int ret; + + ret = prctl(data->prctl_set, cur_vl); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed: %d (%s)\n", + data->name, errno, strerror(errno)); + return; + } + + if (cur_vl != data->rdvl()) + ksft_test_result_pass("%s current VL is %d\n", + data->name, ret); + else + ksft_test_result_fail("%s prctl() VL %d but RDVL is %d\n", + data->name, ret, data->rdvl()); +} + +/* Can we set a new VL for this process? */ +static void prctl_set(struct vec_data *data) +{ + int ret; + + if (data->min_vl == data->max_vl) { + ksft_test_result_skip("%s only one VL supported\n", + data->name); + return; + } + + /* Try to set the minimum VL */ + ret = prctl(data->prctl_set, data->min_vl); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->min_vl, + errno, strerror(errno)); + return; + } + + if ((ret & PR_SVE_VL_LEN_MASK) != data->min_vl) { + ksft_test_result_fail("%s prctl set %d but return value is %d\n", + data->name, data->min_vl, data->rdvl()); + return; + } + + if (data->rdvl() != data->min_vl) { + ksft_test_result_fail("%s set %d but RDVL is %d\n", + data->name, data->min_vl, data->rdvl()); + return; + } + + /* Try to set the maximum VL */ + ret = prctl(data->prctl_set, data->max_vl); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->max_vl, + errno, strerror(errno)); + return; + } + + if ((ret & PR_SVE_VL_LEN_MASK) != data->max_vl) { + ksft_test_result_fail("%s prctl() set %d but return value is %d\n", + data->name, data->max_vl, data->rdvl()); + return; + } + + /* The _INHERIT flag should not be present when we read the VL */ + ret = prctl(data->prctl_get); + if (ret == -1) { + ksft_test_result_fail("%s prctl() read failed: %d (%s)\n", + data->name, errno, strerror(errno)); + return; + } + + if (ret & PR_SVE_VL_INHERIT) { + ksft_test_result_fail("%s prctl() reports _INHERIT\n", + data->name); + return; + } + + ksft_test_result_pass("%s prctl() set min/max\n", data->name); +} + +/* If we didn't request it a new VL shouldn't affect the child */ +static void prctl_set_no_child(struct vec_data *data) +{ + int ret, child_vl; + + if (data->min_vl == data->max_vl) { + ksft_test_result_skip("%s only one VL supported\n", + data->name); + return; + } + + ret = prctl(data->prctl_set, data->min_vl); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->min_vl, + errno, strerror(errno)); + return; + } + + /* Ensure the default VL is different */ + ret = file_write_integer(data->default_vl_file, data->max_vl); + if (ret != 0) + return; + + /* Check that the child has the default we just set */ + child_vl = get_child_rdvl(data); + if (child_vl != data->max_vl) { + ksft_test_result_fail("%s is %d but child VL is %d\n", + data->default_vl_file, + data->max_vl, child_vl); + return; + } + + ksft_test_result_pass("%s vector length used default\n", data->name); + + file_write_integer(data->default_vl_file, data->default_vl); +} + +/* If we didn't request it a new VL shouldn't affect the child */ +static void prctl_set_for_child(struct vec_data *data) +{ + int ret, child_vl; + + if (data->min_vl == data->max_vl) { + ksft_test_result_skip("%s only one VL supported\n", + data->name); + return; + } + + ret = prctl(data->prctl_set, data->min_vl | PR_SVE_VL_INHERIT); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->min_vl, + errno, strerror(errno)); + return; + } + + /* The _INHERIT flag should be present when we read the VL */ + ret = prctl(data->prctl_get); + if (ret == -1) { + ksft_test_result_fail("%s prctl() read failed: %d (%s)\n", + data->name, errno, strerror(errno)); + return; + } + if (!(ret & PR_SVE_VL_INHERIT)) { + ksft_test_result_fail("%s prctl() does not report _INHERIT\n", + data->name); + return; + } + + /* Ensure the default VL is different */ + ret = file_write_integer(data->default_vl_file, data->max_vl); + if (ret != 0) + return; + + /* Check that the child inherited our VL */ + child_vl = get_child_rdvl(data); + if (child_vl != data->min_vl) { + ksft_test_result_fail("%s is %d but child VL is %d\n", + data->default_vl_file, + data->min_vl, child_vl); + return; + } + + ksft_test_result_pass("%s vector length was inherited\n", data->name); + + file_write_integer(data->default_vl_file, data->default_vl); +} + +/* _ONEXEC takes effect only in the child process */ +static void prctl_set_onexec(struct vec_data *data) +{ + int ret, child_vl; + + if (data->min_vl == data->max_vl) { + ksft_test_result_skip("%s only one VL supported\n", + data->name); + return; + } + + /* Set a known value for the default and our current VL */ + ret = file_write_integer(data->default_vl_file, data->max_vl); + if (ret != 0) + return; + + ret = prctl(data->prctl_set, data->max_vl); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->min_vl, + errno, strerror(errno)); + return; + } + + /* Set a different value for the child to have on exec */ + ret = prctl(data->prctl_set, data->min_vl | PR_SVE_SET_VL_ONEXEC); + if (ret < 0) { + ksft_test_result_fail("%s prctl set failed for %d: %d (%s)\n", + data->name, data->min_vl, + errno, strerror(errno)); + return; + } + + /* Our current VL should stay the same */ + if (data->rdvl() != data->max_vl) { + ksft_test_result_fail("%s VL changed by _ONEXEC prctl()\n", + data->name); + return; + } + + /* Check that the child inherited our VL */ + child_vl = get_child_rdvl(data); + if (child_vl != data->min_vl) { + ksft_test_result_fail("Set %d _ONEXEC but child VL is %d\n", + data->min_vl, child_vl); + return; + } + + ksft_test_result_pass("%s vector length set on exec\n", data->name); + + file_write_integer(data->default_vl_file, data->default_vl); +} + +typedef void (*test_type)(struct vec_data *); + +static const test_type tests[] = { + /* + * The default/min/max tests must be first and in this order + * to provide data for other tests. + */ + proc_read_default, + proc_write_min, + proc_write_max, + + prctl_get, + prctl_set, + prctl_set_no_child, + prctl_set_for_child, + prctl_set_onexec, +}; + +int main(void) +{ + int i, j; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests) * ARRAY_SIZE(vec_data)); + + for (i = 0; i < ARRAY_SIZE(vec_data); i++) { + struct vec_data *data = &vec_data[i]; + unsigned long supported; + + supported = getauxval(data->hwcap_type) & data->hwcap; + + for (j = 0; j < ARRAY_SIZE(tests); j++) { + if (supported) + tests[j](data); + else + ksft_test_result_skip("%s not supported\n", + data->name); + } + } + + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index bc3ac63f3314..d1fe4ddf1669 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -1,4 +1,5 @@ check_buffer_fill +check_gcr_el1_cswitch check_tags_inclusion check_child_memory check_mmap_options diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index f50ac31920d1..0328a1e08f65 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -298,7 +298,7 @@ int mte_default_setup(void) int ret; if (!(hwcaps2 & HWCAP2_MTE)) { - ksft_print_msg("FAIL: MTE features unavailable\n"); + ksft_print_msg("SKIP: MTE features unavailable\n"); return KSFT_SKIP; } /* Get current mte mode */ diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index 592fe538506e..b743daa772f5 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -25,13 +25,15 @@ do { \ unsigned long hwcaps = getauxval(AT_HWCAP); \ /* data key instructions are not in NOP space. This prevents a SIGILL */ \ - ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); \ + if (!(hwcaps & HWCAP_PACA)) \ + SKIP(return, "PAUTH not enabled"); \ } while (0) #define ASSERT_GENERIC_PAUTH_ENABLED() \ do { \ unsigned long hwcaps = getauxval(AT_HWCAP); \ /* generic key instructions are not in NOP space. This prevents a SIGILL */ \ - ASSERT_NE(0, hwcaps & HWCAP_PACG) TH_LOG("Generic PAUTH not enabled"); \ + if (!(hwcaps & HWCAP_PACG)) \ + SKIP(return, "Generic PAUTH not enabled"); \ } while (0) void sign_specific(struct signatures *sign, size_t val) @@ -256,7 +258,7 @@ TEST(single_thread_different_keys) unsigned long hwcaps = getauxval(AT_HWCAP); /* generic and data key instructions are not in NOP space. This prevents a SIGILL */ - ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); + ASSERT_PAUTH_ENABLED(); if (!(hwcaps & HWCAP_PACG)) { TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks"); nkeys = NKEYS - 1; @@ -299,7 +301,7 @@ TEST(exec_changed_keys) unsigned long hwcaps = getauxval(AT_HWCAP); /* generic and data key instructions are not in NOP space. This prevents a SIGILL */ - ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); + ASSERT_PAUTH_ENABLED(); if (!(hwcaps & HWCAP_PACG)) { TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks"); nkeys = NKEYS - 1; diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 78c902045ca7..c1742755abb9 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only mangle_* fake_sigreturn_* +sve_* !*.[ch] diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index f96baf1cef1a..ebe8694dbef0 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -33,10 +33,12 @@ */ enum { FSSBS_BIT, + FSVE_BIT, FMAX_END }; #define FEAT_SSBS (1UL << FSSBS_BIT) +#define FEAT_SVE (1UL << FSVE_BIT) /* * A descriptor used to describe and configure a test case. diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 2de6e5ed5e25..22722abc9dfa 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -26,6 +26,7 @@ static int sig_copyctx = SIGTRAP; static char const *const feats_names[FMAX_END] = { " SSBS ", + " SVE ", }; #define MAX_FEATS_SZ 128 @@ -263,16 +264,21 @@ int test_init(struct tdescr *td) */ if (getauxval(AT_HWCAP) & HWCAP_SSBS) td->feats_supported |= FEAT_SSBS; - if (feats_ok(td)) + if (getauxval(AT_HWCAP) & HWCAP_SVE) + td->feats_supported |= FEAT_SVE; + if (feats_ok(td)) { fprintf(stderr, "Required Features: [%s] supported\n", feats_to_string(td->feats_required & td->feats_supported)); - else + } else { fprintf(stderr, "Required Features: [%s] NOT supported\n", feats_to_string(td->feats_required & ~td->feats_supported)); + td->result = KSFT_SKIP; + return 0; + } } /* Perform test specific additional initialization */ diff --git a/tools/testing/selftests/arm64/signal/testcases/TODO b/tools/testing/selftests/arm64/signal/testcases/TODO new file mode 100644 index 000000000000..110ff9fd195d --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/TODO @@ -0,0 +1,2 @@ +- Validate that register contents are saved and restored as expected. +- Support and validate extra_context. diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c new file mode 100644 index 000000000000..bb50b5adbf10 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 ARM Limited + * + * Attempt to change the SVE vector length in a signal hander, this is not + * supported and is expected to segfault. + */ + +#include <signal.h> +#include <ucontext.h> +#include <sys/prctl.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +struct fake_sigframe sf; +static unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls = 0; + +static bool sve_get_vls(struct tdescr *td) +{ + int vq, vl; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(PR_SVE_SET_VL, vq * 16); + if (vl == -1) + return false; + + vl &= PR_SVE_VL_LEN_MASK; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + /* We need at least two VLs */ + if (nvls < 2) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return false; + } + + return true; +} + +static int fake_sigreturn_sve_change_vl(struct tdescr *td, + siginfo_t *si, ucontext_t *uc) +{ + size_t resv_sz, offset; + struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); + struct sve_context *sve; + + /* Get a signal context with a SVE frame in it */ + if (!get_current_context(td, &sf.uc)) + return 1; + + resv_sz = GET_SF_RESV_SIZE(sf); + head = get_header(head, SVE_MAGIC, resv_sz, &offset); + if (!head) { + fprintf(stderr, "No SVE context\n"); + return 1; + } + + if (head->size != sizeof(struct sve_context)) { + fprintf(stderr, "SVE register state active, skipping\n"); + return 1; + } + + sve = (struct sve_context *)head; + + /* No changes are supported; init left us at minimum VL so go to max */ + fprintf(stderr, "Attempting to change VL from %d to %d\n", + sve->vl, vls[0]); + sve->vl = vls[0]; + + fake_sigreturn(&sf, sizeof(sf), 0); + + return 1; +} + +struct tdescr tde = { + .name = "FAKE_SIGRETURN_SVE_CHANGE", + .descr = "Attempt to change SVE VL", + .feats_required = FEAT_SVE, + .sig_ok = SIGSEGV, + .timeout = 3, + .init = sve_get_vls, + .run = fake_sigreturn_sve_change_vl, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c new file mode 100644 index 000000000000..4b2418aa08a9 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 ARM Limited + * + * Verify that the SVE register context in signal frames is set up as + * expected. + */ + +#include <signal.h> +#include <ucontext.h> +#include <sys/prctl.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +struct fake_sigframe sf; +static unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls = 0; + +static bool sve_get_vls(struct tdescr *td) +{ + int vq, vl; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(PR_SVE_SET_VL, vq * 16); + if (vl == -1) + return false; + + vl &= PR_SVE_VL_LEN_MASK; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + /* We need at least one VL */ + if (nvls < 1) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return false; + } + + return true; +} + +static void setup_sve_regs(void) +{ + /* RDVL x16, #1 so we should have SVE regs; real data is TODO */ + asm volatile(".inst 0x04bf5030" : : : "x16" ); +} + +static int do_one_sve_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, + unsigned int vl) +{ + size_t resv_sz, offset; + struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); + struct sve_context *sve; + + fprintf(stderr, "Testing VL %d\n", vl); + + if (prctl(PR_SVE_SET_VL, vl) == -1) { + fprintf(stderr, "Failed to set VL\n"); + return 1; + } + + /* + * Get a signal context which should have a SVE frame and registers + * in it. + */ + setup_sve_regs(); + if (!get_current_context(td, &sf.uc)) + return 1; + + resv_sz = GET_SF_RESV_SIZE(sf); + head = get_header(head, SVE_MAGIC, resv_sz, &offset); + if (!head) { + fprintf(stderr, "No SVE context\n"); + return 1; + } + + sve = (struct sve_context *)head; + if (sve->vl != vl) { + fprintf(stderr, "Got VL %d, expected %d\n", sve->vl, vl); + return 1; + } + + /* The actual size validation is done in get_current_context() */ + fprintf(stderr, "Got expected size %u and VL %d\n", + head->size, sve->vl); + + return 0; +} + +static int sve_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + int i; + + for (i = 0; i < nvls; i++) { + /* + * TODO: the signal test helpers can't currently cope + * with signal frames bigger than struct sigcontext, + * skip VLs that will trigger that. + */ + if (vls[i] > 64) + continue; + + if (do_one_sve_vl(td, si, uc, vls[i])) + return 1; + } + + td->pass = 1; + + return 0; +} + +struct tdescr tde = { + .name = "SVE registers", + .descr = "Check that we get the right SVE registers reported", + .feats_required = FEAT_SVE, + .timeout = 3, + .init = sve_get_vls, + .run = sve_regs, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_vl.c b/tools/testing/selftests/arm64/signal/testcases/sve_vl.c new file mode 100644 index 000000000000..92904653add1 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/sve_vl.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 ARM Limited + * + * Check that the SVE vector length reported in signal contexts is the + * expected one. + */ + +#include <signal.h> +#include <ucontext.h> +#include <sys/prctl.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +struct fake_sigframe sf; +unsigned int vl; + +static bool get_sve_vl(struct tdescr *td) +{ + int ret = prctl(PR_SVE_GET_VL); + if (ret == -1) + return false; + + vl = ret; + + return true; +} + +static int sve_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + size_t resv_sz, offset; + struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); + struct sve_context *sve; + + /* Get a signal context which should have a SVE frame in it */ + if (!get_current_context(td, &sf.uc)) + return 1; + + resv_sz = GET_SF_RESV_SIZE(sf); + head = get_header(head, SVE_MAGIC, resv_sz, &offset); + if (!head) { + fprintf(stderr, "No SVE context\n"); + return 1; + } + sve = (struct sve_context *)head; + + if (sve->vl != vl) { + fprintf(stderr, "sigframe VL %u, expected %u\n", + sve->vl, vl); + return 1; + } else { + fprintf(stderr, "got expected VL %u\n", vl); + } + + td->pass = 1; + + return 0; +} + +struct tdescr tde = { + .name = "SVE VL", + .descr = "Check that we get the right SVE VL reported", + .feats_required = FEAT_SVE, + .timeout = 3, + .init = get_sve_vl, + .run = sve_vl, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 61ebcdf63831..8c2a57fc2f9c 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -50,12 +50,38 @@ bool validate_extra_context(struct extra_context *extra, char **err) return true; } +bool validate_sve_context(struct sve_context *sve, char **err) +{ + /* Size will be rounded up to a multiple of 16 bytes */ + size_t regs_size + = ((SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve->vl)) + 15) / 16) * 16; + + if (!sve || !err) + return false; + + /* Either a bare sve_context or a sve_context followed by regs data */ + if ((sve->head.size != sizeof(struct sve_context)) && + (sve->head.size != regs_size)) { + *err = "bad size for SVE context"; + return false; + } + + if (!sve_vl_valid(sve->vl)) { + *err = "SVE VL invalid"; + + return false; + } + + return true; +} + bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) { bool terminated = false; size_t offs = 0; int flags = 0; struct extra_context *extra = NULL; + struct sve_context *sve = NULL; struct _aarch64_ctx *head = (struct _aarch64_ctx *)uc->uc_mcontext.__reserved; @@ -90,9 +116,8 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) case SVE_MAGIC: if (flags & SVE_CTX) *err = "Multiple SVE_MAGIC"; - else if (head->size != - sizeof(struct sve_context)) - *err = "Bad size for sve_context"; + /* Size is validated in validate_sve_context() */ + sve = (struct sve_context *)head; flags |= SVE_CTX; break; case EXTRA_MAGIC: @@ -137,6 +162,9 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (flags & EXTRA_CTX) if (!validate_extra_context(extra, err)) return false; + if (flags & SVE_CTX) + if (!validate_sve_context(sve, err)) + return false; head = GET_RESV_NEXT_HEAD(head); } diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4866f6a21901..433f8bef261e 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -10,6 +10,7 @@ FEATURE-DUMP.libbpf fixdep test_dev_cgroup /test_progs* +!test_progs.h test_verifier_log feature test_sock @@ -22,7 +23,6 @@ test_skb_cgroup_id_user test_cgroup_storage test_flow_dissector flow_dissector_load -test_netcnt test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user @@ -30,10 +30,13 @@ test_sysctl xdping test_cpp *.skel.h +*.lskel.h /no_alu32 /bpf_gcc /tools /runqslower /bench *.ko +*.tmp xdpxceiver +xdp_redirect_multi diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 511259c2c6c5..799b88152e9e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -38,7 +38,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_verifier_log test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sysctl \ + test_tcpnotify_user test_sysctl \ test_progs-no_alu32 # Also test bpf-gcc, if present @@ -54,6 +54,7 @@ TEST_FILES = xsk_prereqs.sh \ # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ + test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ test_offload.py \ @@ -78,13 +79,13 @@ TEST_PROGS := test_kmod.sh \ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ - test_xdp_vlan.sh + test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xdpxceiver + xdpxceiver xdp_redirect_multi TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read @@ -186,6 +187,8 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ cp $(SCRATCH_DIR)/runqslower $@ +TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL) + $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c @@ -196,7 +199,6 @@ $(OUTPUT)/test_sockmap: cgroup_helpers.c $(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c -$(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c @@ -312,6 +314,10 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h +LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ + test_ksyms_module.c test_ringbuf.c atomics.c trace_printk.c +SKEL_BLACKLIST += $$(LSKELS) + test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o @@ -339,6 +345,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) +TRUNNER_BPF_LSKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS)) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) @@ -368,7 +375,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) + $(wildcard $(BPFDIR)/bpf_*.h) \ + | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) @@ -380,6 +388,14 @@ $(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ +$(TRUNNER_BPF_LSKELS): %.lskel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) + $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ + $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) @@ -409,6 +425,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_LSKELS) \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) @@ -516,6 +533,6 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ - $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) + $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h no_alu32 bpf_gcc bpf_testmod.ko) .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index ccf260021e83..eb6a4fea8c79 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -52,7 +52,8 @@ $(OUTPUT)%.$2: $(OUTPUT)%.rst ifndef RST2MAN_DEP $$(error "rst2man not found, but required to generate man pages") endif - $$(QUIET_GEN)rst2man $$< > $$@ + $$(QUIET_GEN)rst2man --exit-status=1 $$< > $$@.tmp + $$(QUIET_GEN)mv $$@.tmp $$@ docs-clean-$1: $$(call QUIET_CLEAN, eBPF_$1-manpage) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 3353778c30f8..9b17f2867488 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -19,6 +19,13 @@ the CI. It builds the kernel (without overwriting your existing Kconfig), recomp bpf selftests, runs them (by default ``tools/testing/selftests/bpf/test_progs``) and saves the resulting output (by default in ``~/.bpf_selftests``). +Script dependencies: +- clang (preferably built from sources, https://github.com/llvm/llvm-project); +- pahole (preferably built from sources, https://git.kernel.org/pub/scm/devel/pahole/pahole.git/); +- qemu; +- docutils (for ``rst2man``); +- libcap-devel. + For more information on about using the script, run: .. code-block:: console @@ -202,3 +209,22 @@ generate valid BTF information for weak variables. Please make sure you use Clang that contains the fix. __ https://reviews.llvm.org/D100362 + +Clang relocation changes +======================== + +Clang 13 patch `clang reloc patch`_ made some changes on relocations such +that existing relocation types are broken into more types and +each new type corresponds to only one way to resolve relocation. +See `kernel llvm reloc`_ for more explanation and some examples. +Using clang 13 to compile old libbpf which has static linker support, +there will be a compilation failure:: + + libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.o + +Here, ``type 2`` refers to new relocation type ``R_BPF_64_ABS64``. +To fix this issue, user newer libbpf. + +.. Links +.. _clang reloc patch: https://reviews.llvm.org/D102712 +.. _kernel llvm reloc: /Documentation/bpf/llvm_reloc.rst diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 332ed2f7b402..6ea15b93a2f8 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -43,6 +43,7 @@ void setup_libbpf() { int err; + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); err = bump_memlock_rlimit(); diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c index a967674098ad..c7ec114eca56 100644 --- a/tools/testing/selftests/bpf/benchs/bench_rename.c +++ b/tools/testing/selftests/bpf/benchs/bench_rename.c @@ -65,7 +65,7 @@ static void attach_bpf(struct bpf_program *prog) struct bpf_link *link; link = bpf_program__attach(prog); - if (IS_ERR(link)) { + if (!link) { fprintf(stderr, "failed to attach program!\n"); exit(1); } diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c index bde6c9d4cbd4..d167bffac679 100644 --- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -181,7 +181,7 @@ static void ringbuf_libbpf_setup() } link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); - if (IS_ERR(link)) { + if (!link) { fprintf(stderr, "failed to attach program!\n"); exit(1); } @@ -271,7 +271,7 @@ static void ringbuf_custom_setup() } link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); - if (IS_ERR(link)) { + if (!link) { fprintf(stderr, "failed to attach program\n"); exit(1); } @@ -430,7 +430,7 @@ static void perfbuf_libbpf_setup() } link = bpf_program__attach(ctx->skel->progs.bench_perfbuf); - if (IS_ERR(link)) { + if (!link) { fprintf(stderr, "failed to attach program\n"); exit(1); } diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 2a0b6c9885a4..f41a491a8cc0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -60,7 +60,7 @@ static void attach_bpf(struct bpf_program *prog) struct bpf_link *link; link = bpf_program__attach(prog); - if (IS_ERR(link)) { + if (!link) { fprintf(stderr, "failed to attach program!\n"); exit(1); } diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 029589c008c9..b1ede6f0b821 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -12,6 +12,10 @@ SEC("struct_ops/"#name) \ BPF_PROG(name, args) +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + #define tcp_jiffies32 ((__u32)bpf_jiffies64()) struct sock_common { @@ -27,6 +31,7 @@ enum sk_pacing { struct sock { struct sock_common __sk_common; +#define sk_state __sk_common.skc_state unsigned long sk_pacing_rate; __u32 sk_pacing_status; /* see enum sk_pacing */ } __attribute__((preserve_access_index)); @@ -203,6 +208,20 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); } +static __always_inline bool tcp_cc_eq(const char *a, const char *b) +{ + int i; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (a[i] != b[i]) + return false; + if (!a[i]) + break; + } + + return true; +} + extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 033051717ba5..f3daa44a8266 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -12,27 +12,36 @@ #include <unistd.h> #include <ftw.h> - #include "cgroup_helpers.h" /* * To avoid relying on the system setup, when setup_cgroup_env is called - * we create a new mount namespace, and cgroup namespace. The cgroup2 - * root is mounted at CGROUP_MOUNT_PATH - * - * Unfortunately, most people don't have cgroupv2 enabled at this point in time. - * It's easier to create our own mount namespace and manage it ourselves. + * we create a new mount namespace, and cgroup namespace. The cgroupv2 + * root is mounted at CGROUP_MOUNT_PATH. Unfortunately, most people don't + * have cgroupv2 enabled at this point in time. It's easier to create our + * own mount namespace and manage it ourselves. We assume /mnt exists. * - * We assume /mnt exists. + * Related cgroupv1 helpers are named *classid*(), since we only use the + * net_cls controller for tagging net_cls.classid. We assume the default + * mount under /sys/fs/cgroup/net_cls, which should be the case for the + * vast majority of users. */ #define WALK_FD_LIMIT 16 + #define CGROUP_MOUNT_PATH "/mnt" +#define CGROUP_MOUNT_DFLT "/sys/fs/cgroup" +#define NETCLS_MOUNT_PATH CGROUP_MOUNT_DFLT "/net_cls" #define CGROUP_WORK_DIR "/cgroup-test-work-dir" + #define format_cgroup_path(buf, path) \ snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \ CGROUP_WORK_DIR, path) +#define format_classid_path(buf) \ + snprintf(buf, sizeof(buf), "%s%s", NETCLS_MOUNT_PATH, \ + CGROUP_WORK_DIR) + /** * enable_all_controllers() - Enable all available cgroup v2 controllers * @@ -139,8 +148,7 @@ static int nftwfunc(const char *filename, const struct stat *statptr, return 0; } - -static int join_cgroup_from_top(char *cgroup_path) +static int join_cgroup_from_top(const char *cgroup_path) { char cgroup_procs_path[PATH_MAX + 1]; pid_t pid = getpid(); @@ -313,3 +321,114 @@ int cgroup_setup_and_join(const char *path) { } return cg_fd; } + +/** + * setup_classid_environment() - Setup the cgroupv1 net_cls environment + * + * After calling this function, cleanup_classid_environment should be called + * once testing is complete. + * + * This function will print an error to stderr and return 1 if it is unable + * to setup the cgroup environment. If setup is successful, 0 is returned. + */ +int setup_classid_environment(void) +{ + char cgroup_workdir[PATH_MAX + 1]; + + format_classid_path(cgroup_workdir); + + if (mount("tmpfs", CGROUP_MOUNT_DFLT, "tmpfs", 0, NULL) && + errno != EBUSY) { + log_err("mount cgroup base"); + return 1; + } + + if (mkdir(NETCLS_MOUNT_PATH, 0777) && errno != EEXIST) { + log_err("mkdir cgroup net_cls"); + return 1; + } + + if (mount("net_cls", NETCLS_MOUNT_PATH, "cgroup", 0, "net_cls") && + errno != EBUSY) { + log_err("mount cgroup net_cls"); + return 1; + } + + cleanup_classid_environment(); + + if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) { + log_err("mkdir cgroup work dir"); + return 1; + } + + return 0; +} + +/** + * set_classid() - Set a cgroupv1 net_cls classid + * @id: the numeric classid + * + * Writes the passed classid into the cgroup work dir's net_cls.classid + * file in order to later on trigger socket tagging. + * + * On success, it returns 0, otherwise on failure it returns 1. If there + * is a failure, it prints the error to stderr. + */ +int set_classid(unsigned int id) +{ + char cgroup_workdir[PATH_MAX - 42]; + char cgroup_classid_path[PATH_MAX + 1]; + int fd, rc = 0; + + format_classid_path(cgroup_workdir); + snprintf(cgroup_classid_path, sizeof(cgroup_classid_path), + "%s/net_cls.classid", cgroup_workdir); + + fd = open(cgroup_classid_path, O_WRONLY); + if (fd < 0) { + log_err("Opening cgroup classid: %s", cgroup_classid_path); + return 1; + } + + if (dprintf(fd, "%u\n", id) < 0) { + log_err("Setting cgroup classid"); + rc = 1; + } + + close(fd); + return rc; +} + +/** + * join_classid() - Join a cgroupv1 net_cls classid + * + * This function expects the cgroup work dir to be already created, as we + * join it here. This causes the process sockets to be tagged with the given + * net_cls classid. + * + * On success, it returns 0, otherwise on failure it returns 1. + */ +int join_classid(void) +{ + char cgroup_workdir[PATH_MAX + 1]; + + format_classid_path(cgroup_workdir); + return join_cgroup_from_top(cgroup_workdir); +} + +/** + * cleanup_classid_environment() - Cleanup the cgroupv1 net_cls environment + * + * At call time, it moves the calling process to the root cgroup, and then + * runs the deletion process. + * + * On failure, it will print an error to stderr, and try to continue. + */ +void cleanup_classid_environment(void) +{ + char cgroup_workdir[PATH_MAX + 1]; + + format_classid_path(cgroup_workdir); + join_cgroup_from_top(NETCLS_MOUNT_PATH); + nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT); +} diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index 5fe3d88e4f0d..629da3854b3e 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_HELPERS_H #define __CGROUP_HELPERS_H + #include <errno.h> #include <string.h> @@ -8,12 +9,21 @@ #define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) - +/* cgroupv2 related */ int cgroup_setup_and_join(const char *path); int create_and_get_cgroup(const char *path); +unsigned long long get_cgroup_id(const char *path); + int join_cgroup(const char *path); + int setup_cgroup_environment(void); void cleanup_cgroup_environment(void); -unsigned long long get_cgroup_id(const char *path); -#endif +/* cgroupv1 related */ +int set_classid(unsigned int id); +int join_classid(void); + +int setup_classid_environment(void); +void cleanup_classid_environment(void); + +#endif /* __CGROUP_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h index 81084c1c2c23..0ab1c88041cd 100644 --- a/tools/testing/selftests/bpf/netcnt_common.h +++ b/tools/testing/selftests/bpf/netcnt_common.h @@ -6,19 +6,39 @@ #define MAX_PERCPU_PACKETS 32 -struct percpu_net_cnt { - __u64 packets; - __u64 bytes; +/* sizeof(struct bpf_local_storage_elem): + * + * It really is about 128 bytes on x86_64, but allocate more to account for + * possible layout changes, different architectures, etc. + * The kernel will wrap up to PAGE_SIZE internally anyway. + */ +#define SIZEOF_BPF_LOCAL_STORAGE_ELEM 256 - __u64 prev_ts; +/* Try to estimate kernel's BPF_LOCAL_STORAGE_MAX_VALUE_SIZE: */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE (0xFFFF - \ + SIZEOF_BPF_LOCAL_STORAGE_ELEM) - __u64 prev_packets; - __u64 prev_bytes; +#define PCPU_MIN_UNIT_SIZE 32768 + +union percpu_net_cnt { + struct { + __u64 packets; + __u64 bytes; + + __u64 prev_ts; + + __u64 prev_packets; + __u64 prev_bytes; + }; + __u8 data[PCPU_MIN_UNIT_SIZE]; }; -struct net_cnt { - __u64 packets; - __u64 bytes; +union net_cnt { + struct { + __u64 packets; + __u64 bytes; + }; + __u8 data[BPF_LOCAL_STORAGE_MAX_VALUE_SIZE]; }; #endif diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 2060bc122c53..6db1af8fdee7 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -66,17 +66,13 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) +static int __start_server(int type, const struct sockaddr *addr, + socklen_t addrlen, int timeout_ms, bool reuseport) { - struct sockaddr_storage addr = {}; - socklen_t len; + int on = 1; int fd; - if (make_sockaddr(family, addr_str, port, &addr, &len)) - return -1; - - fd = socket(family, type, 0); + fd = socket(addr->sa_family, type, 0); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -85,7 +81,13 @@ int start_server(int family, int type, const char *addr_str, __u16 port, if (settimeo(fd, timeout_ms)) goto error_close; - if (bind(fd, (const struct sockaddr *)&addr, len) < 0) { + if (reuseport && + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { + log_err("Failed to set SO_REUSEPORT"); + return -1; + } + + if (bind(fd, addr, addrlen) < 0) { log_err("Failed to bind socket"); goto error_close; } @@ -104,6 +106,69 @@ error_close: return -1; } +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return __start_server(type, (struct sockaddr *)&addr, + addrlen, timeout_ms, false); +} + +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, unsigned int nr_listens) +{ + struct sockaddr_storage addr; + unsigned int nr_fds = 0; + socklen_t addrlen; + int *fds; + + if (!nr_listens) + return NULL; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return NULL; + + fds = malloc(sizeof(*fds) * nr_listens); + if (!fds) + return NULL; + + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, + timeout_ms, true); + if (fds[0] == -1) + goto close_fds; + nr_fds = 1; + + if (getsockname(fds[0], (struct sockaddr *)&addr, &addrlen)) + goto close_fds; + + for (; nr_fds < nr_listens; nr_fds++) { + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, + addrlen, timeout_ms, true); + if (fds[nr_fds] == -1) + goto close_fds; + } + + return fds; + +close_fds: + free_fds(fds, nr_fds); + return NULL; +} + +void free_fds(int *fds, unsigned int nr_close_fds) +{ + if (fds) { + while (nr_close_fds) + close(fds[--nr_close_fds]); + free(fds); + } +} + int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms) { @@ -143,23 +208,43 @@ error_close: static int connect_fd_to_addr(int fd, const struct sockaddr_storage *addr, - socklen_t addrlen) + socklen_t addrlen, const bool must_fail) { - if (connect(fd, (const struct sockaddr *)addr, addrlen)) { - log_err("Failed to connect to server"); - return -1; + int ret; + + errno = 0; + ret = connect(fd, (const struct sockaddr *)addr, addrlen); + if (must_fail) { + if (!ret) { + log_err("Unexpected success to connect to server"); + return -1; + } + if (errno != EPERM) { + log_err("Unexpected error from connect to server"); + return -1; + } + } else { + if (ret) { + log_err("Failed to connect to server"); + return -1; + } } return 0; } -int connect_to_fd(int server_fd, int timeout_ms) +static const struct network_helper_opts default_opts; + +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; struct sockaddr_in *addr_in; socklen_t addrlen, optlen; int fd, type; + if (!opts) + opts = &default_opts; + optlen = sizeof(type); if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { log_err("getsockopt(SOL_TYPE)"); @@ -179,10 +264,15 @@ int connect_to_fd(int server_fd, int timeout_ms) return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (opts->cc && opts->cc[0] && + setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc, + strlen(opts->cc) + 1)) goto error_close; - if (connect_fd_to_addr(fd, &addr, addrlen)) + if (connect_fd_to_addr(fd, &addr, addrlen, opts->must_fail)) goto error_close; return fd; @@ -192,6 +282,15 @@ error_close: return -1; } +int connect_to_fd(int server_fd, int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; + + return connect_to_fd_opts(server_fd, &opts); +} + int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) { struct sockaddr_storage addr; @@ -205,7 +304,7 @@ int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) return -1; } - if (connect_fd_to_addr(client_fd, &addr, len)) + if (connect_fd_to_addr(client_fd, &addr, len, false)) return -1; return 0; @@ -217,6 +316,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, if (family == AF_INET) { struct sockaddr_in *sin = (void *)addr; + memset(addr, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_port = htons(port); if (addr_str && @@ -230,6 +330,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, } else if (family == AF_INET6) { struct sockaddr_in6 *sin6 = (void *)addr; + memset(addr, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(port); if (addr_str && @@ -243,3 +344,15 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, } return -1; } + +char *ping_command(int family) +{ + if (family == AF_INET6) { + /* On some systems 'ping' doesn't support IPv6, so use ping6 if it is present. */ + if (!system("which ping6 >/dev/null 2>&1")) + return "ping6"; + else + return "ping -6"; + } + return "ping"; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5e0d51c07b63..d198181a5648 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -17,6 +17,12 @@ typedef __u16 __sum16; #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct network_helper_opts { + const char *cc; + int timeout_ms; + bool must_fail; +}; + /* ipv4 test vector */ struct ipv4_packet { struct ethhdr eth; @@ -36,11 +42,17 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, + unsigned int nr_listens); +void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_fd(int server_fd, int timeout_ms); +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); +char *ping_command(int family); #endif diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c index 21efe7bbf10d..ba0e1efe5a45 100644 --- a/tools/testing/selftests/bpf/prog_tests/atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -2,19 +2,19 @@ #include <test_progs.h> -#include "atomics.skel.h" +#include "atomics.lskel.h" static void test_add(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.add); - if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__add__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(add)")) return; - prog_fd = bpf_program__fd(skel->progs.add); + prog_fd = skel->progs.add.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run add", @@ -33,20 +33,20 @@ static void test_add(struct atomics *skel) ASSERT_EQ(skel->data->add_noreturn_value, 3, "add_noreturn_value"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_sub(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.sub); - if (CHECK(IS_ERR(link), "attach(sub)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__sub__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(sub)")) return; - prog_fd = bpf_program__fd(skel->progs.sub); + prog_fd = skel->progs.sub.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run sub", @@ -66,20 +66,20 @@ static void test_sub(struct atomics *skel) ASSERT_EQ(skel->data->sub_noreturn_value, -1, "sub_noreturn_value"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_and(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.and); - if (CHECK(IS_ERR(link), "attach(and)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__and__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(and)")) return; - prog_fd = bpf_program__fd(skel->progs.and); + prog_fd = skel->progs.and.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run and", @@ -94,20 +94,20 @@ static void test_and(struct atomics *skel) ASSERT_EQ(skel->data->and_noreturn_value, 0x010ull << 32, "and_noreturn_value"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_or(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.or); - if (CHECK(IS_ERR(link), "attach(or)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__or__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(or)")) return; - prog_fd = bpf_program__fd(skel->progs.or); + prog_fd = skel->progs.or.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run or", @@ -123,20 +123,20 @@ static void test_or(struct atomics *skel) ASSERT_EQ(skel->data->or_noreturn_value, 0x111ull << 32, "or_noreturn_value"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_xor(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.xor); - if (CHECK(IS_ERR(link), "attach(xor)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__xor__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(xor)")) return; - prog_fd = bpf_program__fd(skel->progs.xor); + prog_fd = skel->progs.xor.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run xor", @@ -151,20 +151,20 @@ static void test_xor(struct atomics *skel) ASSERT_EQ(skel->data->xor_noreturn_value, 0x101ull << 32, "xor_nxoreturn_value"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_cmpxchg(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.cmpxchg); - if (CHECK(IS_ERR(link), "attach(cmpxchg)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__cmpxchg__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(cmpxchg)")) return; - prog_fd = bpf_program__fd(skel->progs.cmpxchg); + prog_fd = skel->progs.cmpxchg.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run add", @@ -180,20 +180,20 @@ static void test_cmpxchg(struct atomics *skel) ASSERT_EQ(skel->bss->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); cleanup: - bpf_link__destroy(link); + close(link_fd); } static void test_xchg(struct atomics *skel) { int err, prog_fd; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; - link = bpf_program__attach(skel->progs.xchg); - if (CHECK(IS_ERR(link), "attach(xchg)", "err: %ld\n", PTR_ERR(link))) + link_fd = atomics__xchg__attach(skel); + if (!ASSERT_GT(link_fd, 0, "attach(xchg)")) return; - prog_fd = bpf_program__fd(skel->progs.xchg); + prog_fd = skel->progs.xchg.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); if (CHECK(err || retval, "test_run add", @@ -207,7 +207,7 @@ static void test_xchg(struct atomics *skel) ASSERT_EQ(skel->bss->xchg32_result, 1, "xchg32_result"); cleanup: - bpf_link__destroy(link); + close(link_fd); } void test_atomics(void) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 9dc4e3dfbcf3..bf307bb9e446 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,79 +2,28 @@ #include <test_progs.h> #include "test_attach_probe.skel.h" -#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 - -#define OP_RT_RA_MASK 0xffff0000UL -#define LIS_R2 0x3c400000UL -#define ADDIS_R2_R12 0x3c4c0000UL -#define ADDI_R2_R2 0x38420000UL - -static ssize_t get_offset(ssize_t addr, ssize_t base) -{ - u32 *insn = (u32 *) addr; - - /* - * A PPC64 ABIv2 function may have a local and a global entry - * point. We need to use the local entry point when patching - * functions, so identify and step over the global entry point - * sequence. - * - * The global entry point sequence is always of the form: - * - * addis r2,r12,XXXX - * addi r2,r2,XXXX - * - * A linker optimisation may convert the addis to lis: - * - * lis r2,XXXX - * addi r2,r2,XXXX - */ - if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || - ((*insn & OP_RT_RA_MASK) == LIS_R2)) && - ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) - return (ssize_t)(insn + 2) - base; - else - return addr - base; -} -#else -#define get_offset(addr, base) (addr - base) -#endif - -ssize_t get_base_addr() { - size_t start, offset; - char buf[256]; - FILE *f; - - f = fopen("/proc/self/maps", "r"); - if (!f) - return -errno; - - while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", - &start, buf, &offset) == 3) { - if (strcmp(buf, "r-xp") == 0) { - fclose(f); - return start - offset; - } - } - - fclose(f); - return -EINVAL; -} +/* this is how USDT semaphore is actually defined, except volatile modifier */ +volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); void test_attach_probe(void) { + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); int duration = 0; struct bpf_link *kprobe_link, *kretprobe_link; struct bpf_link *uprobe_link, *uretprobe_link; struct test_attach_probe* skel; size_t uprobe_offset; - ssize_t base_addr; + ssize_t base_addr, ref_ctr_offset; base_addr = get_base_addr(); if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + + ref_ctr_offset = get_rel_offset((uintptr_t)&uprobe_ref_ctr); + if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset")) + return; skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) @@ -85,36 +34,40 @@ void test_attach_probe(void) kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe, false /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); - if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", - "err %ld\n", PTR_ERR(kprobe_link))) + if (!ASSERT_OK_PTR(kprobe_link, "attach_kprobe")) goto cleanup; skel->links.handle_kprobe = kprobe_link; kretprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kretprobe, true /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); - if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe", - "err %ld\n", PTR_ERR(kretprobe_link))) + if (!ASSERT_OK_PTR(kretprobe_link, "attach_kretprobe")) goto cleanup; skel->links.handle_kretprobe = kretprobe_link; - uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, - false /* retprobe */, - 0 /* self pid */, - "/proc/self/exe", - uprobe_offset); - if (CHECK(IS_ERR(uprobe_link), "attach_uprobe", - "err %ld\n", PTR_ERR(uprobe_link))) + ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_before"); + + uprobe_opts.retprobe = false; + uprobe_opts.ref_ctr_offset = ref_ctr_offset; + uprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, + 0 /* self pid */, + "/proc/self/exe", + uprobe_offset, + &uprobe_opts); + if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe")) goto cleanup; skel->links.handle_uprobe = uprobe_link; - uretprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe, - true /* retprobe */, - -1 /* any pid */, - "/proc/self/exe", - uprobe_offset); - if (CHECK(IS_ERR(uretprobe_link), "attach_uretprobe", - "err %ld\n", PTR_ERR(uretprobe_link))) + ASSERT_GT(uprobe_ref_ctr, 0, "uprobe_ref_ctr_after"); + + /* if uprobe uses ref_ctr, uretprobe has to use ref_ctr as well */ + uprobe_opts.retprobe = true; + uprobe_opts.ref_ctr_offset = ref_ctr_offset; + uretprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, + -1 /* any pid */, + "/proc/self/exe", + uprobe_offset, &uprobe_opts); + if (!ASSERT_OK_PTR(uretprobe_link, "attach_uretprobe")) goto cleanup; skel->links.handle_uretprobe = uretprobe_link; @@ -140,4 +93,5 @@ void test_attach_probe(void) cleanup: test_attach_probe__destroy(skel); + ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_cleanup"); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c new file mode 100644 index 000000000000..5eea3c3a40fe --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <test_progs.h> +#include "test_bpf_cookie.skel.h" + +static void kprobe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL; + struct bpf_link *retlink1 = NULL, *retlink2 = NULL; + + /* attach two kprobes */ + opts.bpf_cookie = 0x1; + opts.retprobe = false; + link1 = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + opts.bpf_cookie = 0x2; + opts.retprobe = false; + link2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + goto cleanup; + + /* attach two kretprobes */ + opts.bpf_cookie = 0x10; + opts.retprobe = true; + retlink1 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(retlink1, "retlink1")) + goto cleanup; + + opts.bpf_cookie = 0x20; + opts.retprobe = true; + retlink2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(retlink2, "retlink2")) + goto cleanup; + + /* trigger kprobe && kretprobe */ + usleep(1); + + ASSERT_EQ(skel->bss->kprobe_res, 0x1 | 0x2, "kprobe_res"); + ASSERT_EQ(skel->bss->kretprobe_res, 0x10 | 0x20, "kretprobe_res"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(retlink1); + bpf_link__destroy(retlink2); +} + +static void uprobe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL; + struct bpf_link *retlink1 = NULL, *retlink2 = NULL; + size_t uprobe_offset; + ssize_t base_addr; + + base_addr = get_base_addr(); + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + + /* attach two uprobes */ + opts.bpf_cookie = 0x100; + opts.retprobe = false; + link1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, 0 /* self pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + opts.bpf_cookie = 0x200; + opts.retprobe = false; + link2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, -1 /* any pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + goto cleanup; + + /* attach two uretprobes */ + opts.bpf_cookie = 0x1000; + opts.retprobe = true; + retlink1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, -1 /* any pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(retlink1, "retlink1")) + goto cleanup; + + opts.bpf_cookie = 0x2000; + opts.retprobe = true; + retlink2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, 0 /* self pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(retlink2, "retlink2")) + goto cleanup; + + /* trigger uprobe && uretprobe */ + get_base_addr(); + + ASSERT_EQ(skel->bss->uprobe_res, 0x100 | 0x200, "uprobe_res"); + ASSERT_EQ(skel->bss->uretprobe_res, 0x1000 | 0x2000, "uretprobe_res"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(retlink1); + bpf_link__destroy(retlink2); +} + +static void tp_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_tracepoint_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL; + + /* attach first tp prog */ + opts.bpf_cookie = 0x10000; + link1 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp1, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + /* attach second tp prog */ + opts.bpf_cookie = 0x20000; + link2 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp2, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + goto cleanup; + + /* trigger tracepoints */ + usleep(1); + + ASSERT_EQ(skel->bss->tp_res, 0x10000 | 0x20000, "tp_res1"); + + /* now we detach first prog and will attach third one, which causes + * two internal calls to bpf_prog_array_copy(), shuffling + * bpf_prog_array_items around. We test here that we don't lose track + * of associated bpf_cookies. + */ + bpf_link__destroy(link1); + link1 = NULL; + kern_sync_rcu(); + skel->bss->tp_res = 0; + + /* attach third tp prog */ + opts.bpf_cookie = 0x40000; + link3 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp3, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link3, "link3")) + goto cleanup; + + /* trigger tracepoints */ + usleep(1); + + ASSERT_EQ(skel->bss->tp_res, 0x20000 | 0x40000, "tp_res2"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(link3); +} + +static void burn_cpu(void) +{ + volatile int j = 0; + cpu_set_t cpu_set; + int i, err; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + ASSERT_OK(err, "set_thread_affinity"); + + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; +} + +static void pe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, opts); + struct bpf_link *link = NULL; + struct perf_event_attr attr; + int pfd = -1; + + /* create perf event */ + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; + + opts.bpf_cookie = 0x100000; + link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts); + if (!ASSERT_OK_PTR(link, "link1")) + goto cleanup; + + burn_cpu(); /* trigger BPF prog */ + + ASSERT_EQ(skel->bss->pe_res, 0x100000, "pe_res1"); + + /* prevent bpf_link__destroy() closing pfd itself */ + bpf_link__disconnect(link); + /* close BPF link's FD explicitly */ + close(bpf_link__fd(link)); + /* free up memory used by struct bpf_link */ + bpf_link__destroy(link); + link = NULL; + kern_sync_rcu(); + skel->bss->pe_res = 0; + + opts.bpf_cookie = 0x200000; + link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts); + if (!ASSERT_OK_PTR(link, "link2")) + goto cleanup; + + burn_cpu(); /* trigger BPF prog */ + + ASSERT_EQ(skel->bss->pe_res, 0x200000, "pe_res2"); + +cleanup: + close(pfd); + bpf_link__destroy(link); +} + +void test_bpf_cookie(void) +{ + struct test_bpf_cookie *skel; + + skel = test_bpf_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_tid = syscall(SYS_gettid); + + if (test__start_subtest("kprobe")) + kprobe_subtest(skel); + if (test__start_subtest("uprobe")) + uprobe_subtest(skel); + if (test__start_subtest("tracepoint")) + tp_subtest(skel); + if (test__start_subtest("perf_event")) + pe_subtest(skel); + + test_bpf_cookie__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 2d3590cfb5e1..77ac24b191d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -13,6 +13,7 @@ #include "bpf_iter_tcp6.skel.h" #include "bpf_iter_udp4.skel.h" #include "bpf_iter_udp6.skel.h" +#include "bpf_iter_unix.skel.h" #include "bpf_iter_test_kern1.skel.h" #include "bpf_iter_test_kern2.skel.h" #include "bpf_iter_test_kern3.skel.h" @@ -47,7 +48,7 @@ static void do_dummy_read(struct bpf_program *prog) int iter_fd, len; link = bpf_program__attach_iter(prog, NULL); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) return; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -201,7 +202,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel) int ret = 0; link = bpf_program__attach_iter(prog, NULL); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) return ret; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -313,6 +314,19 @@ static void test_udp6(void) bpf_iter_udp6__destroy(skel); } +static void test_unix(void) +{ + struct bpf_iter_unix *skel; + + skel = bpf_iter_unix__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_unix__open_and_load")) + return; + + do_dummy_read(skel->progs.dump_unix); + + bpf_iter_unix__destroy(skel); +} + /* The expected string is less than 16 bytes */ static int do_read_with_fd(int iter_fd, const char *expected, bool read_one_char) @@ -396,7 +410,7 @@ static void test_file_iter(void) return; link = bpf_program__attach_iter(skel1->progs.dump_task, NULL); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; /* unlink this path if it exists. */ @@ -502,7 +516,7 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) skel->bss->map2_id = map_info.id; link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto free_map2; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -607,14 +621,12 @@ static void test_bpf_hash_map(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); - if (CHECK(!IS_ERR(link), "attach_iter", - "attach_iter for hashmap2 unexpected succeeded\n")) + if (!ASSERT_ERR_PTR(link, "attach_iter")) goto out; linfo.map.map_fd = bpf_map__fd(skel->maps.hashmap3); link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); - if (CHECK(!IS_ERR(link), "attach_iter", - "attach_iter for hashmap3 unexpected succeeded\n")) + if (!ASSERT_ERR_PTR(link, "attach_iter")) goto out; /* hashmap1 should be good, update map values here */ @@ -636,7 +648,7 @@ static void test_bpf_hash_map(void) linfo.map.map_fd = map_fd; link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -727,7 +739,7 @@ static void test_bpf_percpu_hash_map(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_hash_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -798,7 +810,7 @@ static void test_bpf_array_map(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -894,7 +906,7 @@ static void test_bpf_percpu_array_map(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_array_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -957,7 +969,7 @@ static void test_bpf_sk_storage_delete(void) opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.delete_bpf_sk_storage_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -1075,7 +1087,7 @@ static void test_bpf_sk_storage_map(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_sk_storage_map, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -1128,7 +1140,7 @@ static void test_rdonly_buf_out_of_bound(void) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); - if (CHECK(!IS_ERR(link), "attach_iter", "unexpected success\n")) + if (!ASSERT_ERR_PTR(link, "attach_iter")) bpf_link__destroy(link); bpf_iter_test_kern5__destroy(skel); @@ -1186,8 +1198,7 @@ static void test_task_vma(void) skel->links.proc_maps = bpf_program__attach_iter( skel->progs.proc_maps, NULL); - if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter", - "attach iterator failed\n")) { + if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) { skel->links.proc_maps = NULL; goto out; } @@ -1258,6 +1269,8 @@ void test_bpf_iter(void) test_udp4(); if (test__start_subtest("udp6")) test_udp6(); + if (test__start_subtest("unix")) + test_unix(); if (test__start_subtest("anon")) test_anon_iter(false); if (test__start_subtest("anon-read-one-char")) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..85babb0487b3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include "network_helpers.h" +#include "bpf_dctcp.skel.h" +#include "bpf_cubic.skel.h" +#include "bpf_iter_setsockopt.skel.h" + +static int create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return -1; + + if (!ASSERT_OK(system("ip link set dev lo up"), "bring up lo")) + return -1; + + return 0; +} + +static unsigned int set_bpf_cubic(int *fds, unsigned int nr_fds) +{ + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (setsockopt(fds[i], SOL_TCP, TCP_CONGESTION, "bpf_cubic", + sizeof("bpf_cubic"))) + return i; + } + + return nr_fds; +} + +static unsigned int check_bpf_dctcp(int *fds, unsigned int nr_fds) +{ + char tcp_cc[16]; + socklen_t optlen = sizeof(tcp_cc); + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (getsockopt(fds[i], SOL_TCP, TCP_CONGESTION, + tcp_cc, &optlen) || + strcmp(tcp_cc, "bpf_dctcp")) + return i; + } + + return nr_fds; +} + +static int *make_established(int listen_fd, unsigned int nr_est, + int **paccepted_fds) +{ + int *est_fds, *accepted_fds; + unsigned int i; + + est_fds = malloc(sizeof(*est_fds) * nr_est); + if (!est_fds) + return NULL; + + accepted_fds = malloc(sizeof(*accepted_fds) * nr_est); + if (!accepted_fds) { + free(est_fds); + return NULL; + } + + for (i = 0; i < nr_est; i++) { + est_fds[i] = connect_to_fd(listen_fd, 0); + if (est_fds[i] == -1) + break; + if (set_bpf_cubic(&est_fds[i], 1) != 1) { + close(est_fds[i]); + break; + } + + accepted_fds[i] = accept(listen_fd, NULL, 0); + if (accepted_fds[i] == -1) { + close(est_fds[i]); + break; + } + } + + if (!ASSERT_EQ(i, nr_est, "create established fds")) { + free_fds(accepted_fds, i); + free_fds(est_fds, i); + return NULL; + } + + *paccepted_fds = accepted_fds; + return est_fds; +} + +static unsigned short get_local_port(int fd) +{ + struct sockaddr_in6 addr; + socklen_t addrlen = sizeof(addr); + + if (!getsockname(fd, &addr, &addrlen)) + return ntohs(addr.sin6_port); + + return 0; +} + +static void do_bpf_iter_setsockopt(struct bpf_iter_setsockopt *iter_skel, + bool random_retry) +{ + int *reuse_listen_fds = NULL, *accepted_fds = NULL, *est_fds = NULL; + unsigned int nr_reuse_listens = 256, nr_est = 256; + int err, iter_fd = -1, listen_fd = -1; + char buf; + + /* Prepare non-reuseport listen_fd */ + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + return; + if (!ASSERT_EQ(set_bpf_cubic(&listen_fd, 1), 1, + "set listen_fd to cubic")) + goto done; + iter_skel->bss->listen_hport = get_local_port(listen_fd); + if (!ASSERT_NEQ(iter_skel->bss->listen_hport, 0, + "get_local_port(listen_fd)")) + goto done; + + /* Connect to non-reuseport listen_fd */ + est_fds = make_established(listen_fd, nr_est, &accepted_fds); + if (!ASSERT_OK_PTR(est_fds, "create established")) + goto done; + + /* Prepare reuseport listen fds */ + reuse_listen_fds = start_reuseport_server(AF_INET6, SOCK_STREAM, + "::1", 0, 0, + nr_reuse_listens); + if (!ASSERT_OK_PTR(reuse_listen_fds, "start_reuseport_server")) + goto done; + if (!ASSERT_EQ(set_bpf_cubic(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, "set reuse_listen_fds to cubic")) + goto done; + iter_skel->bss->reuse_listen_hport = get_local_port(reuse_listen_fds[0]); + if (!ASSERT_NEQ(iter_skel->bss->reuse_listen_hport, 0, + "get_local_port(reuse_listen_fds[0])")) + goto done; + + /* Run bpf tcp iter to switch from bpf_cubic to bpf_dctcp */ + iter_skel->bss->random_retry = random_retry; + iter_fd = bpf_iter_create(bpf_link__fd(iter_skel->links.change_tcp_cc)); + if (!ASSERT_GE(iter_fd, 0, "create iter_fd")) + goto done; + + while ((err = read(iter_fd, &buf, sizeof(buf))) == -1 && + errno == EAGAIN) + ; + if (!ASSERT_OK(err, "read iter error")) + goto done; + + /* Check reuseport listen fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, + "check reuse_listen_fds dctcp"); + + /* Check non reuseport listen fd for dctcp */ + ASSERT_EQ(check_bpf_dctcp(&listen_fd, 1), 1, + "check listen_fd dctcp"); + + /* Check established fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(est_fds, nr_est), nr_est, + "check est_fds dctcp"); + + /* Check accepted fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(accepted_fds, nr_est), nr_est, + "check accepted_fds dctcp"); + +done: + if (iter_fd != -1) + close(iter_fd); + if (listen_fd != -1) + close(listen_fd); + free_fds(reuse_listen_fds, nr_reuse_listens); + free_fds(accepted_fds, nr_est); + free_fds(est_fds, nr_est); +} + +void test_bpf_iter_setsockopt(void) +{ + struct bpf_iter_setsockopt *iter_skel = NULL; + struct bpf_cubic *cubic_skel = NULL; + struct bpf_dctcp *dctcp_skel = NULL; + struct bpf_link *cubic_link = NULL; + struct bpf_link *dctcp_link = NULL; + + if (create_netns()) + return; + + /* Load iter_skel */ + iter_skel = bpf_iter_setsockopt__open_and_load(); + if (!ASSERT_OK_PTR(iter_skel, "iter_skel")) + return; + iter_skel->links.change_tcp_cc = bpf_program__attach_iter(iter_skel->progs.change_tcp_cc, NULL); + if (!ASSERT_OK_PTR(iter_skel->links.change_tcp_cc, "attach iter")) + goto done; + + /* Load bpf_cubic */ + cubic_skel = bpf_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cubic_skel, "cubic_skel")) + goto done; + cubic_link = bpf_map__attach_struct_ops(cubic_skel->maps.cubic); + if (!ASSERT_OK_PTR(cubic_link, "cubic_link")) + goto done; + + /* Load bpf_dctcp */ + dctcp_skel = bpf_dctcp__open_and_load(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + goto done; + dctcp_link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(dctcp_link, "dctcp_link")) + goto done; + + do_bpf_iter_setsockopt(iter_skel, true); + do_bpf_iter_setsockopt(iter_skel, false); + +done: + bpf_link__destroy(cubic_link); + bpf_link__destroy(dctcp_link); + bpf_cubic__destroy(cubic_skel); + bpf_dctcp__destroy(dctcp_skel); + bpf_iter_setsockopt__destroy(iter_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e25917f04602..94e03df69d71 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -4,37 +4,22 @@ #include <linux/err.h> #include <netinet/tcp.h> #include <test_progs.h> +#include "network_helpers.h" #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" +#include "bpf_dctcp_release.skel.h" #define min(a, b) ((a) < (b) ? (a) : (b)) +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static const unsigned int total_bytes = 10 * 1024 * 1024; -static const struct timeval timeo_sec = { .tv_sec = 10 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int expected_stg = 0xeB9F; static int stop, duration; -static int settimeo(int fd) -{ - int err; - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n", - errno)) - return -1; - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n", - errno)) - return -1; - - return 0; -} - static int settcpca(int fd, const char *tcp_ca) { int err; @@ -61,7 +46,7 @@ static void *server(void *arg) goto done; } - if (settimeo(fd)) { + if (settimeo(fd, 0)) { err = -errno; goto done; } @@ -82,7 +67,7 @@ static void *server(void *arg) bytes, total_bytes, nr_sent, errno); done: - if (fd != -1) + if (fd >= 0) close(fd); if (err) { WRITE_ONCE(stop, 1); @@ -114,7 +99,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd) || settimeo(fd)) + settimeo(lfd, 0) || settimeo(fd, 0)) goto done; /* bind, listen and start server thread to accept */ @@ -191,8 +176,7 @@ static void test_cubic(void) return; link = bpf_map__attach_struct_ops(cubic_skel->maps.cubic); - if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n", - PTR_ERR(link))) { + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { bpf_cubic__destroy(cubic_skel); return; } @@ -213,8 +197,7 @@ static void test_dctcp(void) return; link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); - if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n", - PTR_ERR(link))) { + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { bpf_dctcp__destroy(dctcp_skel); return; } @@ -269,6 +252,77 @@ static void test_invalid_license(void) libbpf_set_print(old_print_fn); } +static void test_dctcp_fallback(void) +{ + int err, lfd = -1, cli_fd = -1, srv_fd = -1; + struct network_helper_opts opts = { + .cc = "cubic", + }; + struct bpf_dctcp *dctcp_skel; + struct bpf_link *link = NULL; + char srv_cc[16]; + socklen_t cc_len = sizeof(srv_cc); + + dctcp_skel = bpf_dctcp__open(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + return; + strcpy(dctcp_skel->rodata->fallback, "cubic"); + if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) + goto done; + + link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(link, "dctcp link")) + goto done; + + lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(lfd, 0, "lfd") || + !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp")) + goto done; + + cli_fd = connect_to_fd_opts(lfd, &opts); + if (!ASSERT_GE(cli_fd, 0, "cli_fd")) + goto done; + + srv_fd = accept(lfd, NULL, 0); + if (!ASSERT_GE(srv_fd, 0, "srv_fd")) + goto done; + ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res"); + ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res"); + + err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len); + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) + goto done; + ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc"); + +done: + bpf_link__destroy(link); + bpf_dctcp__destroy(dctcp_skel); + if (lfd != -1) + close(lfd); + if (srv_fd != -1) + close(srv_fd); + if (cli_fd != -1) + close(cli_fd); +} + +static void test_rel_setsockopt(void) +{ + struct bpf_dctcp_release *rel_skel; + libbpf_print_fn_t old_print_fn; + + err_str = "unknown func bpf_setsockopt"; + found = false; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + rel_skel = bpf_dctcp_release__open_and_load(); + libbpf_set_print(old_print_fn); + + ASSERT_ERR_PTR(rel_skel, "rel_skel"); + ASSERT_TRUE(found, "expected_err_msg"); + + bpf_dctcp_release__destroy(rel_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -277,4 +331,8 @@ void test_bpf_tcp_ca(void) test_cubic(); if (test__start_subtest("invalid_license")) test_invalid_license(); + if (test__start_subtest("dctcp_fallback")) + test_dctcp_fallback(); + if (test__start_subtest("rel_setsockopt")) + test_rel_setsockopt(); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 0457ae32b270..649f87382c8d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3811,7 +3811,7 @@ static void do_test_raw(unsigned int test_num) always_log); free(raw_btf); - err = ((btf_fd == -1) != test->btf_load_err); + err = ((btf_fd < 0) != test->btf_load_err); if (CHECK(err, "btf_fd:%d test->btf_load_err:%u", btf_fd, test->btf_load_err) || CHECK(test->err_str && !strstr(btf_log_buf, test->err_str), @@ -3820,7 +3820,7 @@ static void do_test_raw(unsigned int test_num) goto done; } - if (err || btf_fd == -1) + if (err || btf_fd < 0) goto done; create_attr.name = test->map_name; @@ -3834,16 +3834,16 @@ static void do_test_raw(unsigned int test_num) map_fd = bpf_create_map_xattr(&create_attr); - err = ((map_fd == -1) != test->map_create_err); + err = ((map_fd < 0) != test->map_create_err); CHECK(err, "map_fd:%d test->map_create_err:%u", map_fd, test->map_create_err); done: if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); - if (btf_fd != -1) + if (btf_fd >= 0) close(btf_fd); - if (map_fd != -1) + if (map_fd >= 0) close(map_fd); } @@ -3941,7 +3941,7 @@ static int test_big_btf_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, always_log); - if (CHECK(btf_fd == -1, "errno:%d", errno)) { + if (CHECK(btf_fd < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -3987,7 +3987,7 @@ done: free(raw_btf); free(user_btf); - if (btf_fd != -1) + if (btf_fd >= 0) close(btf_fd); return err; @@ -4029,7 +4029,7 @@ static int test_btf_id(unsigned int test_num) btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, always_log); - if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) { + if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4043,7 +4043,7 @@ static int test_btf_id(unsigned int test_num) } btf_fd[1] = bpf_btf_get_fd_by_id(info[0].id); - if (CHECK(btf_fd[1] == -1, "errno:%d", errno)) { + if (CHECK(btf_fd[1] < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4071,7 +4071,7 @@ static int test_btf_id(unsigned int test_num) create_attr.btf_value_type_id = 2; map_fd = bpf_create_map_xattr(&create_attr); - if (CHECK(map_fd == -1, "errno:%d", errno)) { + if (CHECK(map_fd < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4094,7 +4094,7 @@ static int test_btf_id(unsigned int test_num) /* Test BTF ID is removed from the kernel */ btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id); - if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) { + if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4105,7 +4105,7 @@ static int test_btf_id(unsigned int test_num) close(map_fd); map_fd = -1; btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id); - if (CHECK(btf_fd[0] != -1, "BTF lingers")) { + if (CHECK(btf_fd[0] >= 0, "BTF lingers")) { err = -1; goto done; } @@ -4117,11 +4117,11 @@ done: fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); - if (map_fd != -1) + if (map_fd >= 0) close(map_fd); for (i = 0; i < 2; i++) { free(user_btf[i]); - if (btf_fd[i] != -1) + if (btf_fd[i] >= 0) close(btf_fd[i]); } @@ -4166,7 +4166,7 @@ static void do_test_get_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, always_log); - if (CHECK(btf_fd == -1, "errno:%d", errno)) { + if (CHECK(btf_fd <= 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4212,7 +4212,7 @@ done: free(raw_btf); free(user_btf); - if (btf_fd != -1) + if (btf_fd >= 0) close(btf_fd); } @@ -4249,8 +4249,9 @@ static void do_test_file(unsigned int test_num) return; btf = btf__parse_elf(test->file, &btf_ext); - if (IS_ERR(btf)) { - if (PTR_ERR(btf) == -ENOENT) { + err = libbpf_get_error(btf); + if (err) { + if (err == -ENOENT) { printf("%s:SKIP: No ELF %s found", __func__, BTF_ELF_SEC); test__skip(); return; @@ -4263,7 +4264,8 @@ static void do_test_file(unsigned int test_num) btf_ext__free(btf_ext); obj = bpf_object__open(test->file); - if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj))) + err = libbpf_get_error(obj); + if (CHECK(err, "obj: %d", err)) return; prog = bpf_program__next(NULL, obj); @@ -4298,7 +4300,7 @@ static void do_test_file(unsigned int test_num) info_len = sizeof(struct bpf_prog_info); err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) { + if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) { fprintf(stderr, "%s\n", btf_log_buf); err = -1; goto done; @@ -4330,7 +4332,7 @@ static void do_test_file(unsigned int test_num) err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) { + if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) { fprintf(stderr, "%s\n", btf_log_buf); err = -1; goto done; @@ -4348,7 +4350,8 @@ static void do_test_file(unsigned int test_num) goto done; } - err = btf__get_from_id(info.btf_id, &btf); + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); if (CHECK(err, "cannot get btf from kernel, err: %d", err)) goto done; @@ -4384,6 +4387,7 @@ skip: fprintf(stderr, "OK"); done: + btf__free(btf); free(func_info); bpf_object__close(obj); } @@ -4886,7 +4890,7 @@ static void do_test_pprint(int test_num) always_log); free(raw_btf); - if (CHECK(btf_fd == -1, "errno:%d", errno)) { + if (CHECK(btf_fd < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4901,7 +4905,7 @@ static void do_test_pprint(int test_num) create_attr.btf_value_type_id = test->value_type_id; map_fd = bpf_create_map_xattr(&create_attr); - if (CHECK(map_fd == -1, "errno:%d", errno)) { + if (CHECK(map_fd < 0, "errno:%d", errno)) { err = -1; goto done; } @@ -4982,7 +4986,7 @@ static void do_test_pprint(int test_num) err = check_line(expected_line, nexpected_line, sizeof(expected_line), line); - if (err == -1) + if (err < 0) goto done; } @@ -4998,7 +5002,7 @@ static void do_test_pprint(int test_num) cpu, cmapv); err = check_line(expected_line, nexpected_line, sizeof(expected_line), line); - if (err == -1) + if (err < 0) goto done; cmapv = cmapv + rounded_value_size; @@ -5036,9 +5040,9 @@ done: fprintf(stderr, "OK"); if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); - if (btf_fd != -1) + if (btf_fd >= 0) close(btf_fd); - if (map_fd != -1) + if (map_fd >= 0) close(map_fd); if (pin_file) fclose(pin_file); @@ -5950,7 +5954,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test, /* get necessary lens */ info_len = sizeof(struct bpf_prog_info); err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) { + if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) { fprintf(stderr, "%s\n", btf_log_buf); return -1; } @@ -5980,7 +5984,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test, info.func_info_rec_size = rec_size; info.func_info = ptr_to_u64(func_info); err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) { + if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) { fprintf(stderr, "%s\n", btf_log_buf); err = -1; goto done; @@ -6044,7 +6048,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test, info_len = sizeof(struct bpf_prog_info); err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (CHECK(err == -1, "err:%d errno:%d", err, errno)) { + if (CHECK(err < 0, "err:%d errno:%d", err, errno)) { err = -1; goto done; } @@ -6123,7 +6127,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test, * Only recheck the info.*line_info* fields. * Other fields are not the concern of this test. */ - if (CHECK(err == -1 || + if (CHECK(err < 0 || info.nr_line_info != cnt || (jited_cnt && !info.jited_line_info) || info.nr_jited_line_info != jited_cnt || @@ -6260,7 +6264,7 @@ static void do_test_info_raw(unsigned int test_num) always_log); free(raw_btf); - if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) { + if (CHECK(btf_fd < 0, "invalid btf_fd errno:%d", errno)) { err = -1; goto done; } @@ -6273,7 +6277,8 @@ static void do_test_info_raw(unsigned int test_num) patched_linfo = patch_name_tbd(test->line_info, test->str_sec, linfo_str_off, test->str_sec_size, &linfo_size); - if (IS_ERR(patched_linfo)) { + err = libbpf_get_error(patched_linfo); + if (err) { fprintf(stderr, "error in creating raw bpf_line_info"); err = -1; goto done; @@ -6297,7 +6302,7 @@ static void do_test_info_raw(unsigned int test_num) } prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); - err = ((prog_fd == -1) != test->expected_prog_load_failure); + err = ((prog_fd < 0) != test->expected_prog_load_failure); if (CHECK(err, "prog_fd:%d expected_prog_load_failure:%u errno:%d", prog_fd, test->expected_prog_load_failure, errno) || CHECK(test->err_str && !strstr(btf_log_buf, test->err_str), @@ -6306,7 +6311,7 @@ static void do_test_info_raw(unsigned int test_num) goto done; } - if (prog_fd == -1) + if (prog_fd < 0) goto done; err = test_get_finfo(test, prog_fd); @@ -6323,12 +6328,12 @@ done: if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); - if (btf_fd != -1) + if (btf_fd >= 0) close(btf_fd); - if (prog_fd != -1) + if (prog_fd >= 0) close(prog_fd); - if (!IS_ERR(patched_linfo)) + if (!libbpf_get_error(patched_linfo)) free(patched_linfo); } @@ -6839,9 +6844,9 @@ static void do_test_dedup(unsigned int test_num) return; test_btf = btf__new((__u8 *)raw_btf, raw_btf_size); + err = libbpf_get_error(test_btf); free(raw_btf); - if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld", - PTR_ERR(test_btf))) { + if (CHECK(err, "invalid test_btf errno:%d", err)) { err = -1; goto done; } @@ -6853,9 +6858,9 @@ static void do_test_dedup(unsigned int test_num) if (!raw_btf) return; expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size); + err = libbpf_get_error(expect_btf); free(raw_btf); - if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld", - PTR_ERR(expect_btf))) { + if (CHECK(err, "invalid expect_btf errno:%d", err)) { err = -1; goto done; } @@ -6966,10 +6971,8 @@ static void do_test_dedup(unsigned int test_num) } done: - if (!IS_ERR(test_btf)) - btf__free(test_btf); - if (!IS_ERR(expect_btf)) - btf__free(expect_btf); + btf__free(test_btf); + btf__free(expect_btf); } void test_btf(void) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 5e129dc2073c..52ccf0cf35e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -32,8 +32,9 @@ static int btf_dump_all_types(const struct btf *btf, int err = 0, id; d = btf_dump__new(btf, NULL, opts, btf_dump_printf); - if (IS_ERR(d)) - return PTR_ERR(d); + err = libbpf_get_error(d); + if (err) + return err; for (id = 1; id <= type_cnt; id++) { err = btf_dump__dump_type(d, id); @@ -56,8 +57,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) snprintf(test_file, sizeof(test_file), "%s.o", t->file); btf = btf__parse_elf(test_file, NULL); - if (CHECK(IS_ERR(btf), "btf_parse_elf", - "failed to load test BTF: %ld\n", PTR_ERR(btf))) { + if (!ASSERT_OK_PTR(btf, "btf_parse_elf")) { err = -PTR_ERR(btf); btf = NULL; goto done; @@ -232,7 +232,593 @@ err_out: btf__free(btf); } +#define STRSIZE 4096 + +static void btf_dump_snprintf(void *ctx, const char *fmt, va_list args) +{ + char *s = ctx, new[STRSIZE]; + + vsnprintf(new, STRSIZE, fmt, args); + if (strlen(s) < STRSIZE) + strncat(s, new, STRSIZE - strlen(s) - 1); +} + +static int btf_dump_data(struct btf *btf, struct btf_dump *d, + char *name, char *prefix, __u64 flags, void *ptr, + size_t ptr_sz, char *str, const char *expected_val) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + size_t type_sz; + __s32 type_id; + int ret = 0; + + if (flags & BTF_F_COMPACT) + opts.compact = true; + if (flags & BTF_F_NONAME) + opts.skip_names = true; + if (flags & BTF_F_ZERO) + opts.emit_zeroes = true; + if (prefix) { + ASSERT_STRNEQ(name, prefix, strlen(prefix), + "verify prefix match"); + name += strlen(prefix) + 1; + } + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GE(type_id, 0, "find type id")) + return -ENOENT; + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, ptr, ptr_sz, &opts); + if (type_sz <= ptr_sz) { + if (!ASSERT_EQ(ret, type_sz, "failed/unexpected type_sz")) + return -EINVAL; + } else { + if (!ASSERT_EQ(ret, -E2BIG, "failed to return -E2BIG")) + return -EINVAL; + } + if (!ASSERT_STREQ(str, expected_val, "ensure expected/actual match")) + return -EFAULT; + return 0; +} + +#define TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + _expected, ...) \ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +/* Use where expected data string matches its stringified declaration */ +#define TEST_BTF_DUMP_DATA_C(_b, _d, _prefix, _str, _type, _flags, \ + ...) \ + TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + "(" #_type ")" #__VA_ARGS__, __VA_ARGS__) + +/* overflow test; pass typesize < expected type size, ensure E2BIG returned */ +#define TEST_BTF_DUMP_DATA_OVER(_b, _d, _prefix, _str, _type, _type_sz, \ + _expected, ...) \ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, 0, \ + _ptr, _type_sz, _str, _expected); \ + } while (0) + +#define TEST_BTF_DUMP_VAR(_b, _d, _prefix, _str, _var, _type, _flags, \ + _expected, ...) \ + do { \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _var, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, + char *str) +{ +#ifdef __SIZEOF_INT128__ + __int128 i = 0xffffffffffffffff; + + /* this dance is required because we cannot directly initialize + * a 128-bit value to anything larger than a 64-bit value. + */ + i = (i << 64) | (i - 1); +#endif + /* simple int */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "1234", 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)1234", 1234); + + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT, "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_ZERO, + "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "-4567", -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)-4567", -4567); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, int, sizeof(int)-1, "", 1); + +#ifdef __SIZEOF_INT128__ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, __int128, BTF_F_COMPACT, + "(__int128)0xffffffffffffffff", + 0xffffffffffffffff); + ASSERT_OK(btf_dump_data(btf, d, "__int128", NULL, 0, &i, 16, str, + "(__int128)0xfffffffffffffffffffffffffffffffe"), + "dump __int128"); +#endif +} + +static void test_btf_dump_float_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + float t1 = 1.234567; + float t2 = -1.234567; + float t3 = 0.0; + double t4 = 5.678912; + double t5 = -5.678912; + double t6 = 0.0; + long double t7 = 9.876543; + long double t8 = -9.876543; + long double t9 = 0.0; + + /* since the kernel does not likely have any float types in its BTF, we + * will need to add some of various sizes. + */ + + ASSERT_GT(btf__add_float(btf, "test_float", 4), 0, "add float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t1, 4, str, + "(test_float)1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t2, 4, str, + "(test_float)-1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t3, 4, str, + "(test_float)0.000000"), "dump float"); + + ASSERT_GT(btf__add_float(btf, "test_double", 8), 0, "add_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t4, 8, str, + "(test_double)5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t5, 8, str, + "(test_double)-5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t6, 8, str, + "(test_double)0.000000"), "dump double"); + + ASSERT_GT(btf__add_float(btf, "test_long_double", 16), 0, "add long double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t7, 16, + str, "(test_long_double)9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t8, 16, + str, "(test_long_double)-9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t9, 16, + str, "(test_long_double)0.000000"), + "dump long_double"); +} + +static void test_btf_dump_char_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple char */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, char, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "100", 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)100", 100); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_ZERO, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)0", 0); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, char, sizeof(char)-1, "", 100); +} + +static void test_btf_dump_typedef_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple typedef */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, uint64_t, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "1", 1); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)1", 1); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT, "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_ZERO, + "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)0", 0); + + /* typedef struct */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, + {.counter = (int)1,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{1,}", { .counter = 1 }); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +" .counter = (int)1,\n" +"}", + {.counter = 1,}); + /* typedef with 0 value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, "(atomic_t){}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +"}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_ZERO, + "(atomic_t){.counter = (int)0,}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_ZERO, +"(atomic_t){\n" +" .counter = (int)0,\n" +"}", + { .counter = 0,}); + + /* overflow should show type but not value since it overflows */ + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, atomic_t, sizeof(atomic_t)-1, + "(atomic_t){\n", { .counter = 1}); +} + +static void test_btf_dump_enum_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* enum where enum value does (and does not) exist */ + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + "(enum bpf_cmd)BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME, + "BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_ZERO, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME, + "2000", 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)2000", 2000); + + TEST_BTF_DUMP_DATA_OVER(btf, d, "enum", str, enum bpf_cmd, + sizeof(enum bpf_cmd) - 1, "", BPF_MAP_CREATE); +} + +static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + char zero_data[512] = { }; + char type_data[512]; + void *fops = type_data; + void *skb = type_data; + size_t type_sz; + __s32 type_id; + char *cmpstr; + int ret; + + memset(type_data, 255, sizeof(type_data)); + + /* simple struct */ + TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + {.name_off = (__u32)3,.val = (__s32)-1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{3,-1,}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0, +"(struct btf_enum){\n" +" .name_off = (__u32)3,\n" +" .val = (__s32)-1,\n" +"}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{-1,}", + { .name_off = 0, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,-1,}", + { .name_off = 0, .val = -1,}); + /* empty struct should be printed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + "(struct btf_enum){}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0, +"(struct btf_enum){\n" +"}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_ZERO, + "(struct btf_enum){.name_off = (__u32)0,.val = (__s32)0,}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_ZERO, +"(struct btf_enum){\n" +" .name_off = (__u32)0,\n" +" .val = (__s32)0,\n" +"}", + { .name_off = 0, .val = 0,}); + + /* struct with pointers */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT, + "(struct list_head){.next = (struct list_head *)0x1,}", + { .next = (struct list_head *)1 }); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0, +"(struct list_head){\n" +" .next = (struct list_head *)0x1,\n" +"}", + { .next = (struct list_head *)1 }); + /* NULL pointer should not be displayed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT, + "(struct list_head){}", + { .next = (struct list_head *)0 }); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0, +"(struct list_head){\n" +"}", + { .next = (struct list_head *)0 }); + + /* struct with function pointers */ + type_id = btf__find_by_name(btf, "file_operations"); + if (ASSERT_GT(type_id, 0, "find type id")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + ret = btf_dump__dump_type_data(d, type_id, fops, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping file_operations"); + cmpstr = +"(struct file_operations){\n" +" .owner = (struct module *)0xffffffffffffffff,\n" +" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; + + ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); + } + + /* struct with char array */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){.name = (char[16])['f','o','o',],}", + { .name = "foo",}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, + BTF_F_COMPACT | BTF_F_NONAME, + "{['f','o','o',],}", + {.name = "foo",}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, 0, +"(struct bpf_prog_info){\n" +" .name = (char[16])[\n" +" 'f',\n" +" 'o',\n" +" 'o',\n" +" ],\n" +"}", + {.name = "foo",}); + /* leading null char means do not display string */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){}", + {.name = {'\0', 'f', 'o', 'o'}}); + /* handle non-printable characters */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){.name = (char[16])[1,2,3,],}", + { .name = {1, 2, 3, 0}}); + + /* struct with non-char array */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT, + "(struct __sk_buff){.cb = (__u32[5])[1,2,3,4,5,],}", + { .cb = {1, 2, 3, 4, 5,},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, + BTF_F_COMPACT | BTF_F_NONAME, + "{[1,2,3,4,5,],}", + { .cb = { 1, 2, 3, 4, 5},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, 0, +"(struct __sk_buff){\n" +" .cb = (__u32[5])[\n" +" 1,\n" +" 2,\n" +" 3,\n" +" 4,\n" +" 5,\n" +" ],\n" +"}", + { .cb = { 1, 2, 3, 4, 5},}); + /* For non-char, arrays, show non-zero values only */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT, + "(struct __sk_buff){.cb = (__u32[5])[0,0,1,0,0,],}", + { .cb = { 0, 0, 1, 0, 0},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, 0, +"(struct __sk_buff){\n" +" .cb = (__u32[5])[\n" +" 0,\n" +" 0,\n" +" 1,\n" +" 0,\n" +" 0,\n" +" ],\n" +"}", + { .cb = { 0, 0, 1, 0, 0},}); + + /* struct with bitfields */ + TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT, + {.code = (__u8)1,.dst_reg = (__u8)0x2,.src_reg = (__u8)0x3,.off = (__s16)4,.imm = (__s32)5,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, + BTF_F_COMPACT | BTF_F_NONAME, + "{1,0x2,0x3,4,5,}", + { .code = 1, .dst_reg = 0x2, .src_reg = 0x3, .off = 4, + .imm = 5,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, 0, +"(struct bpf_insn){\n" +" .code = (__u8)1,\n" +" .dst_reg = (__u8)0x2,\n" +" .src_reg = (__u8)0x3,\n" +" .off = (__s16)4,\n" +" .imm = (__s32)5,\n" +"}", + {.code = 1, .dst_reg = 2, .src_reg = 3, .off = 4, .imm = 5}); + + /* zeroed bitfields should not be displayed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT, + "(struct bpf_insn){.dst_reg = (__u8)0x1,}", + { .code = 0, .dst_reg = 1}); + + /* struct with enum bitfield */ + type_id = btf__find_by_name(btf, "fs_context"); + if (ASSERT_GT(type_id, 0, "find fs_context")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + opts.emit_zeroes = true; + ret = btf_dump__dump_type_data(d, type_id, zero_data, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping fs_context"); + + ASSERT_NEQ(strstr(str, "FS_CONTEXT_FOR_MOUNT"), NULL, + "bitfield value not present"); + } + + /* struct with nested anon union */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_sock_ops, BTF_F_COMPACT, + "(struct bpf_sock_ops){.op = (__u32)1,(union){.args = (__u32[4])[1,2,3,4,],.reply = (__u32)1,.replylong = (__u32[4])[1,2,3,4,],},}", + { .op = 1, .args = { 1, 2, 3, 4}}); + + /* union with nested struct */ + TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", + { .map = { .map_fd = 1 }}); + + /* struct skb with nested structs/unions; because type output is so + * complex, we don't do a string comparison, just verify we return + * the type size as the amount of data displayed. + */ + type_id = btf__find_by_name(btf, "sk_buff"); + if (ASSERT_GT(type_id, 0, "find struct sk_buff")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + ret = btf_dump__dump_type_data(d, type_id, skb, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping sk_buff"); + } + + /* overflow bpf_sock_ops struct with final element nonzero/zero. + * Regardless of the value of the final field, we don't have all the + * data we need to display it, so we should trigger an overflow. + * In other words oveflow checking should trump "is field zero?" + * checks because if we've overflowed, it shouldn't matter what the + * field is - we can't trust its value so shouldn't display it. + */ + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 2}); + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 0}); +} + +static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, + "static int cpu_profile_flip = (int)2", 2); +} + +static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, + const char *name, const char *expected_val, + void *data, size_t data_sz) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + int ret = 0, cmp; + size_t secsize; + __s32 type_id; + + opts.compact = true; + + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GT(type_id, 0, "find type id")) + return; + + secsize = btf__resolve_size(btf, type_id); + ASSERT_EQ(secsize, 0, "verify section size"); + + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, data, data_sz, &opts); + ASSERT_EQ(ret, 0, "unexpected return value"); + + cmp = strcmp(str, expected_val); + ASSERT_EQ(cmp, 0, "ensure expected/actual match"); +} + +static void test_btf_dump_datasec_data(char *str) +{ + struct btf *btf = btf__parse("xdping_kern.o", NULL); + struct btf_dump_opts opts = { .ctx = str }; + char license[4] = "GPL"; + struct btf_dump *d; + + if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF dump")) + return; + + test_btf_datasec(btf, d, str, "license", + "SEC(\"license\") char[4] _license = (char[4])['G','P','L',];", + license, sizeof(license)); +} + void test_btf_dump() { + char str[STRSIZE]; + struct btf_dump_opts opts = { .ctx = str }; + struct btf_dump *d; + struct btf *btf; int i; for (i = 0; i < ARRAY_SIZE(btf_dump_test_cases); i++) { @@ -245,4 +831,33 @@ void test_btf_dump() { } if (test__start_subtest("btf_dump: incremental")) test_btf_dump_incremental(); + + btf = libbpf_find_kernel_btf(); + if (!ASSERT_OK_PTR(btf, "no kernel BTF found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF dump")) + return; + + /* Verify type display for various types. */ + if (test__start_subtest("btf_dump: int_data")) + test_btf_dump_int_data(btf, d, str); + if (test__start_subtest("btf_dump: float_data")) + test_btf_dump_float_data(btf, d, str); + if (test__start_subtest("btf_dump: char_data")) + test_btf_dump_char_data(btf, d, str); + if (test__start_subtest("btf_dump: typedef_data")) + test_btf_dump_typedef_data(btf, d, str); + if (test__start_subtest("btf_dump: enum_data")) + test_btf_dump_enum_data(btf, d, str); + if (test__start_subtest("btf_dump: struct_data")) + test_btf_dump_struct_data(btf, d, str); + if (test__start_subtest("btf_dump: var_data")) + test_btf_dump_var_data(btf, d, str); + btf_dump__free(d); + btf__free(btf); + + if (test__start_subtest("btf_dump: datasec_data")) + test_btf_dump_datasec_data(str); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_module.c b/tools/testing/selftests/bpf/prog_tests/btf_module.c new file mode 100644 index 000000000000..2239d1fe0332 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_module.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 Hengqi Chen */ + +#include <test_progs.h> +#include <bpf/btf.h> + +static const char *module_name = "bpf_testmod"; +static const char *symbol_name = "bpf_testmod_test_read"; + +void test_btf_module() +{ + struct btf *vmlinux_btf, *module_btf; + __s32 type_id; + + if (!env.has_testmod) { + test__skip(); + return; + } + + vmlinux_btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF")) + return; + + module_btf = btf__load_module_btf(module_name, vmlinux_btf); + if (!ASSERT_OK_PTR(module_btf, "could not load module BTF")) + goto cleanup; + + type_id = btf__find_by_name(module_btf, symbol_name); + ASSERT_GT(type_id, 0, "func not found"); + +cleanup: + btf__free(module_btf); + btf__free(vmlinux_btf); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c index f36da15b134f..022c7d89d6f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_write.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c @@ -4,8 +4,6 @@ #include <bpf/btf.h> #include "btf_helpers.h" -static int duration = 0; - void test_btf_write() { const struct btf_var_secinfo *vi; const struct btf_type *t; @@ -16,7 +14,7 @@ void test_btf_write() { int id, err, str_off; btf = btf__new_empty(); - if (CHECK(IS_ERR(btf), "new_empty", "failed: %ld\n", PTR_ERR(btf))) + if (!ASSERT_OK_PTR(btf, "new_empty")) return; str_off = btf__find_str(btf, "int"); diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c index 643dfa35419c..876be0ecb654 100644 --- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c @@ -102,8 +102,7 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) */ parent_link = bpf_program__attach_cgroup(obj->progs.egress, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_link), "parent-cg-attach", - "err %ld", PTR_ERR(parent_link))) + if (!ASSERT_OK_PTR(parent_link, "parent-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "first-connect-send", "errno %d", errno)) @@ -126,8 +125,7 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) */ child_link = bpf_program__attach_cgroup(obj->progs.egress, child_cgroup_fd); - if (CHECK(IS_ERR(child_link), "child-cg-attach", - "err %ld", PTR_ERR(child_link))) + if (!ASSERT_OK_PTR(child_link, "child-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "second-connect-send", "errno %d", errno)) @@ -147,10 +145,8 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - if (!IS_ERR(parent_link)) - bpf_link__destroy(parent_link); - if (!IS_ERR(child_link)) - bpf_link__destroy(child_link); + bpf_link__destroy(parent_link); + bpf_link__destroy(child_link); cg_storage_multi_egress_only__destroy(obj); } @@ -176,18 +172,15 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) */ parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach", - "err %ld", PTR_ERR(parent_egress1_link))) + if (!ASSERT_OK_PTR(parent_egress1_link, "parent-egress1-cg-attach")) goto close_bpf_object; parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach", - "err %ld", PTR_ERR(parent_egress2_link))) + if (!ASSERT_OK_PTR(parent_egress2_link, "parent-egress2-cg-attach")) goto close_bpf_object; parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach", - "err %ld", PTR_ERR(parent_ingress_link))) + if (!ASSERT_OK_PTR(parent_ingress_link, "parent-ingress-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "first-connect-send", "errno %d", errno)) @@ -221,18 +214,15 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) */ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, child_cgroup_fd); - if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach", - "err %ld", PTR_ERR(child_egress1_link))) + if (!ASSERT_OK_PTR(child_egress1_link, "child-egress1-cg-attach")) goto close_bpf_object; child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, child_cgroup_fd); - if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach", - "err %ld", PTR_ERR(child_egress2_link))) + if (!ASSERT_OK_PTR(child_egress2_link, "child-egress2-cg-attach")) goto close_bpf_object; child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, child_cgroup_fd); - if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach", - "err %ld", PTR_ERR(child_ingress_link))) + if (!ASSERT_OK_PTR(child_ingress_link, "child-ingress-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "second-connect-send", "errno %d", errno)) @@ -264,18 +254,12 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - if (!IS_ERR(parent_egress1_link)) - bpf_link__destroy(parent_egress1_link); - if (!IS_ERR(parent_egress2_link)) - bpf_link__destroy(parent_egress2_link); - if (!IS_ERR(parent_ingress_link)) - bpf_link__destroy(parent_ingress_link); - if (!IS_ERR(child_egress1_link)) - bpf_link__destroy(child_egress1_link); - if (!IS_ERR(child_egress2_link)) - bpf_link__destroy(child_egress2_link); - if (!IS_ERR(child_ingress_link)) - bpf_link__destroy(child_ingress_link); + bpf_link__destroy(parent_egress1_link); + bpf_link__destroy(parent_egress2_link); + bpf_link__destroy(parent_ingress_link); + bpf_link__destroy(child_egress1_link); + bpf_link__destroy(child_egress2_link); + bpf_link__destroy(child_ingress_link); cg_storage_multi_isolated__destroy(obj); } @@ -301,18 +285,15 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd) */ parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach", - "err %ld", PTR_ERR(parent_egress1_link))) + if (!ASSERT_OK_PTR(parent_egress1_link, "parent-egress1-cg-attach")) goto close_bpf_object; parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach", - "err %ld", PTR_ERR(parent_egress2_link))) + if (!ASSERT_OK_PTR(parent_egress2_link, "parent-egress2-cg-attach")) goto close_bpf_object; parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, parent_cgroup_fd); - if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach", - "err %ld", PTR_ERR(parent_ingress_link))) + if (!ASSERT_OK_PTR(parent_ingress_link, "parent-ingress-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "first-connect-send", "errno %d", errno)) @@ -338,18 +319,15 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd) */ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, child_cgroup_fd); - if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach", - "err %ld", PTR_ERR(child_egress1_link))) + if (!ASSERT_OK_PTR(child_egress1_link, "child-egress1-cg-attach")) goto close_bpf_object; child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, child_cgroup_fd); - if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach", - "err %ld", PTR_ERR(child_egress2_link))) + if (!ASSERT_OK_PTR(child_egress2_link, "child-egress2-cg-attach")) goto close_bpf_object; child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, child_cgroup_fd); - if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach", - "err %ld", PTR_ERR(child_ingress_link))) + if (!ASSERT_OK_PTR(child_ingress_link, "child-ingress-cg-attach")) goto close_bpf_object; err = connect_send(CHILD_CGROUP); if (CHECK(err, "second-connect-send", "errno %d", errno)) @@ -375,18 +353,12 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - if (!IS_ERR(parent_egress1_link)) - bpf_link__destroy(parent_egress1_link); - if (!IS_ERR(parent_egress2_link)) - bpf_link__destroy(parent_egress2_link); - if (!IS_ERR(parent_ingress_link)) - bpf_link__destroy(parent_ingress_link); - if (!IS_ERR(child_egress1_link)) - bpf_link__destroy(child_egress1_link); - if (!IS_ERR(child_egress2_link)) - bpf_link__destroy(child_egress2_link); - if (!IS_ERR(child_ingress_link)) - bpf_link__destroy(child_ingress_link); + bpf_link__destroy(parent_egress1_link); + bpf_link__destroy(parent_egress2_link); + bpf_link__destroy(parent_ingress_link); + bpf_link__destroy(child_egress1_link); + bpf_link__destroy(child_egress2_link); + bpf_link__destroy(child_ingress_link); cg_storage_multi_shared__destroy(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 0a1fc9816cef..20bb8831dda6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -167,7 +167,7 @@ void test_cgroup_attach_multi(void) prog_cnt = 2; CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, &attach_flags, - prog_ids, &prog_cnt) != -1); + prog_ids, &prog_cnt) >= 0); CHECK_FAIL(errno != ENOSPC); CHECK_FAIL(prog_cnt != 4); /* check that prog_ids are returned even when buffer is too small */ diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c index 736796e56ed1..9091524131d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -65,8 +65,7 @@ void test_cgroup_link(void) for (i = 0; i < cg_nr; i++) { links[i] = bpf_program__attach_cgroup(skel->progs.egress, cgs[i].fd); - if (CHECK(IS_ERR(links[i]), "cg_attach", "i: %d, err: %ld\n", - i, PTR_ERR(links[i]))) + if (!ASSERT_OK_PTR(links[i], "cg_attach")) goto cleanup; } @@ -121,8 +120,7 @@ void test_cgroup_link(void) links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, cgs[last_cg].fd); - if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", - PTR_ERR(links[last_cg]))) + if (!ASSERT_OK_PTR(links[last_cg], "cg_attach")) goto cleanup; ping_and_check(cg_nr + 1, 0); @@ -147,7 +145,7 @@ void test_cgroup_link(void) /* attempt to mix in with multi-attach bpf_link */ tmp_link = bpf_program__attach_cgroup(skel->progs.egress, cgs[last_cg].fd); - if (CHECK(!IS_ERR(tmp_link), "cg_attach_fail", "unexpected success!\n")) { + if (!ASSERT_ERR_PTR(tmp_link, "cg_attach_fail")) { bpf_link__destroy(tmp_link); goto cleanup; } @@ -165,8 +163,7 @@ void test_cgroup_link(void) /* attach back link-based one */ links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, cgs[last_cg].fd); - if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", - PTR_ERR(links[last_cg]))) + if (!ASSERT_OK_PTR(links[last_cg], "cg_attach")) goto cleanup; ping_and_check(cg_nr, 0); @@ -249,8 +246,7 @@ cleanup: BPF_CGROUP_INET_EGRESS); for (i = 0; i < cg_nr; i++) { - if (!IS_ERR(links[i])) - bpf_link__destroy(links[i]); + bpf_link__destroy(links[i]); } test_cgroup_link__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c index 464edc1c1708..b9dc4ec655b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c @@ -60,7 +60,7 @@ static void run_cgroup_bpf_test(const char *cg_path, int out_sk) goto cleanup; link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd); - if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "cgroup_attach")) goto cleanup; run_lookup_test(&skel->bss->g_serv_port, out_sk); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c new file mode 100644 index 000000000000..ab3b9bc5e6d1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include "connect4_dropper.skel.h" + +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int run_test(int cgroup_fd, int server_fd, bool classid) +{ + struct network_helper_opts opts = { + .must_fail = true, + }; + struct connect4_dropper *skel; + int fd, err = 0; + + skel = connect4_dropper__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return -1; + + skel->links.connect_v4_dropper = + bpf_program__attach_cgroup(skel->progs.connect_v4_dropper, + cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.connect_v4_dropper, "prog_attach")) { + err = -1; + goto out; + } + + if (classid && !ASSERT_OK(join_classid(), "join_classid")) { + err = -1; + goto out; + } + + fd = connect_to_fd_opts(server_fd, &opts); + if (fd < 0) + err = -1; + else + close(fd); +out: + connect4_dropper__destroy(skel); + return err; +} + +void test_cgroup_v1v2(void) +{ + struct network_helper_opts opts = {}; + int server_fd, client_fd, cgroup_fd; + static const int port = 60123; + + /* Step 1: Check base connectivity works without any BPF. */ + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + if (!ASSERT_GE(server_fd, 0, "server_fd")) + return; + client_fd = connect_to_fd_opts(server_fd, &opts); + if (!ASSERT_GE(client_fd, 0, "client_fd")) { + close(server_fd); + return; + } + close(client_fd); + close(server_fd); + + /* Step 2: Check BPF policy prog attached to cgroups drops connectivity. */ + cgroup_fd = test__join_cgroup("/connect_dropper"); + if (!ASSERT_GE(cgroup_fd, 0, "cgroup_fd")) + return; + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + if (!ASSERT_GE(server_fd, 0, "server_fd")) { + close(cgroup_fd); + return; + } + ASSERT_OK(run_test(cgroup_fd, server_fd, false), "cgroup-v2-only"); + setup_classid_environment(); + set_classid(42); + ASSERT_OK(run_test(cgroup_fd, server_fd, true), "cgroup-v1v2"); + cleanup_classid_environment(); + close(server_fd); + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index b62a39315336..012068f33a0a 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -53,7 +53,7 @@ static void test_check_mtu_xdp_attach(void) prog = skel->progs.xdp_use_helper_basic; link = bpf_program__attach_xdp(prog, IFINDEX_LO); - if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "link_attach")) goto out; skel->links.xdp_use_helper_basic = link; diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c index 981c251453d9..3d4b2a358d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_autosize.c +++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c @@ -53,8 +53,8 @@ void test_core_autosize(void) char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; int err, fd = -1, zero = 0; int char_id, short_id, int_id, long_long_id, void_ptr_id, id; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct test_core_autosize* skel = NULL; - struct bpf_object_load_attr load_attr = {}; struct bpf_program *prog; struct bpf_map *bss_map; struct btf *btf = NULL; @@ -125,9 +125,10 @@ void test_core_autosize(void) fd = -1; /* open and load BPF program with custom BTF as the kernel BTF */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; /* disable handle_signed() for now */ prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); @@ -135,9 +136,7 @@ void test_core_autosize(void) goto cleanup; bpf_program__set_autoload(prog, false); - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(skel->obj); if (!ASSERT_OK(err, "prog_load")) goto cleanup; @@ -204,14 +203,13 @@ void test_core_autosize(void) skel = NULL; /* now re-load with handle_signed() enabled, it should fail loading */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); - if (!ASSERT_ERR(err, "bad_prog_load")) + err = test_core_autosize__load(skel); + if (!ASSERT_ERR(err, "skel_load")) goto cleanup; cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 607710826dca..4739b15b2a97 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -369,8 +369,7 @@ static int setup_type_id_case_local(struct core_reloc_test_case *test) const char *name; int i; - if (CHECK(IS_ERR(local_btf), "local_btf", "failed: %ld\n", PTR_ERR(local_btf)) || - CHECK(IS_ERR(targ_btf), "targ_btf", "failed: %ld\n", PTR_ERR(targ_btf))) { + if (!ASSERT_OK_PTR(local_btf, "local_btf") || !ASSERT_OK_PTR(targ_btf, "targ_btf")) { btf__free(local_btf); btf__free(targ_btf); return -EINVAL; @@ -817,7 +816,7 @@ static size_t roundup_page(size_t sz) void test_core_reloc(void) { const size_t mmap_sz = roundup_page(sizeof(struct data)); - struct bpf_object_load_attr load_attr = {}; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct core_reloc_test_case *test_case; const char *tp_name, *probe_name; int err, i, equal; @@ -847,10 +846,16 @@ void test_core_reloc(void) continue; } - obj = bpf_object__open_file(test_case->bpf_obj_file, NULL); - if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n", - test_case->bpf_obj_file, PTR_ERR(obj))) - continue; + if (test_case->btf_src_file) { + err = access(test_case->btf_src_file, R_OK); + if (!ASSERT_OK(err, "btf_src_file")) + goto cleanup; + } + + open_opts.btf_custom_path = test_case->btf_src_file; + obj = bpf_object__open_file(test_case->bpf_obj_file, &open_opts); + if (!ASSERT_OK_PTR(obj, "obj_open")) + goto cleanup; probe_name = "raw_tracepoint/sys_enter"; tp_name = "sys_enter"; @@ -864,17 +869,7 @@ void test_core_reloc(void) "prog '%s' not found\n", probe_name)) goto cleanup; - - if (test_case->btf_src_file) { - err = access(test_case->btf_src_file, R_OK); - if (!ASSERT_OK(err, "btf_src_file")) - goto cleanup; - } - - load_attr.obj = obj; - load_attr.log_level = 0; - load_attr.target_btf_path = test_case->btf_src_file; - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(obj); if (err) { if (!test_case->fails) ASSERT_OK(err, "obj_load"); @@ -899,8 +894,7 @@ void test_core_reloc(void) data->my_pid_tgid = my_pid_tgid; link = bpf_program__attach_raw_tracepoint(prog, tp_name); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto cleanup; /* trigger test run */ @@ -941,10 +935,8 @@ cleanup: CHECK_FAIL(munmap(mmap_data, mmap_sz)); mmap_data = NULL; } - if (!IS_ERR_OR_NULL(link)) { - bpf_link__destroy(link); - link = NULL; - } + bpf_link__destroy(link); + link = NULL; bpf_object__close(obj); } } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 109d0345a2be..91154c2ba256 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "fentry_test.skel.h" -#include "fexit_test.skel.h" +#include "fentry_test.lskel.h" +#include "fexit_test.lskel.h" void test_fentry_fexit(void) { @@ -26,7 +26,7 @@ void test_fentry_fexit(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; - prog_fd = bpf_program__fd(fexit_skel->progs.test1); + prog_fd = fexit_skel->progs.test1.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 7cb111b11995..174c89e7456e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,13 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "fentry_test.skel.h" +#include "fentry_test.lskel.h" static int fentry_test(struct fentry_test *fentry_skel) { int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; __u64 *result; err = fentry_test__attach(fentry_skel); @@ -15,11 +15,11 @@ static int fentry_test(struct fentry_test *fentry_skel) return err; /* Check that already linked program can't be attached again. */ - link = bpf_program__attach(fentry_skel->progs.test1); - if (!ASSERT_ERR_PTR(link, "fentry_attach_link")) + link_fd = fentry_test__test1__attach(fentry_skel); + if (!ASSERT_LT(link_fd, 0, "fentry_attach_link")) return -1; - prog_fd = bpf_program__fd(fentry_skel->progs.test1); + prog_fd = fentry_skel->progs.test1.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); ASSERT_OK(err, "test_run"); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index 63990842d20f..73b4c76e6b86 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -146,10 +146,8 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, close_prog: for (i = 0; i < prog_cnt; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - if (!IS_ERR_OR_NULL(obj)) - bpf_object__close(obj); + bpf_link__destroy(link[i]); + bpf_object__close(obj); bpf_object__close(tgt_obj); free(link); free(prog); @@ -231,7 +229,7 @@ static int test_second_attach(struct bpf_object *obj) return err; link = bpf_program__attach_freplace(prog, tgt_fd, tgt_name); - if (CHECK(IS_ERR(link), "second_link", "failed to attach second link prog_fd %d tgt_fd %d\n", bpf_program__fd(prog), tgt_fd)) + if (!ASSERT_OK_PTR(link, "second_link")) goto out; err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6), @@ -283,9 +281,7 @@ static void test_fmod_ret_freplace(void) opts.attach_prog_fd = pkt_fd; freplace_obj = bpf_object__open_file(freplace_name, &opts); - if (CHECK(IS_ERR_OR_NULL(freplace_obj), "freplace_obj_open", - "failed to open %s: %ld\n", freplace_name, - PTR_ERR(freplace_obj))) + if (!ASSERT_OK_PTR(freplace_obj, "freplace_obj_open")) goto out; err = bpf_object__load(freplace_obj); @@ -294,14 +290,12 @@ static void test_fmod_ret_freplace(void) prog = bpf_program__next(NULL, freplace_obj); freplace_link = bpf_program__attach_trace(prog); - if (CHECK(IS_ERR(freplace_link), "freplace_attach_trace", "failed to link\n")) + if (!ASSERT_OK_PTR(freplace_link, "freplace_attach_trace")) goto out; opts.attach_prog_fd = bpf_program__fd(prog); fmod_obj = bpf_object__open_file(fmod_ret_name, &opts); - if (CHECK(IS_ERR_OR_NULL(fmod_obj), "fmod_obj_open", - "failed to open %s: %ld\n", fmod_ret_name, - PTR_ERR(fmod_obj))) + if (!ASSERT_OK_PTR(fmod_obj, "fmod_obj_open")) goto out; err = bpf_object__load(fmod_obj); @@ -350,9 +344,7 @@ static void test_obj_load_failure_common(const char *obj_file, ); obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open %s: %ld\n", obj_file, - PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) goto close_prog; /* It should fail to load the program */ @@ -361,8 +353,7 @@ static void test_obj_load_failure_common(const char *obj_file, goto close_prog; close_prog: - if (!IS_ERR_OR_NULL(obj)) - bpf_object__close(obj); + bpf_object__close(obj); bpf_object__close(pkt_obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index ccc7e8a34ab6..4e7f4b42ea29 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -6,7 +6,7 @@ #include <time.h> #include <sys/mman.h> #include <sys/syscall.h> -#include "fexit_sleep.skel.h" +#include "fexit_sleep.lskel.h" static int do_sleep(void *skel) { @@ -58,8 +58,8 @@ void test_fexit_sleep(void) * waiting for percpu_ref_kill to confirm). The other one * will be freed quickly. */ - close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry)); - close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit)); + close(fexit_skel->progs.nanosleep_fentry.prog_fd); + close(fexit_skel->progs.nanosleep_fexit.prog_fd); fexit_sleep__detach(fexit_skel); /* kill the thread to unwind sys_nanosleep stack through the trampoline */ diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 6792e41f7f69..af3dba726701 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -1,13 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "fexit_test.skel.h" +#include "fexit_test.lskel.h" static int fexit_test(struct fexit_test *fexit_skel) { int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_link *link; + int link_fd; __u64 *result; err = fexit_test__attach(fexit_skel); @@ -15,11 +15,11 @@ static int fexit_test(struct fexit_test *fexit_skel) return err; /* Check that already linked program can't be attached again. */ - link = bpf_program__attach(fexit_skel->progs.test1); - if (!ASSERT_ERR_PTR(link, "fexit_attach_link")) + link_fd = fexit_test__test1__attach(fexit_skel); + if (!ASSERT_LT(link_fd, 0, "fexit_attach_link")) return -1; - prog_fd = bpf_program__fd(fexit_skel->progs.test1); + prog_fd = fexit_skel->progs.test1.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); ASSERT_OK(err, "test_run"); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index cd6dc80edf18..225714f71ac6 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -541,7 +541,7 @@ static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) return; link = bpf_program__attach_netns(skel->progs._dissect, net_fd); - if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_netns")) goto out_close; run_tests_skb_less(tap_fd, skel->maps.last_dissection); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 172c586b6996..3931ede5c534 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -134,9 +134,9 @@ static void test_link_create_link_create(int netns, int prog1, int prog2) /* Expect failure creating link when another link exists */ errno = 0; link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); - if (CHECK_FAIL(link2 != -1 || errno != E2BIG)) + if (CHECK_FAIL(link2 >= 0 || errno != E2BIG)) perror("bpf_prog_attach(prog2) expected E2BIG"); - if (link2 != -1) + if (link2 >= 0) close(link2); CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); @@ -159,9 +159,9 @@ static void test_prog_attach_link_create(int netns, int prog1, int prog2) /* Expect failure creating link when prog attached */ errno = 0; link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); - if (CHECK_FAIL(link != -1 || errno != EEXIST)) + if (CHECK_FAIL(link >= 0 || errno != EEXIST)) perror("bpf_link_create(prog2) expected EEXIST"); - if (link != -1) + if (link >= 0) close(link); CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); @@ -623,7 +623,7 @@ static void run_tests(int netns) } out_close: for (i = 0; i < ARRAY_SIZE(progs); i++) { - if (progs[i] != -1) + if (progs[i] >= 0) CHECK_FAIL(close(progs[i])); } } diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c new file mode 100644 index 000000000000..02a465f36d59 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "get_func_ip_test.skel.h" + +void test_get_func_ip_test(void) +{ + struct get_func_ip_test *skel = NULL; + __u32 duration = 0, retval; + int err, prog_fd; + + skel = get_func_ip_test__open(); + if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open")) + return; + + /* test6 is x86_64 specifc because of the instruction + * offset, disabling it for all other archs + */ +#ifndef __x86_64__ + bpf_program__set_autoload(skel->progs.test6, false); + bpf_program__set_autoload(skel->progs.test7, false); +#endif + + err = get_func_ip_test__load(skel); + if (!ASSERT_OK(err, "get_func_ip_test__load")) + goto cleanup; + + err = get_func_ip_test__attach(skel); + if (!ASSERT_OK(err, "get_func_ip_test__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); + + prog_fd = bpf_program__fd(skel->progs.test5); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + + ASSERT_OK(err, "test_run"); + + ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); + ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); + ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); + ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); + ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); +#ifdef __x86_64__ + ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); +#endif + +cleanup: + get_func_ip_test__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index 925722217edf..522237aa4470 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -121,12 +121,12 @@ void test_get_stack_raw_tp(void) goto close_prog; link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto close_prog; pb_opts.sample_cb = get_stack_print_output; pb = perf_buffer__new(bpf_map__fd(map), 8, &pb_opts); - if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto close_prog; /* trigger some syscall action */ @@ -141,9 +141,7 @@ void test_get_stack_raw_tp(void) } close_prog: - if (!IS_ERR_OR_NULL(link)) - bpf_link__destroy(link); - if (!IS_ERR_OR_NULL(pb)) - perf_buffer__free(pb); + bpf_link__destroy(link); + perf_buffer__free(pb); bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c index d884b2ed5bc5..8d5a6023a1bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c @@ -48,8 +48,7 @@ void test_get_stackid_cannot_attach(void) skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); - CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_no_callchain", - "should have failed\n"); + ASSERT_ERR_PTR(skel->links.oncpu, "attach_perf_event_no_callchain"); close(pmu_fd); /* add PERF_SAMPLE_CALLCHAIN, attach should succeed */ @@ -65,8 +64,7 @@ void test_get_stackid_cannot_attach(void) skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); - CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event_callchain", - "err: %ld\n", PTR_ERR(skel->links.oncpu)); + ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event_callchain"); close(pmu_fd); /* add exclude_callchain_kernel, attach should fail */ @@ -82,8 +80,7 @@ void test_get_stackid_cannot_attach(void) skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); - CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_exclude_callchain_kernel", - "should have failed\n"); + ASSERT_ERR_PTR(skel->links.oncpu, "attach_perf_event_exclude_callchain_kernel"); close(pmu_fd); cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c index 428d488830c6..4747ab18f97f 100644 --- a/tools/testing/selftests/bpf/prog_tests/hashmap.c +++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c @@ -48,8 +48,7 @@ static void test_hashmap_generic(void) struct hashmap *map; map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "hashmap__new", - "failed to create map: %ld\n", PTR_ERR(map))) + if (!ASSERT_OK_PTR(map, "hashmap__new")) return; for (i = 0; i < ELEM_CNT; i++) { @@ -267,8 +266,7 @@ static void test_hashmap_multimap(void) /* force collisions */ map = hashmap__new(collision_hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "hashmap__new", - "failed to create map: %ld\n", PTR_ERR(map))) + if (!ASSERT_OK_PTR(map, "hashmap__new")) return; /* set up multimap: @@ -339,8 +337,7 @@ static void test_hashmap_empty() /* force collisions */ map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "hashmap__new", - "failed to create map: %ld\n", PTR_ERR(map))) + if (!ASSERT_OK_PTR(map, "hashmap__new")) goto cleanup; if (CHECK(hashmap__size(map) != 0, "hashmap__size", diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index d65107919998..ddfb6bf97152 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -97,15 +97,13 @@ void test_kfree_skb(void) goto close_prog; link = bpf_program__attach_raw_tracepoint(prog, NULL); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto close_prog; link_fentry = bpf_program__attach_trace(fentry); - if (CHECK(IS_ERR(link_fentry), "attach fentry", "err %ld\n", - PTR_ERR(link_fentry))) + if (!ASSERT_OK_PTR(link_fentry, "attach fentry")) goto close_prog; link_fexit = bpf_program__attach_trace(fexit); - if (CHECK(IS_ERR(link_fexit), "attach fexit", "err %ld\n", - PTR_ERR(link_fexit))) + if (!ASSERT_OK_PTR(link_fexit, "attach fexit")) goto close_prog; perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map"); @@ -116,7 +114,7 @@ void test_kfree_skb(void) pb_opts.sample_cb = on_sample; pb_opts.ctx = &passed; pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts); - if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto close_prog; memcpy(skb.cb, &cb, sizeof(cb)); @@ -144,12 +142,9 @@ void test_kfree_skb(void) CHECK_FAIL(!test_ok[0] || !test_ok[1]); close_prog: perf_buffer__free(pb); - if (!IS_ERR_OR_NULL(link)) - bpf_link__destroy(link); - if (!IS_ERR_OR_NULL(link_fentry)) - bpf_link__destroy(link_fentry); - if (!IS_ERR_OR_NULL(link_fexit)) - bpf_link__destroy(link_fexit); + bpf_link__destroy(link); + bpf_link__destroy(link_fentry); + bpf_link__destroy(link_fexit); bpf_object__close(obj); bpf_object__close(obj2); } diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 7fc0951ee75f..9611f2bc50df 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -2,7 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include <test_progs.h> #include <network_helpers.h> -#include "kfunc_call_test.skel.h" +#include "kfunc_call_test.lskel.h" #include "kfunc_call_test_subprog.skel.h" static void test_main(void) @@ -14,13 +14,13 @@ static void test_main(void) if (!ASSERT_OK_PTR(skel, "skel")) return; - prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + prog_fd = skel->progs.kfunc_call_test1.prog_fd; err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, (__u32 *)&retval, NULL); ASSERT_OK(err, "bpf_prog_test_run(test1)"); ASSERT_EQ(retval, 12, "test1-retval"); - prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + prog_fd = skel->progs.kfunc_call_test2.prog_fd; err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, (__u32 *)&retval, NULL); ASSERT_OK(err, "bpf_prog_test_run(test2)"); @@ -44,7 +44,7 @@ static void test_subprog(void) ASSERT_OK(err, "bpf_prog_test_run(test1)"); ASSERT_EQ(retval, 10, "test1-retval"); ASSERT_NEQ(skel->data->active_res, -1, "active_res"); - ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res"); kfunc_call_test_subprog__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index b58b775d19f3..cf3acfa5a91d 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include <bpf/btf.h> #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_weak.skel.h" static int duration; @@ -81,14 +82,40 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); } +static void test_weak_syms(void) +{ + struct test_ksyms_weak *skel; + struct test_ksyms_weak__data *data; + int err; + + skel = test_ksyms_weak__open_and_load(); + if (CHECK(!skel, "test_ksyms_weak__open_and_load", "failed\n")) + return; + + err = test_ksyms_weak__attach(skel); + if (CHECK(err, "test_ksyms_weak__attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + data = skel->data; + ASSERT_EQ(data->out__existing_typed, 0, "existing typed ksym"); + ASSERT_NEQ(data->out__existing_typeless, -1, "existing typeless ksym"); + ASSERT_EQ(data->out__non_existent_typeless, 0, "nonexistent typeless ksym"); + ASSERT_EQ(data->out__non_existent_typed, 0, "nonexistent typed ksym"); + +cleanup: + test_ksyms_weak__destroy(skel); +} + void test_ksyms_btf(void) { int percpu_datasec; struct btf *btf; btf = libbpf_find_kernel_btf(); - if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", - PTR_ERR(btf))) + if (!ASSERT_OK_PTR(btf, "btf_exists")) return; percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", @@ -106,4 +133,7 @@ void test_ksyms_btf(void) if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("weak_ksyms")) + test_weak_syms(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_module.c b/tools/testing/selftests/bpf/prog_tests/ksyms_module.c index 4c232b456479..2cd5cded543f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_module.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_module.c @@ -4,7 +4,7 @@ #include <test_progs.h> #include <bpf/libbpf.h> #include <bpf/btf.h> -#include "test_ksyms_module.skel.h" +#include "test_ksyms_module.lskel.h" static int duration; diff --git a/tools/testing/selftests/bpf/prog_tests/link_pinning.c b/tools/testing/selftests/bpf/prog_tests/link_pinning.c index a743288cf384..6fc97c45f71e 100644 --- a/tools/testing/selftests/bpf/prog_tests/link_pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/link_pinning.c @@ -17,7 +17,7 @@ void test_link_pinning_subtest(struct bpf_program *prog, int err, i; link = bpf_program__attach(prog); - if (CHECK(IS_ERR(link), "link_attach", "err: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "link_attach")) goto cleanup; bss->in = 1; @@ -51,7 +51,7 @@ void test_link_pinning_subtest(struct bpf_program *prog, /* re-open link from BPFFS */ link = bpf_link__open(link_pin_path); - if (CHECK(IS_ERR(link), "link_open", "err: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "link_open")) goto cleanup; CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path2", @@ -84,8 +84,7 @@ void test_link_pinning_subtest(struct bpf_program *prog, CHECK(i == 10000, "link_attached", "got to iteration #%d\n", i); cleanup: - if (!IS_ERR(link)) - bpf_link__destroy(link); + bpf_link__destroy(link); } void test_link_pinning(void) diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c new file mode 100644 index 000000000000..beebfa9730e1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <test_progs.h> +#include "test_lookup_and_delete.skel.h" + +#define START_VALUE 1234 +#define NEW_VALUE 4321 +#define MAX_ENTRIES 2 + +static int duration; +static int nr_cpus; + +static int fill_values(int map_fd) +{ + __u64 key, value = START_VALUE; + int err; + + for (key = 1; key < MAX_ENTRIES + 1; key++) { + err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static int fill_values_percpu(int map_fd) +{ + __u64 key, value[nr_cpus]; + int i, err; + + for (i = 0; i < nr_cpus; i++) + value[i] = START_VALUE; + + for (key = 1; key < MAX_ENTRIES + 1; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_lookup_and_delete *setup_prog(enum bpf_map_type map_type, + int *map_fd) +{ + struct test_lookup_and_delete *skel; + int err; + + skel = test_lookup_and_delete__open(); + if (!ASSERT_OK_PTR(skel, "test_lookup_and_delete__open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hash_map, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.hash_map, MAX_ENTRIES); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = test_lookup_and_delete__load(skel); + if (!ASSERT_OK(err, "test_lookup_and_delete__load")) + goto cleanup; + + *map_fd = bpf_map__fd(skel->maps.hash_map); + if (!ASSERT_GE(*map_fd, 0, "bpf_map__fd")) + goto cleanup; + + return skel; + +cleanup: + test_lookup_and_delete__destroy(skel); + return NULL; +} + +/* Triggers BPF program that updates map with given key and value */ +static int trigger_tp(struct test_lookup_and_delete *skel, __u64 key, + __u64 value) +{ + int err; + + skel->bss->set_pid = getpid(); + skel->bss->set_key = key; + skel->bss->set_value = value; + + err = test_lookup_and_delete__attach(skel); + if (!ASSERT_OK(err, "test_lookup_and_delete__attach")) + return -1; + + syscall(__NR_getpgid); + + test_lookup_and_delete__detach(skel); + + return 0; +} + +static void test_lookup_and_delete_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, value; + int map_fd, err; + + /* Setup program and fill the map. */ + skel = setup_prog(BPF_MAP_TYPE_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values(map_fd); + if (!ASSERT_OK(err, "fill_values")) + goto cleanup; + + /* Lookup and delete element. */ + key = 1; + err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + /* Fetched value should match the initially set value. */ + if (CHECK(value != START_VALUE, "bpf_map_lookup_and_delete_elem", + "unexpected value=%lld\n", value)) + goto cleanup; + + /* Check that the entry is non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_percpu_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, val, value[nr_cpus]; + int map_fd, err, i; + + /* Setup program and fill the map. */ + skel = setup_prog(BPF_MAP_TYPE_PERCPU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values_percpu(map_fd); + if (!ASSERT_OK(err, "fill_values_percpu")) + goto cleanup; + + /* Lookup and delete element. */ + key = 1; + err = bpf_map_lookup_and_delete_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + for (i = 0; i < nr_cpus; i++) { + val = value[i]; + + /* Fetched value should match the initially set value. */ + if (CHECK(val != START_VALUE, "map value", + "unexpected for cpu %d: %lld\n", i, val)) + goto cleanup; + } + + /* Check that the entry is non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_lru_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, value; + int map_fd, err; + + /* Setup program and fill the LRU map. */ + skel = setup_prog(BPF_MAP_TYPE_LRU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values(map_fd); + if (!ASSERT_OK(err, "fill_values")) + goto cleanup; + + /* Insert new element at key=3, should reuse LRU element. */ + key = 3; + err = trigger_tp(skel, key, NEW_VALUE); + if (!ASSERT_OK(err, "trigger_tp")) + goto cleanup; + + /* Lookup and delete element 3. */ + err = bpf_map_lookup_and_delete_elem(map_fd, &key, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) + goto cleanup; + + /* Value should match the new value. */ + if (CHECK(value != NEW_VALUE, "bpf_map_lookup_and_delete_elem", + "unexpected value=%lld\n", value)) + goto cleanup; + + /* Check that entries 3 and 1 are non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + + key = 1; + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +static void test_lookup_and_delete_lru_percpu_hash(void) +{ + struct test_lookup_and_delete *skel; + __u64 key, val, value[nr_cpus]; + int map_fd, err, i, cpucnt = 0; + + /* Setup program and fill the LRU map. */ + skel = setup_prog(BPF_MAP_TYPE_LRU_PERCPU_HASH, &map_fd); + if (!ASSERT_OK_PTR(skel, "setup_prog")) + return; + + err = fill_values_percpu(map_fd); + if (!ASSERT_OK(err, "fill_values_percpu")) + goto cleanup; + + /* Insert new element at key=3, should reuse LRU element 1. */ + key = 3; + err = trigger_tp(skel, key, NEW_VALUE); + if (!ASSERT_OK(err, "trigger_tp")) + goto cleanup; + + /* Clean value. */ + for (i = 0; i < nr_cpus; i++) + value[i] = 0; + + /* Lookup and delete element 3. */ + err = bpf_map_lookup_and_delete_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_and_delete_elem")) { + goto cleanup; + } + + /* Check if only one CPU has set the value. */ + for (i = 0; i < nr_cpus; i++) { + val = value[i]; + if (val) { + if (CHECK(val != NEW_VALUE, "map value", + "unexpected for cpu %d: %lld\n", i, val)) + goto cleanup; + cpucnt++; + } + } + if (CHECK(cpucnt != 1, "map value", "set for %d CPUs instead of 1!\n", + cpucnt)) + goto cleanup; + + /* Check that entries 3 and 1 are non existent. */ + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + + key = 1; + err = bpf_map_lookup_elem(map_fd, &key, &value); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem")) + goto cleanup; + +cleanup: + test_lookup_and_delete__destroy(skel); +} + +void test_lookup_and_delete(void) +{ + nr_cpus = bpf_num_possible_cpus(); + + if (test__start_subtest("lookup_and_delete")) + test_lookup_and_delete_hash(); + if (test__start_subtest("lookup_and_delete_percpu")) + test_lookup_and_delete_percpu_hash(); + if (test__start_subtest("lookup_and_delete_lru")) + test_lookup_and_delete_lru_hash(); + if (test__start_subtest("lookup_and_delete_lru_percpu")) + test_lookup_and_delete_lru_percpu_hash(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c new file mode 100644 index 000000000000..59adb4715394 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. call listen() for 4 server sockets. + * 2. call connect() for 25 client sockets. + * 3. call listen() for 1 server socket. (migration target) + * 4. update a map to migrate all child sockets + * to the last server socket (migrate_map[cookie] = 4) + * 5. call shutdown() for first 4 server sockets + * and migrate the requests in the accept queue + * to the last server socket. + * 6. call listen() for the second server socket. + * 7. call shutdown() for the last server + * and migrate the requests in the accept queue + * to the second server socket. + * 8. call listen() for the last server. + * 9. call shutdown() for the second server + * and migrate the requests in the accept queue + * to the last server socket. + * 10. call accept() for the last server socket. + * + * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + */ + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "test_progs.h" +#include "test_migrate_reuseport.skel.h" +#include "network_helpers.h" + +#ifndef TCP_FASTOPEN_CONNECT +#define TCP_FASTOPEN_CONNECT 30 +#endif + +#define IFINDEX_LO 1 + +#define NR_SERVERS 5 +#define NR_CLIENTS (NR_SERVERS * 5) +#define MIGRATED_TO (NR_SERVERS - 1) + +/* fastopenq->max_qlen and sk->sk_max_ack_backlog */ +#define QLEN (NR_CLIENTS * 5) + +#define MSG "Hello World\0" +#define MSGLEN 12 + +static struct migrate_reuseport_test_case { + const char *name; + __s64 servers[NR_SERVERS]; + __s64 clients[NR_CLIENTS]; + struct sockaddr_storage addr; + socklen_t addrlen; + int family; + int state; + bool drop_ack; + bool expire_synack_timer; + bool fastopen; + struct bpf_link *link; +} test_cases[] = { + { + .name = "IPv4 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv4 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + } +}; + +static void init_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) + fds[i] = -1; +} + +static void close_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (fds[i] != -1) { + close(fds[i]); + fds[i] = -1; + } + } +} + +static int setup_fastopen(char *buf, int size, int *saved_len, bool restore) +{ + int err = 0, fd, len; + + fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR); + if (!ASSERT_NEQ(fd, -1, "open")) + return -1; + + if (restore) { + len = write(fd, buf, *saved_len); + if (!ASSERT_EQ(len, *saved_len, "write - restore")) + err = -1; + } else { + *saved_len = read(fd, buf, size); + if (!ASSERT_GE(*saved_len, 1, "read")) { + err = -1; + goto close; + } + + err = lseek(fd, 0, SEEK_SET); + if (!ASSERT_OK(err, "lseek")) + goto close; + + /* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE | + * TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD) + */ + len = write(fd, "519", 3); + if (!ASSERT_EQ(len, 3, "write - setup")) + err = -1; + } + +close: + close(fd); + + return err; +} + +static int drop_ack(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + if (test_case->family == AF_INET) + skel->bss->server_port = ((struct sockaddr_in *) + &test_case->addr)->sin_port; + else + skel->bss->server_port = ((struct sockaddr_in6 *) + &test_case->addr)->sin6_port; + + test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack, + IFINDEX_LO); + if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp")) + return -1; + + return 0; +} + +static int pass_ack(struct migrate_reuseport_test_case *test_case) +{ + int err; + + err = bpf_link__detach(test_case->link); + if (!ASSERT_OK(err, "bpf_link__detach")) + return -1; + + test_case->link = NULL; + + return 0; +} + +static int start_servers(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, prog_fd, reuseport = 1, qlen = QLEN; + + prog_fd = bpf_program__fd(skel->progs.migrate_reuseport); + + make_sockaddr(test_case->family, + test_case->family == AF_INET ? "127.0.0.1" : "::1", 0, + &test_case->addr, &test_case->addrlen); + + for (i = 0; i < NR_SERVERS; i++) { + test_case->servers[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->servers[i], -1, "socket")) + return -1; + + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_REUSEPORT, &reuseport, sizeof(reuseport)); + if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT")) + return -1; + + err = bind(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "bind")) + return -1; + + if (i == 0) { + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_ATTACH_REUSEPORT_EBPF, + &prog_fd, sizeof(prog_fd)); + if (!ASSERT_OK(err, + "setsockopt - SO_ATTACH_REUSEPORT_EBPF")) + return -1; + + err = getsockname(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + &test_case->addrlen); + if (!ASSERT_OK(err, "getsockname")) + return -1; + } + + if (test_case->fastopen) { + err = setsockopt(test_case->servers[i], + SOL_TCP, TCP_FASTOPEN, + &qlen, sizeof(qlen)); + if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN")) + return -1; + } + + /* All requests will be tied to the first four listeners */ + if (i != MIGRATED_TO) { + err = listen(test_case->servers[i], qlen); + if (!ASSERT_OK(err, "listen")) + return -1; + } + } + + return 0; +} + +static int start_clients(struct migrate_reuseport_test_case *test_case) +{ + char buf[MSGLEN] = MSG; + int i, err; + + for (i = 0; i < NR_CLIENTS; i++) { + test_case->clients[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->clients[i], -1, "socket")) + return -1; + + /* The attached XDP program drops only the final ACK, so + * clients will transition to TCP_ESTABLISHED immediately. + */ + err = settimeo(test_case->clients[i], 100); + if (!ASSERT_OK(err, "settimeo")) + return -1; + + if (test_case->fastopen) { + int fastopen = 1; + + err = setsockopt(test_case->clients[i], IPPROTO_TCP, + TCP_FASTOPEN_CONNECT, &fastopen, + sizeof(fastopen)); + if (!ASSERT_OK(err, + "setsockopt - TCP_FASTOPEN_CONNECT")) + return -1; + } + + err = connect(test_case->clients[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "connect")) + return -1; + + err = write(test_case->clients[i], buf, MSGLEN); + if (!ASSERT_EQ(err, MSGLEN, "write")) + return -1; + } + + return 0; +} + +static int update_maps(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, migrated_to = MIGRATED_TO; + int reuseport_map_fd, migrate_map_fd; + __u64 value; + + reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map); + migrate_map_fd = bpf_map__fd(skel->maps.migrate_map); + + for (i = 0; i < NR_SERVERS; i++) { + value = (__u64)test_case->servers[i]; + err = bpf_map_update_elem(reuseport_map_fd, &i, &value, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map")) + return -1; + + err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map")) + return -1; + + err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map")) + return -1; + } + + return 0; +} + +static int migrate_dance(struct migrate_reuseport_test_case *test_case) +{ + int i, err; + + /* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests + * to the last listener based on eBPF. + */ + for (i = 0; i < MIGRATED_TO; i++) { + err = shutdown(test_case->servers[i], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + } + + /* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */ + if (test_case->state == BPF_TCP_NEW_SYN_RECV) + return 0; + + /* Note that we use the second listener instead of the + * first one here. + * + * The fist listener is bind()ed with port 0 and, + * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so + * calling listen() again will bind() the first listener + * on a new ephemeral port and detach it from the existing + * reuseport group. (See: __inet_bind(), tcp_set_state()) + * + * OTOH, the second one is bind()ed with a specific port, + * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will + * resurrect the listener on the existing reuseport group. + */ + err = listen(test_case->servers[1], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate from the last listener to the second one. + * + * All listeners were detached out of the reuseport_map, + * so migration will be done by kernel random pick from here. + */ + err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + /* Back to the existing reuseport group */ + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate back to the last one from the second one */ + err = shutdown(test_case->servers[1], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + return 0; +} + +static void count_requests(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err, cnt = 0, client; + char buf[MSGLEN]; + + err = settimeo(test_case->servers[MIGRATED_TO], 4000); + if (!ASSERT_OK(err, "settimeo")) + goto out; + + for (; cnt < NR_CLIENTS; cnt++) { + client = accept(test_case->servers[MIGRATED_TO], + (struct sockaddr *)&addr, &len); + if (!ASSERT_NEQ(client, -1, "accept")) + goto out; + + memset(buf, 0, MSGLEN); + read(client, &buf, MSGLEN); + close(client); + + if (!ASSERT_STREQ(buf, MSG, "read")) + goto out; + } + +out: + ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace"); + + switch (test_case->state) { + case BPF_TCP_ESTABLISHED: + cnt = skel->bss->migrated_at_close; + break; + case BPF_TCP_SYN_RECV: + cnt = skel->bss->migrated_at_close_fastopen; + break; + case BPF_TCP_NEW_SYN_RECV: + if (test_case->expire_synack_timer) + cnt = skel->bss->migrated_at_send_synack; + else + cnt = skel->bss->migrated_at_recv_ack; + break; + default: + cnt = 0; + } + + ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog"); +} + +static void run_test(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int err, saved_len; + char buf[16]; + + skel->bss->migrated_at_close = 0; + skel->bss->migrated_at_close_fastopen = 0; + skel->bss->migrated_at_send_synack = 0; + skel->bss->migrated_at_recv_ack = 0; + + init_fds(test_case->servers, NR_SERVERS); + init_fds(test_case->clients, NR_CLIENTS); + + if (test_case->fastopen) { + memset(buf, 0, sizeof(buf)); + + err = setup_fastopen(buf, sizeof(buf), &saved_len, false); + if (!ASSERT_OK(err, "setup_fastopen - setup")) + return; + } + + err = start_servers(test_case, skel); + if (!ASSERT_OK(err, "start_servers")) + goto close_servers; + + if (test_case->drop_ack) { + /* Drop the final ACK of the 3-way handshake and stick the + * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV. + */ + err = drop_ack(test_case, skel); + if (!ASSERT_OK(err, "drop_ack")) + goto close_servers; + } + + /* Tie requests to the first four listners */ + err = start_clients(test_case); + if (!ASSERT_OK(err, "start_clients")) + goto close_clients; + + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + goto close_clients; + + err = update_maps(test_case, skel); + if (!ASSERT_OK(err, "fill_maps")) + goto close_clients; + + /* Migrate the requests in the accept queue only. + * TCP_NEW_SYN_RECV requests are not migrated at this point. + */ + err = migrate_dance(test_case); + if (!ASSERT_OK(err, "migrate_dance")) + goto close_clients; + + if (test_case->expire_synack_timer) { + /* Wait for SYN+ACK timers to expire so that + * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests. + */ + sleep(1); + } + + if (test_case->link) { + /* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */ + err = pass_ack(test_case); + if (!ASSERT_OK(err, "pass_ack")) + goto close_clients; + } + + count_requests(test_case, skel); + +close_clients: + close_fds(test_case->clients, NR_CLIENTS); + + if (test_case->link) { + err = pass_ack(test_case); + ASSERT_OK(err, "pass_ack - clean up"); + } + +close_servers: + close_fds(test_case->servers, NR_SERVERS); + + if (test_case->fastopen) { + err = setup_fastopen(buf, sizeof(buf), &saved_len, true); + ASSERT_OK(err, "setup_fastopen - restore"); + } +} + +void test_migrate_reuseport(void) +{ + struct test_migrate_reuseport *skel; + int i; + + skel = test_migrate_reuseport__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + test__start_subtest(test_cases[i].name); + run_test(&test_cases[i], skel); + } + + test_migrate_reuseport__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/netcnt.c b/tools/testing/selftests/bpf/prog_tests/netcnt.c new file mode 100644 index 000000000000..6ede48bde91b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/netcnt.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/sysinfo.h> +#include <test_progs.h> +#include "network_helpers.h" +#include "netcnt_prog.skel.h" +#include "netcnt_common.h" + +#define CG_NAME "/netcnt" + +void test_netcnt(void) +{ + union percpu_net_cnt *percpu_netcnt = NULL; + struct bpf_cgroup_storage_key key; + int map_fd, percpu_map_fd; + struct netcnt_prog *skel; + unsigned long packets; + union net_cnt netcnt; + unsigned long bytes; + int cpu, nproc; + int cg_fd = -1; + char cmd[128]; + + skel = netcnt_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "netcnt_prog__open_and_load")) + return; + + nproc = get_nprocs_conf(); + percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc); + if (!ASSERT_OK_PTR(percpu_netcnt, "malloc(percpu_netcnt)")) + goto err; + + cg_fd = test__join_cgroup(CG_NAME); + if (!ASSERT_GE(cg_fd, 0, "test__join_cgroup")) + goto err; + + skel->links.bpf_nextcnt = bpf_program__attach_cgroup(skel->progs.bpf_nextcnt, cg_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_nextcnt, + "attach_cgroup(bpf_nextcnt)")) + goto err; + + snprintf(cmd, sizeof(cmd), "%s ::1 -A -c 10000 -q > /dev/null", ping_command(AF_INET6)); + ASSERT_OK(system(cmd), cmd); + + map_fd = bpf_map__fd(skel->maps.netcnt); + if (!ASSERT_OK(bpf_map_get_next_key(map_fd, NULL, &key), "bpf_map_get_next_key")) + goto err; + + if (!ASSERT_OK(bpf_map_lookup_elem(map_fd, &key, &netcnt), "bpf_map_lookup_elem(netcnt)")) + goto err; + + percpu_map_fd = bpf_map__fd(skel->maps.percpu_netcnt); + if (!ASSERT_OK(bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0]), + "bpf_map_lookup_elem(percpu_netcnt)")) + goto err; + + /* Some packets can be still in per-cpu cache, but not more than + * MAX_PERCPU_PACKETS. + */ + packets = netcnt.packets; + bytes = netcnt.bytes; + for (cpu = 0; cpu < nproc; cpu++) { + ASSERT_LE(percpu_netcnt[cpu].packets, MAX_PERCPU_PACKETS, "MAX_PERCPU_PACKETS"); + + packets += percpu_netcnt[cpu].packets; + bytes += percpu_netcnt[cpu].bytes; + } + + /* No packets should be lost */ + ASSERT_EQ(packets, 10000, "packets"); + + /* Let's check that bytes counter matches the number of packets + * multiplied by the size of ipv6 ICMP packet. + */ + ASSERT_EQ(bytes, packets * 104, "bytes"); + +err: + if (cg_fd != -1) + close(cg_fd); + free(percpu_netcnt); + netcnt_prog__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c new file mode 100644 index 000000000000..71d8f3ba7d6b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "netns_cookie_prog.skel.h" +#include "network_helpers.h" + +#ifndef SO_NETNS_COOKIE +#define SO_NETNS_COOKIE 71 +#endif + +static int duration; + +void test_netns_cookie(void) +{ + int server_fd = -1, client_fd = -1, cgroup_fd = -1; + int err, val, ret, map, verdict; + struct netns_cookie_prog *skel; + uint64_t cookie_expected_value; + socklen_t vallen = sizeof(cookie_expected_value); + static const char send_msg[] = "message"; + + skel = netns_cookie_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + cgroup_fd = test__join_cgroup("/netns_cookie"); + if (CHECK(cgroup_fd < 0, "join_cgroup", "cgroup creation failed\n")) + goto done; + + skel->links.get_netns_cookie_sockops = bpf_program__attach_cgroup( + skel->progs.get_netns_cookie_sockops, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.get_netns_cookie_sockops, "prog_attach")) + goto done; + + verdict = bpf_program__fd(skel->progs.get_netns_cookie_sk_msg); + map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto done; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) + goto done; + + client_fd = connect_to_fd(server_fd, 0); + if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) + goto done; + + ret = send(client_fd, send_msg, sizeof(send_msg), 0); + if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret)) + goto done; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies), + &client_fd, &val); + if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)")) + goto done; + + err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE, + &cookie_expected_value, &vallen); + if (!ASSERT_OK(err, "getsockopt")) + goto done; + + ASSERT_EQ(val, cookie_expected_value, "cookie_value"); + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies), + &client_fd, &val); + if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)")) + goto done; + + ASSERT_EQ(val, cookie_expected_value, "cookie_value"); + +done: + if (server_fd != -1) + close(server_fd); + if (client_fd != -1) + close(client_fd); + if (cgroup_fd != -1) + close(cgroup_fd); + netns_cookie_prog__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/obj_name.c b/tools/testing/selftests/bpf/prog_tests/obj_name.c index e178416bddad..6194b776a28b 100644 --- a/tools/testing/selftests/bpf/prog_tests/obj_name.c +++ b/tools/testing/selftests/bpf/prog_tests/obj_name.c @@ -38,13 +38,13 @@ void test_obj_name(void) fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); CHECK((tests[i].success && fd < 0) || - (!tests[i].success && fd != -1) || + (!tests[i].success && fd >= 0) || (!tests[i].success && errno != tests[i].expected_errno), "check-bpf-prog-name", "fd %d(%d) errno %d(%d)\n", fd, tests[i].success, errno, tests[i].expected_errno); - if (fd != -1) + if (fd >= 0) close(fd); /* test different attr.map_name during BPF_MAP_CREATE */ @@ -59,13 +59,13 @@ void test_obj_name(void) memcpy(attr.map_name, tests[i].name, ncopy); fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); CHECK((tests[i].success && fd < 0) || - (!tests[i].success && fd != -1) || + (!tests[i].success && fd >= 0) || (!tests[i].success && errno != tests[i].expected_errno), "check-bpf-map-name", "fd %d(%d) errno %d(%d)\n", fd, tests[i].success, errno, tests[i].expected_errno); - if (fd != -1) + if (fd >= 0) close(fd); } } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c index e35c444902a7..12c4f45cee1a 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c @@ -74,7 +74,7 @@ static void test_perf_branches_common(int perf_fd, /* attach perf_event */ link = bpf_program__attach_perf_event(skel->progs.perf_branches, perf_fd); - if (CHECK(IS_ERR(link), "attach_perf_event", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_perf_event")) goto out_destroy_skel; /* generate some branches on cpu 0 */ @@ -119,7 +119,7 @@ static void test_perf_branches_hw(void) * Some setups don't support branch records (virtual machines, !x86), * so skip test in this case. */ - if (pfd == -1) { + if (pfd < 0) { if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n", __func__); diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index ca9f0895ec84..6490e9673002 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -80,7 +80,7 @@ void test_perf_buffer(void) pb_opts.sample_cb = on_sample; pb_opts.ctx = &cpu_seen; pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1, &pb_opts); - if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto out_close; CHECK(perf_buffer__epoll_fd(pb) < 0, "epoll_fd", diff --git a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c index 72c3690844fb..33144c9432ae 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c @@ -97,8 +97,7 @@ void test_perf_event_stackmap(void) skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); - if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event", - "err %ld\n", PTR_ERR(skel->links.oncpu))) { + if (!ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event")) { close(pmu_fd); goto cleanup; } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_link.c b/tools/testing/selftests/bpf/prog_tests/perf_link.c new file mode 100644 index 000000000000..b1abd0c46607 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_link.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <test_progs.h> +#include "test_perf_link.skel.h" + +static void burn_cpu(void) +{ + volatile int j = 0; + cpu_set_t cpu_set; + int i, err; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + ASSERT_OK(err, "set_thread_affinity"); + + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; +} + +void test_perf_link(void) +{ + struct test_perf_link *skel = NULL; + struct perf_event_attr attr; + int pfd = -1, link_fd = -1, err; + int run_cnt_before, run_cnt_after; + struct bpf_link_info info; + __u32 info_len = sizeof(info); + + /* create perf event */ + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; + + skel = test_perf_link__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + link_fd = bpf_link_create(bpf_program__fd(skel->progs.handler), pfd, + BPF_PERF_EVENT, NULL); + if (!ASSERT_GE(link_fd, 0, "link_fd")) + goto cleanup; + + memset(&info, 0, sizeof(info)); + err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len); + if (!ASSERT_OK(err, "link_get_info")) + goto cleanup; + + ASSERT_EQ(info.type, BPF_LINK_TYPE_PERF_EVENT, "link_type"); + ASSERT_GT(info.id, 0, "link_id"); + ASSERT_GT(info.prog_id, 0, "link_prog_id"); + + /* ensure we get at least one perf_event prog execution */ + burn_cpu(); + ASSERT_GT(skel->bss->run_cnt, 0, "run_cnt"); + + /* perf_event is still active, but we close link and BPF program + * shouldn't be executed anymore + */ + close(link_fd); + link_fd = -1; + + /* make sure there are no stragglers */ + kern_sync_rcu(); + + run_cnt_before = skel->bss->run_cnt; + burn_cpu(); + run_cnt_after = skel->bss->run_cnt; + + ASSERT_EQ(run_cnt_before, run_cnt_after, "run_cnt_before_after"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + if (pfd >= 0) + close(pfd); + test_perf_link__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index fcf54b3a1dd0..d4b953ae3407 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -125,6 +125,10 @@ void test_pinning(void) if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno)) goto out; + /* get pinning path */ + if (!ASSERT_STREQ(bpf_map__pin_path(map), pinpath, "get pin path")) + goto out; + /* set pinning path of other map and re-pin all */ map = bpf_object__find_map_by_name(obj, "nopinmap"); if (CHECK(!map, "find map", "NULL map")) @@ -134,6 +138,11 @@ void test_pinning(void) if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno)) goto out; + /* get pinning path after set */ + if (!ASSERT_STREQ(bpf_map__pin_path(map), custpinpath, + "get pin path after set")) + goto out; + /* should only pin the one unpinned map */ err = bpf_object__pin_maps(obj, NULL); if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno)) diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c index 7aecfd9e87d1..95bd12097358 100644 --- a/tools/testing/selftests/bpf/prog_tests/probe_user.c +++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c @@ -15,7 +15,7 @@ void test_probe_user(void) static const int zero = 0; obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open_file")) return; kprobe_prog = bpf_object__find_program_by_title(obj, prog_name); @@ -33,11 +33,8 @@ void test_probe_user(void) goto cleanup; kprobe_link = bpf_program__attach(kprobe_prog); - if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", - "err %ld\n", PTR_ERR(kprobe_link))) { - kprobe_link = NULL; + if (!ASSERT_OK_PTR(kprobe_link, "attach_kprobe")) goto cleanup; - } memset(&curr, 0, sizeof(curr)); in->sin_family = AF_INET; diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 131d7f7eeb42..89fc98faf19e 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -46,7 +46,7 @@ void test_prog_run_xattr(void) tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access); err = bpf_prog_test_run_xattr(&tattr); - CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run", + CHECK_ATTR(err >= 0 || errno != ENOSPC || tattr.retval, "run", "err %d errno %d retval %d\n", err, errno, tattr.retval); CHECK_ATTR(tattr.data_size_out != sizeof(pkt_v4), "data_size_out", @@ -78,6 +78,6 @@ void test_prog_run_xattr(void) cleanup: if (skel) test_pkt_access__destroy(skel); - if (stats_fd != -1) + if (stats_fd >= 0) close(stats_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c index c5fb191874ac..41720a62c4fa 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c @@ -77,7 +77,7 @@ void test_raw_tp_test_run(void) /* invalid cpu ID should fail with ENXIO */ opts.cpu = 0xffffffff; err = bpf_prog_test_run_opts(prog_fd, &opts); - CHECK(err != -1 || errno != ENXIO, + CHECK(err >= 0 || errno != ENXIO, "test_run_opts_fail", "should failed with ENXIO\n"); @@ -85,7 +85,7 @@ void test_raw_tp_test_run(void) opts.cpu = 1; opts.flags = 0; err = bpf_prog_test_run_opts(prog_fd, &opts); - CHECK(err != -1 || errno != EINVAL, + CHECK(err >= 0 || errno != EINVAL, "test_run_opts_fail", "should failed with EINVAL\n"); diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c index 563e12120e77..5f9eaa3ab584 100644 --- a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c +++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c @@ -30,7 +30,7 @@ void test_rdonly_maps(void) struct bss bss; obj = bpf_object__open_file(file, NULL); - if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) return; err = bpf_object__load(obj); @@ -58,11 +58,8 @@ void test_rdonly_maps(void) goto cleanup; link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); - if (CHECK(IS_ERR(link), "attach_prog", "prog '%s', err %ld\n", - t->prog_name, PTR_ERR(link))) { - link = NULL; + if (!ASSERT_OK_PTR(link, "attach_prog")) goto cleanup; - } /* trigger probe */ usleep(1); diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c index ac1ee10cffd8..4e91f4d6466c 100644 --- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c +++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c @@ -15,7 +15,7 @@ void test_reference_tracking(void) int err = 0; obj = bpf_object__open_file(file, &open_opts); - if (CHECK_FAIL(IS_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open_file")) return; if (CHECK(strcmp(bpf_object__name(obj), obj_name), "obj_name", @@ -34,8 +34,8 @@ void test_reference_tracking(void) if (!test__start_subtest(title)) continue; - /* Expect verifier failure if test name has 'fail' */ - if (strstr(title, "fail") != NULL) { + /* Expect verifier failure if test name has 'err' */ + if (strstr(title, "err_") != NULL) { libbpf_print_fn_t old_print_fn; old_print_fn = libbpf_set_print(NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index d3c2de2c24d1..f62361306f6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -76,7 +76,7 @@ __resolve_symbol(struct btf *btf, int type_id) } for (i = 0; i < ARRAY_SIZE(test_symbols); i++) { - if (test_symbols[i].id != -1) + if (test_symbols[i].id >= 0) continue; if (BTF_INFO_KIND(type->info) != test_symbols[i].type) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index f9a8ae331963..4706cee84360 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -12,7 +12,7 @@ #include <sys/sysinfo.h> #include <linux/perf_event.h> #include <linux/ring_buffer.h> -#include "test_ringbuf.skel.h" +#include "test_ringbuf.lskel.h" #define EDONE 7777 @@ -94,15 +94,13 @@ void test_ringbuf(void) if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; - err = bpf_map__set_max_entries(skel->maps.ringbuf, page_size); - if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) - goto cleanup; + skel->maps.ringbuf.max_entries = page_size; err = test_ringbuf__load(skel); if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) goto cleanup; - rb_fd = bpf_map__fd(skel->maps.ringbuf); + rb_fd = skel->maps.ringbuf.map_fd; /* good read/write cons_pos */ mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos"); @@ -151,7 +149,7 @@ void test_ringbuf(void) /* only trigger BPF program for current process */ skel->bss->pid = getpid(); - ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, process_sample, NULL, NULL); if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index cef63e703924..167cd8a2edfd 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -63,7 +63,7 @@ void test_ringbuf_multi(void) goto cleanup; proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); - if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n")) + if (CHECK(proto_fd < 0, "bpf_create_map", "bpf_create_map failed\n")) goto cleanup; err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 821b4146b7b6..4efd337d6a3c 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -78,7 +78,7 @@ static int create_maps(enum bpf_map_type inner_type) attr.max_entries = REUSEPORT_ARRAY_SIZE; reuseport_array = bpf_create_map_xattr(&attr); - RET_ERR(reuseport_array == -1, "creating reuseport_array", + RET_ERR(reuseport_array < 0, "creating reuseport_array", "reuseport_array:%d errno:%d\n", reuseport_array, errno); /* Creating outer_map */ @@ -89,7 +89,7 @@ static int create_maps(enum bpf_map_type inner_type) attr.max_entries = 1; attr.inner_map_fd = reuseport_array; outer_map = bpf_create_map_xattr(&attr); - RET_ERR(outer_map == -1, "creating outer_map", + RET_ERR(outer_map < 0, "creating outer_map", "outer_map:%d errno:%d\n", outer_map, errno); return 0; @@ -102,8 +102,9 @@ static int prepare_bpf_obj(void) int err; obj = bpf_object__open("test_select_reuseport_kern.o"); - RET_ERR(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", - "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); + err = libbpf_get_error(obj); + RET_ERR(err, "open test_select_reuseport_kern.o", + "obj:%p PTR_ERR(obj):%d\n", obj, err); map = bpf_object__find_map_by_name(obj, "outer_map"); RET_ERR(!map, "find outer_map", "!map\n"); @@ -116,31 +117,31 @@ static int prepare_bpf_obj(void) prog = bpf_program__next(NULL, obj); RET_ERR(!prog, "get first bpf_program", "!prog\n"); select_by_skb_data_prog = bpf_program__fd(prog); - RET_ERR(select_by_skb_data_prog == -1, "get prog fd", + RET_ERR(select_by_skb_data_prog < 0, "get prog fd", "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); map = bpf_object__find_map_by_name(obj, "result_map"); RET_ERR(!map, "find result_map", "!map\n"); result_map = bpf_map__fd(map); - RET_ERR(result_map == -1, "get result_map fd", + RET_ERR(result_map < 0, "get result_map fd", "result_map:%d\n", result_map); map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); RET_ERR(!map, "find tmp_index_ovr_map\n", "!map"); tmp_index_ovr_map = bpf_map__fd(map); - RET_ERR(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", + RET_ERR(tmp_index_ovr_map < 0, "get tmp_index_ovr_map fd", "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); map = bpf_object__find_map_by_name(obj, "linum_map"); RET_ERR(!map, "find linum_map", "!map\n"); linum_map = bpf_map__fd(map); - RET_ERR(linum_map == -1, "get linum_map fd", + RET_ERR(linum_map < 0, "get linum_map fd", "linum_map:%d\n", linum_map); map = bpf_object__find_map_by_name(obj, "data_check_map"); RET_ERR(!map, "find data_check_map", "!map\n"); data_check_map = bpf_map__fd(map); - RET_ERR(data_check_map == -1, "get data_check_map fd", + RET_ERR(data_check_map < 0, "get data_check_map fd", "data_check_map:%d\n", data_check_map); return 0; @@ -237,7 +238,7 @@ static long get_linum(void) int err; err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); - RET_ERR(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", + RET_ERR(err < 0, "lookup_elem(linum_map)", "err:%d errno:%d\n", err, errno); return linum; @@ -254,11 +255,11 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, addrlen = sizeof(cli_sa); err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, &addrlen); - RET_IF(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", + RET_IF(err < 0, "getsockname(cli_fd)", "err:%d errno:%d\n", err, errno); err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); - RET_IF(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", + RET_IF(err < 0, "lookup_elem(data_check_map)", "err:%d errno:%d\n", err, errno); if (type == SOCK_STREAM) { @@ -347,7 +348,7 @@ static void check_results(void) for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &results[i]); - RET_IF(err == -1, "lookup_elem(result_map)", + RET_IF(err < 0, "lookup_elem(result_map)", "i:%u err:%d errno:%d\n", i, err, errno); } @@ -524,12 +525,12 @@ static void test_syncookie(int type, sa_family_t family) */ err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &tmp_index, BPF_ANY); - RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", + RET_IF(err < 0, "update_elem(tmp_index_ovr_map, 0, 1)", "err:%d errno:%d\n", err, errno); do_test(type, family, &cmd, PASS); err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, &tmp_index); - RET_IF(err == -1 || tmp_index != -1, + RET_IF(err < 0 || tmp_index >= 0, "lookup_elem(tmp_index_ovr_map)", "err:%d errno:%d tmp_index:%d\n", err, errno, tmp_index); @@ -569,7 +570,7 @@ static void test_detach_bpf(int type, sa_family_t family) for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - RET_IF(err == -1, "lookup_elem(result_map)", + RET_IF(err < 0, "lookup_elem(result_map)", "i:%u err:%d errno:%d\n", i, err, errno); nr_run_before += tmp; } @@ -584,7 +585,7 @@ static void test_detach_bpf(int type, sa_family_t family) for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - RET_IF(err == -1, "lookup_elem(result_map)", + RET_IF(err < 0, "lookup_elem(result_map)", "i:%u err:%d errno:%d\n", i, err, errno); nr_run_after += tmp; } @@ -632,24 +633,24 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) SO_ATTACH_REUSEPORT_EBPF, &select_by_skb_data_prog, sizeof(select_by_skb_data_prog)); - RET_IF(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", + RET_IF(err < 0, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", "err:%d errno:%d\n", err, errno); } err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); - RET_IF(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", + RET_IF(err < 0, "bind()", "sk_fds[%d] err:%d errno:%d\n", i, err, errno); if (type == SOCK_STREAM) { err = listen(sk_fds[i], 10); - RET_IF(err == -1, "listen()", + RET_IF(err < 0, "listen()", "sk_fds[%d] err:%d errno:%d\n", i, err, errno); } err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], BPF_NOEXIST); - RET_IF(err == -1, "update_elem(reuseport_array)", + RET_IF(err < 0, "update_elem(reuseport_array)", "sk_fds[%d] err:%d errno:%d\n", i, err, errno); if (i == first) { @@ -682,7 +683,7 @@ static void setup_per_test(int type, sa_family_t family, bool inany, prepare_sk_fds(type, family, inany); err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, BPF_ANY); - RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", + RET_IF(err < 0, "update_elem(tmp_index_ovr_map, 0, -1)", "err:%d errno:%d\n", err, errno); /* Install reuseport_array to outer_map? */ @@ -691,7 +692,7 @@ static void setup_per_test(int type, sa_family_t family, bool inany, err = bpf_map_update_elem(outer_map, &index_zero, &reuseport_array, BPF_ANY); - RET_IF(err == -1, "update_elem(outer_map, 0, reuseport_array)", + RET_IF(err < 0, "update_elem(outer_map, 0, reuseport_array)", "err:%d errno:%d\n", err, errno); } @@ -720,18 +721,18 @@ static void cleanup_per_test(bool no_inner_map) return; err = bpf_map_delete_elem(outer_map, &index_zero); - RET_IF(err == -1, "delete_elem(outer_map)", + RET_IF(err < 0, "delete_elem(outer_map)", "err:%d errno:%d\n", err, errno); } static void cleanup(void) { - if (outer_map != -1) { + if (outer_map >= 0) { close(outer_map); outer_map = -1; } - if (reuseport_array != -1) { + if (reuseport_array >= 0) { close(reuseport_array); reuseport_array = -1; } diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 7043e6ded0e6..776916b61c40 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <sys/time.h> +#include <sys/resource.h> #include "test_send_signal_kern.skel.h" -static volatile int sigusr1_received = 0; +int sigusr1_received = 0; static void sigusr1_handler(int signum) { @@ -10,29 +12,25 @@ static void sigusr1_handler(int signum) } static void test_send_signal_common(struct perf_event_attr *attr, - bool signal_thread, - const char *test_name) + bool signal_thread) { struct test_send_signal_kern *skel; int pipe_c2p[2], pipe_p2c[2]; int err = -1, pmu_fd = -1; - __u32 duration = 0; char buf[256]; pid_t pid; - if (CHECK(pipe(pipe_c2p), test_name, - "pipe pipe_c2p error: %s\n", strerror(errno))) + if (!ASSERT_OK(pipe(pipe_c2p), "pipe_c2p")) return; - if (CHECK(pipe(pipe_p2c), test_name, - "pipe pipe_p2c error: %s\n", strerror(errno))) { + if (!ASSERT_OK(pipe(pipe_p2c), "pipe_p2c")) { close(pipe_c2p[0]); close(pipe_c2p[1]); return; } pid = fork(); - if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) { + if (!ASSERT_GE(pid, 0, "fork")) { close(pipe_c2p[0]); close(pipe_c2p[1]); close(pipe_p2c[0]); @@ -41,26 +39,40 @@ static void test_send_signal_common(struct perf_event_attr *attr, } if (pid == 0) { + int old_prio; + /* install signal handler and notify parent */ signal(SIGUSR1, sigusr1_handler); close(pipe_c2p[0]); /* close read */ close(pipe_p2c[1]); /* close write */ + /* boost with a high priority so we got a higher chance + * that if an interrupt happens, the underlying task + * is this process. + */ + errno = 0; + old_prio = getpriority(PRIO_PROCESS, 0); + ASSERT_OK(errno, "getpriority"); + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); + /* notify parent signal handler is installed */ - CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); /* make sure parent enabled bpf program to send_signal */ - CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); /* wait a little for signal handler */ sleep(1); buf[0] = sigusr1_received ? '2' : '0'; - CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); /* wait for parent notification and exit */ - CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); + + /* restore the old priority */ + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); close(pipe_c2p[1]); close(pipe_p2c[0]); @@ -71,33 +83,31 @@ static void test_send_signal_common(struct perf_event_attr *attr, close(pipe_p2c[0]); /* close read */ skel = test_send_signal_kern__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n")) + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) goto skel_open_load_failure; if (!attr) { err = test_send_signal_kern__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed\n")) { + if (!ASSERT_OK(err, "skel_attach")) { err = -1; goto destroy_skel; } } else { pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1, -1 /* group id */, 0 /* flags */); - if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n", - strerror(errno))) { + if (!ASSERT_GE(pmu_fd, 0, "perf_event_open")) { err = -1; goto destroy_skel; } skel->links.send_signal_perf = bpf_program__attach_perf_event(skel->progs.send_signal_perf, pmu_fd); - if (CHECK(IS_ERR(skel->links.send_signal_perf), "attach_perf_event", - "err %ld\n", PTR_ERR(skel->links.send_signal_perf))) + if (!ASSERT_OK_PTR(skel->links.send_signal_perf, "attach_perf_event")) goto disable_pmu; } /* wait until child signal handler installed */ - CHECK(read(pipe_c2p[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read"); /* trigger the bpf send_signal */ skel->bss->pid = pid; @@ -105,21 +115,21 @@ static void test_send_signal_common(struct perf_event_attr *attr, skel->bss->signal_thread = signal_thread; /* notify child that bpf program can send_signal now */ - CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); /* wait for result */ err = read(pipe_c2p[0], buf, 1); - if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno))) + if (!ASSERT_GE(err, 0, "reading pipe")) goto disable_pmu; - if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) { + if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) { err = -1; goto disable_pmu; } - CHECK(buf[0] != '2', test_name, "incorrect result\n"); + ASSERT_EQ(buf[0], '2', "incorrect result"); /* notify child safe to exit */ - CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); disable_pmu: close(pmu_fd); @@ -133,7 +143,7 @@ skel_open_load_failure: static void test_send_signal_tracepoint(bool signal_thread) { - test_send_signal_common(NULL, signal_thread, "tracepoint"); + test_send_signal_common(NULL, signal_thread); } static void test_send_signal_perf(bool signal_thread) @@ -144,7 +154,7 @@ static void test_send_signal_perf(bool signal_thread) .config = PERF_COUNT_SW_CPU_CLOCK, }; - test_send_signal_common(&attr, signal_thread, "perf_sw_event"); + test_send_signal_common(&attr, signal_thread); } static void test_send_signal_nmi(bool signal_thread) @@ -173,7 +183,7 @@ static void test_send_signal_nmi(bool signal_thread) close(pmu_fd); } - test_send_signal_common(&attr, signal_thread, "perf_hw_event"); + test_send_signal_common(&attr, signal_thread); } void test_send_signal(void) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 45c82db3c58c..aee41547e7f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -480,7 +480,7 @@ static struct bpf_link *attach_lookup_prog(struct bpf_program *prog) } link = bpf_program__attach_netns(prog, net_fd); - if (CHECK(IS_ERR(link), "bpf_program__attach_netns", "failed\n")) { + if (!ASSERT_OK_PTR(link, "bpf_program__attach_netns")) { errno = -PTR_ERR(link); log_err("failed to attach program '%s' to netns", bpf_program__name(prog)); diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index fe87b77af459..f6f130c99b8c 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -82,10 +82,8 @@ void test_skeleton(void) CHECK(data->out2 != 2, "res2", "got %lld != exp %d\n", data->out2, 2); CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3); CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4); - CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n", - bss->handler_out5.a, 5); - CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", - bss->handler_out5.b, 6); + CHECK(bss->out5.a != 5, "res5", "got %d != exp %d\n", bss->out5.a, 5); + CHECK(bss->out5.b != 6, "res6", "got %lld != exp %d\n", bss->out5.b, 6); CHECK(bss->out6 != 14, "res7", "got %d != exp %d\n", bss->out6, 14); CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1", diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c index dffbcaa1ec98..8fd1b4b29a0e 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -19,7 +19,7 @@ #define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " #define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") -#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_OUT "str1 a b c d e longstr" #define EXP_STR_RET sizeof(EXP_STR_OUT) #define EXP_OVER_OUT "%over" @@ -114,6 +114,8 @@ void test_snprintf_negative(void) ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("%lc"), "invalid specifier 6"); + ASSERT_ERR(load_single_snprintf("%llc"), "invalid specifier 7"); ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); } diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c index af87118e748e..577d619fb07e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c @@ -97,12 +97,12 @@ static void check_result(void) err = bpf_map_lookup_elem(linum_map_fd, &egress_linum_idx, &egress_linum); - CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)", + CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)", "err:%d errno:%d\n", err, errno); err = bpf_map_lookup_elem(linum_map_fd, &ingress_linum_idx, &ingress_linum); - CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)", + CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)", "err:%d errno:%d\n", err, errno); memcpy(&srv_sk, &skel->bss->srv_sk, sizeof(srv_sk)); @@ -355,14 +355,12 @@ void test_sock_fields(void) egress_link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields, child_cg_fd); - if (CHECK(IS_ERR(egress_link), "attach_cgroup(egress)", "err:%ld\n", - PTR_ERR(egress_link))) + if (!ASSERT_OK_PTR(egress_link, "attach_cgroup(egress)")) goto done; ingress_link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields, child_cg_fd); - if (CHECK(IS_ERR(ingress_link), "attach_cgroup(ingress)", "err:%ld\n", - PTR_ERR(ingress_link))) + if (!ASSERT_OK_PTR(ingress_link, "attach_cgroup(ingress)")) goto done; linum_map_fd = bpf_map__fd(skel->maps.linum_map); @@ -375,8 +373,8 @@ done: bpf_link__destroy(egress_link); bpf_link__destroy(ingress_link); test_sock_fields__destroy(skel); - if (child_cg_fd != -1) + if (child_cg_fd >= 0) close(child_cg_fd); - if (parent_cg_fd != -1) + if (parent_cg_fd >= 0) close(parent_cg_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index ab77596b64e3..1352ec104149 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -88,11 +88,11 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type) int s, map, err; s = connected_socket_v4(); - if (CHECK_FAIL(s == -1)) + if (CHECK_FAIL(s < 0)) return; map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); - if (CHECK_FAIL(map == -1)) { + if (CHECK_FAIL(map < 0)) { perror("bpf_create_map"); goto out; } @@ -245,7 +245,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type) opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(skel->progs.copy, &opts); - if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); @@ -304,7 +304,7 @@ static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, } err = bpf_prog_attach(verdict, map, second, 0); - assert(err == -1 && errno == EBUSY); + ASSERT_EQ(err, -EBUSY, "prog_attach_fail"); err = bpf_prog_detach2(verdict, map, first); if (CHECK_FAIL(err)) { diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index 06b86addc181..7a0d64fdc192 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -98,7 +98,7 @@ static void run_tests(int family, enum bpf_map_type map_type) int map; map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); - if (CHECK_FAIL(map == -1)) { + if (CHECK_FAIL(map < 0)) { perror("bpf_map_create"); return; } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 648d9ae898d2..d88bb65b74cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -139,7 +139,7 @@ #define xbpf_map_delete_elem(fd, key) \ ({ \ int __ret = bpf_map_delete_elem((fd), (key)); \ - if (__ret == -1) \ + if (__ret < 0) \ FAIL_ERRNO("map_delete"); \ __ret; \ }) @@ -147,7 +147,7 @@ #define xbpf_map_lookup_elem(fd, key, val) \ ({ \ int __ret = bpf_map_lookup_elem((fd), (key), (val)); \ - if (__ret == -1) \ + if (__ret < 0) \ FAIL_ERRNO("map_lookup"); \ __ret; \ }) @@ -155,7 +155,7 @@ #define xbpf_map_update_elem(fd, key, val, flags) \ ({ \ int __ret = bpf_map_update_elem((fd), (key), (val), (flags)); \ - if (__ret == -1) \ + if (__ret < 0) \ FAIL_ERRNO("map_update"); \ __ret; \ }) @@ -164,7 +164,7 @@ ({ \ int __ret = \ bpf_prog_attach((prog), (target), (type), (flags)); \ - if (__ret == -1) \ + if (__ret < 0) \ FAIL_ERRNO("prog_attach(" #type ")"); \ __ret; \ }) @@ -172,7 +172,7 @@ #define xbpf_prog_detach2(prog, target, type) \ ({ \ int __ret = bpf_prog_detach2((prog), (target), (type)); \ - if (__ret == -1) \ + if (__ret < 0) \ FAIL_ERRNO("prog_detach2(" #type ")"); \ __ret; \ }) @@ -351,9 +351,11 @@ static void test_insert_opened(int family, int sotype, int mapfd) errno = 0; value = s; err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); - if (!err || errno != EOPNOTSUPP) - FAIL_ERRNO("map_update: expected EOPNOTSUPP"); - + if (sotype == SOCK_STREAM) { + if (!err || errno != EOPNOTSUPP) + FAIL_ERRNO("map_update: expected EOPNOTSUPP"); + } else if (err) + FAIL_ERRNO("map_update: expected success"); xclose(s); } @@ -919,6 +921,23 @@ static const char *redir_mode_str(enum redir_mode mode) } } +static int add_to_sockmap(int sock_mapfd, int fd1, int fd2) +{ + u64 value; + u32 key; + int err; + + key = 0; + value = fd1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + return err; + + key = 1; + value = fd2; + return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); +} + static void redir_to_connected(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -928,7 +947,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, unsigned int pass; socklen_t len; int err, n; - u64 value; u32 key; char b; @@ -965,15 +983,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, if (p1 < 0) goto close_cli1; - key = 0; - value = p0; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); - if (err) - goto close_peer1; - - key = 1; - value = p1; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_peer1; @@ -991,12 +1001,11 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, goto close_peer1; if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); - - n = read(c0, &b, 1); + n = recv_timeout(c0, &b, 1, 0, IO_TIMEOUT_SEC); if (n < 0) - FAIL_ERRNO("%s: read", log_prefix); + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close_peer1: xclose(p1); @@ -1061,7 +1070,6 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd, int s, c, p, err, n; unsigned int drop; socklen_t len; - u64 value; u32 key; zero_verdict_count(verd_mapfd); @@ -1086,15 +1094,7 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd, if (p < 0) goto close_cli; - key = 0; - value = s; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); - if (err) - goto close_peer; - - key = 1; - value = p; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_mapfd, s, p); if (err) goto close_peer; @@ -1346,7 +1346,6 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, int s1, s2, c, err; unsigned int drop; socklen_t len; - u64 value; u32 key; zero_verdict_count(verd_map); @@ -1360,16 +1359,10 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, if (s2 < 0) goto close_srv1; - key = 0; - value = s1; - err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_map, s1, s2); if (err) goto close_srv2; - key = 1; - value = s2; - err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); - /* Connect to s2, reuseport BPF selects s1 via sock_map[0] */ len = sizeof(addr); err = xgetsockname(s2, sockaddr(&addr), &len); @@ -1441,6 +1434,8 @@ static const char *family_str(sa_family_t family) return "IPv4"; case AF_INET6: return "IPv6"; + case AF_UNIX: + return "Unix"; default: return "unknown"; } @@ -1563,6 +1558,94 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, } } +static void unix_redir_to_connected(int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd)) + return; + c0 = sfd[0], p0 = sfd[1]; + + if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd)) + goto close0; + c1 = sfd[0], p1 = sfd[1]; + + err = add_to_sockmap(sock_mapfd, p0, p1); + if (err) + goto close; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); + if (n == 0) + FAIL("%s: incomplete recv", log_prefix); + +close: + xclose(c1); + xclose(p1); +close0: + xclose(c0); + xclose(p0); +} + +static void unix_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_EGRESS); + skel->bss->test_ingress = true; + unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int sotype) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(AF_UNIX); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + unix_skb_redir_to_connected(skel, map, sotype); +} + static void test_reuseport(struct test_sockmap_listen *skel, struct bpf_map *map, int family, int sotype) { @@ -1603,32 +1686,27 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } -static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, - int verd_mapfd, enum redir_mode mode) +static int inet_socketpair(int family, int type, int *s, int *c) { - const char *log_prefix = redir_mode_str(mode); struct sockaddr_storage addr; - int c0, c1, p0, p1; - unsigned int pass; socklen_t len; - int err, n; - u64 value; - u32 key; - char b; - - zero_verdict_count(verd_mapfd); + int p0, c0; + int err; - p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); + p0 = socket_loopback(family, type | SOCK_NONBLOCK); if (p0 < 0) - return; + return p0; + len = sizeof(addr); err = xgetsockname(p0, sockaddr(&addr), &len); if (err) goto close_peer0; - c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); - if (c0 < 0) + c0 = xsocket(family, type | SOCK_NONBLOCK, 0); + if (c0 < 0) { + err = c0; goto close_peer0; + } err = xconnect(c0, sockaddr(&addr), len); if (err) goto close_cli0; @@ -1639,35 +1717,125 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, if (err) goto close_cli0; - p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (p1 < 0) - goto close_cli0; - err = xgetsockname(p1, sockaddr(&addr), &len); + *s = p0; + *c = c0; + return 0; + +close_cli0: + xclose(c0); +close_peer0: + xclose(p0); + return err; +} + +static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, + enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int err, n; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); + if (err) + return; + err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); if (err) goto close_cli0; - c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); - if (c1 < 0) - goto close_peer1; - err = xconnect(c1, sockaddr(&addr), len); + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_cli1; - err = xgetsockname(c1, sockaddr(&addr), &len); - if (err) + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) goto close_cli1; - err = xconnect(p1, sockaddr(&addr), len); + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); if (err) goto close_cli1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); - key = 0; - value = p0; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); + if (n == 0) + FAIL("%s: incomplete recv", log_prefix); + +close_cli1: + xclose(c1); + xclose(p1); +close_cli0: + xclose(c0); + xclose(p0); +} + +static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); if (err) - goto close_cli1; + return; - key = 1; - value = p1; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + skel->bss->test_ingress = false; + udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + udp_skb_redir_to_connected(skel, map, family); +} + +static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + return; + c0 = sfd[0], p0 = sfd[1]; + + err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); + if (err) + goto close; + + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_cli1; @@ -1686,24 +1854,103 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); - n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); if (n < 0) - FAIL_ERRNO("%s: read", log_prefix); + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close_cli1: xclose(c1); -close_peer1: + xclose(p1); +close: + xclose(c0); + xclose(p0); +} + +static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); + inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_EGRESS); + skel->bss->test_ingress = true; + inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); + inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); + if (err) + return; + + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + goto close_cli0; + c1 = sfd[0], p1 = sfd[1]; + + err = add_to_sockmap(sock_mapfd, p0, p1); + if (err) + goto close; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); + if (n == 0) + FAIL("%s: incomplete recv", log_prefix); + +close: + xclose(c1); xclose(p1); close_cli0: xclose(c0); -close_peer0: xclose(p0); + } -static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, - struct bpf_map *inner_map, int family) +static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) { int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); int verdict_map = bpf_map__fd(skel->maps.verdict_map); @@ -1715,17 +1962,21 @@ static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, return; skel->bss->test_ingress = false; - udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, - REDIR_EGRESS); + unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_EGRESS); skel->bss->test_ingress = true; - udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, - REDIR_INGRESS); + unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } -static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, - int family) +static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) { const char *family_name, *map_name; char s[MAX_TEST_NAME]; @@ -1735,7 +1986,8 @@ static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; - udp_skb_redir_to_connected(skel, map, family); + inet_unix_skb_redir_to_connected(skel, map, family); + unix_inet_skb_redir_to_connected(skel, map, family); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, @@ -1747,6 +1999,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, test_reuseport(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_DGRAM); test_udp_redir(skel, map, family); + test_udp_unix_redir(skel, map, family); } void test_sockmap_listen(void) @@ -1762,10 +2015,14 @@ void test_sockmap_listen(void) skel->bss->test_sockmap = true; run_tests(skel, skel->maps.sock_map, AF_INET); run_tests(skel, skel->maps.sock_map, AF_INET6); + test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM); + test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM); skel->bss->test_sockmap = false; run_tests(skel, skel->maps.sock_hash, AF_INET); run_tests(skel, skel->maps.sock_hash, AF_INET6); + test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM); + test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM); test_sockmap_listen__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index ec281b0363b8..86f97681ad89 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -195,8 +195,10 @@ static void run_test(int cgroup_fd) pthread_mutex_lock(&server_started_mtx); if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) + (void *)&server_fd))) { + pthread_mutex_unlock(&server_started_mtx); goto close_server_fd; + } pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c new file mode 100644 index 000000000000..6b53b3cb8dad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <netinet/tcp.h> +#include "sockopt_qos_to_cc.skel.h" + +static void run_setsockopt_test(int cg_fd, int sock_fd) +{ + socklen_t optlen; + char cc[16]; /* TCP_CA_NAME_MAX */ + int buf; + int err = -1; + + buf = 0x2D; + err = setsockopt(sock_fd, SOL_IPV6, IPV6_TCLASS, &buf, sizeof(buf)); + if (!ASSERT_OK(err, "setsockopt(sock_fd, IPV6_TCLASS)")) + return; + + /* Verify the setsockopt cc change */ + optlen = sizeof(cc); + err = getsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, cc, &optlen); + if (!ASSERT_OK(err, "getsockopt(sock_fd, TCP_CONGESTION)")) + return; + + if (!ASSERT_STREQ(cc, "reno", "getsockopt(sock_fd, TCP_CONGESTION)")) + return; +} + +void test_sockopt_qos_to_cc(void) +{ + struct sockopt_qos_to_cc *skel; + char cc_cubic[16] = "cubic"; /* TCP_CA_NAME_MAX */ + int cg_fd = -1; + int sock_fd = -1; + int err; + + cg_fd = test__join_cgroup("/sockopt_qos_to_cc"); + if (!ASSERT_GE(cg_fd, 0, "cg-join(sockopt_qos_to_cc)")) + return; + + skel = sockopt_qos_to_cc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto done; + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(sock_fd, 0, "v6 socket open")) + goto done; + + err = setsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, &cc_cubic, + sizeof(cc_cubic)); + if (!ASSERT_OK(err, "setsockopt(sock_fd, TCP_CONGESTION)")) + goto done; + + skel->links.sockopt_qos_to_cc = + bpf_program__attach_cgroup(skel->progs.sockopt_qos_to_cc, + cg_fd); + if (!ASSERT_OK_PTR(skel->links.sockopt_qos_to_cc, + "prog_attach(sockopt_qos_to_cc)")) + goto done; + + run_setsockopt_test(cg_fd, sock_fd); + +done: + if (sock_fd != -1) + close(sock_fd); + if (cg_fd != -1) + close(cg_fd); + /* destroy can take null and error pointer */ + sockopt_qos_to_cc__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 11a769e18f5d..0a91d8d9954b 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -62,8 +62,7 @@ retry: skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); - if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event", - "err %ld\n", PTR_ERR(skel->links.oncpu))) { + if (!ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event")) { close(pmu_fd); goto cleanup; } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index 37269d23df93..04b476bd62b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -21,7 +21,7 @@ void test_stacktrace_map(void) goto close_prog; link = bpf_program__attach_tracepoint(prog, "sched", "sched_switch"); - if (CHECK(IS_ERR(link), "attach_tp", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_tp")) goto close_prog; /* find map fds */ diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index 404a5498e1a3..4fd30bb651ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -21,7 +21,7 @@ void test_stacktrace_map_raw_tp(void) goto close_prog; link = bpf_program__attach_raw_tracepoint(prog, "sched_switch"); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto close_prog; /* find map fds */ @@ -59,7 +59,6 @@ void test_stacktrace_map_raw_tp(void) goto close_prog; close_prog: - if (!IS_ERR_OR_NULL(link)) - bpf_link__destroy(link); + bpf_link__destroy(link); bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/static_linked.c b/tools/testing/selftests/bpf/prog_tests/static_linked.c index 46556976dccc..5c4e3014e063 100644 --- a/tools/testing/selftests/bpf/prog_tests/static_linked.c +++ b/tools/testing/selftests/bpf/prog_tests/static_linked.c @@ -14,12 +14,7 @@ void test_static_linked(void) return; skel->rodata->rovar1 = 1; - skel->bss->static_var1 = 2; - skel->bss->static_var11 = 3; - skel->rodata->rovar2 = 4; - skel->bss->static_var2 = 5; - skel->bss->static_var22 = 6; err = test_static_linked__load(skel); if (!ASSERT_OK(err, "skel_load")) @@ -32,8 +27,8 @@ void test_static_linked(void) /* trigger */ usleep(1); - ASSERT_EQ(skel->bss->var1, 1 * 2 + 2 + 3, "var1"); - ASSERT_EQ(skel->bss->var2, 4 * 3 + 5 + 6, "var2"); + ASSERT_EQ(skel->data->var1, 1 * 2 + 2 + 3, "var1"); + ASSERT_EQ(skel->data->var2, 4 * 3 + 5 + 6, "var2"); cleanup: test_static_linked__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/syscall.c b/tools/testing/selftests/bpf/prog_tests/syscall.c new file mode 100644 index 000000000000..81e997a69f7a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/syscall.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "syscall.skel.h" + +struct args { + __u64 log_buf; + __u32 log_size; + int max_entries; + int map_fd; + int prog_fd; + int btf_fd; +}; + +void test_syscall(void) +{ + static char verifier_log[8192]; + struct args ctx = { + .max_entries = 1024, + .log_buf = (uintptr_t) verifier_log, + .log_size = sizeof(verifier_log), + }; + struct bpf_prog_test_run_attr tattr = { + .ctx_in = &ctx, + .ctx_size_in = sizeof(ctx), + }; + struct syscall *skel = NULL; + __u64 key = 12, value = 0; + int err; + + skel = syscall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + tattr.prog_fd = bpf_program__fd(skel->progs.bpf_prog); + err = bpf_prog_test_run_xattr(&tattr); + ASSERT_EQ(err, 0, "err"); + ASSERT_EQ(tattr.retval, 1, "retval"); + ASSERT_GT(ctx.map_fd, 0, "ctx.map_fd"); + ASSERT_GT(ctx.prog_fd, 0, "ctx.prog_fd"); + ASSERT_OK(memcmp(verifier_log, "processed", sizeof("processed") - 1), + "verifier_log"); + + err = bpf_map_lookup_elem(ctx.map_fd, &key, &value); + ASSERT_EQ(err, 0, "map_lookup"); + ASSERT_EQ(value, 34, "map lookup value"); +cleanup: + syscall__destroy(skel); + if (ctx.prog_fd > 0) + close(ctx.prog_fd); + if (ctx.map_fd > 0) + close(ctx.map_fd); + if (ctx.btf_fd > 0) + close(ctx.btf_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index ee27d68d2a1c..b5940e6ca67c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -715,6 +715,8 @@ out: bpf_object__close(obj); } +#include "tailcall_bpf2bpf4.skel.h" + /* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved * across tailcalls combined with bpf2bpf calls. for making sure that tailcall * counter behaves correctly, bpf program will go through following flow: @@ -727,10 +729,15 @@ out: * the loop begins. At the end of the test make sure that the global counter is * equal to 31, because tailcall counter includes the first two tailcalls * whereas global counter is incremented only on loop presented on flow above. + * + * The noise parameter is used to insert bpf_map_update calls into the logic + * to force verifier to patch instructions. This allows us to ensure jump + * logic remains correct with instruction movement. */ -static void test_tailcall_bpf2bpf_4(void) +static void test_tailcall_bpf2bpf_4(bool noise) { - int err, map_fd, prog_fd, main_fd, data_fd, i, val; + int err, map_fd, prog_fd, main_fd, data_fd, i; + struct tailcall_bpf2bpf4__bss val; struct bpf_map *prog_array, *data_map; struct bpf_program *prog; struct bpf_object *obj; @@ -774,11 +781,6 @@ static void test_tailcall_bpf2bpf_4(void) goto out; } - err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, - &duration, &retval, NULL); - CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", - err, errno, retval); - data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) return; @@ -788,9 +790,21 @@ static void test_tailcall_bpf2bpf_4(void) return; i = 0; + val.noise = noise; + val.count = 0; + err = bpf_map_update_elem(data_fd, &i, &val, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 0; err = bpf_map_lookup_elem(data_fd, &i, &val); - CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n", - err, errno, val); + CHECK(err || val.count != 31, "tailcall count", "err %d errno %d count %d\n", + err, errno, val.count); out: bpf_object__close(obj); @@ -815,5 +829,7 @@ void test_tailcalls(void) if (test__start_subtest("tailcall_bpf2bpf_3")) test_tailcall_bpf2bpf_3(); if (test__start_subtest("tailcall_bpf2bpf_4")) - test_tailcall_bpf2bpf_4(); + test_tailcall_bpf2bpf_4(false); + if (test__start_subtest("tailcall_bpf2bpf_5")) + test_tailcall_bpf2bpf_4(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c new file mode 100644 index 000000000000..37c20b5ffa70 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <test_progs.h> +#include "test_task_pt_regs.skel.h" + +void test_task_pt_regs(void) +{ + struct test_task_pt_regs *skel; + struct bpf_link *uprobe_link; + size_t uprobe_offset; + ssize_t base_addr; + bool match; + + base_addr = get_base_addr(); + if (!ASSERT_GT(base_addr, 0, "get_base_addr")) + return; + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + + skel = test_task_pt_regs__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK_PTR(skel->bss, "check_bss")) + goto cleanup; + + uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, + false /* retprobe */, + 0 /* self pid */, + "/proc/self/exe", + uprobe_offset); + if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe")) + goto cleanup; + skel->links.handle_uprobe = uprobe_link; + + /* trigger & validate uprobe */ + get_base_addr(); + + if (!ASSERT_EQ(skel->bss->uprobe_res, 1, "check_uprobe_res")) + goto cleanup; + + match = !memcmp(&skel->bss->current_regs, &skel->bss->ctx_regs, + sizeof(skel->bss->current_regs)); + ASSERT_TRUE(match, "check_regs_match"); + +cleanup: + test_task_pt_regs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tc_bpf.c b/tools/testing/selftests/bpf/prog_tests/tc_bpf.c new file mode 100644 index 000000000000..4a505a5adf4d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_bpf.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include <linux/pkt_cls.h> + +#include "test_tc_bpf.skel.h" + +#define LO_IFINDEX 1 + +#define TEST_DECLARE_OPTS(__fd) \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_h, .handle = 1); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_p, .priority = 1); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_f, .prog_fd = __fd); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hp, .handle = 1, .priority = 1); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hf, .handle = 1, .prog_fd = __fd); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_pf, .priority = 1, .prog_fd = __fd); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hpf, .handle = 1, .priority = 1, .prog_fd = __fd); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hpi, .handle = 1, .priority = 1, .prog_id = 42); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hpr, .handle = 1, .priority = 1, \ + .flags = BPF_TC_F_REPLACE); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_hpfi, .handle = 1, .priority = 1, .prog_fd = __fd, \ + .prog_id = 42); \ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts_prio_max, .handle = 1, .priority = UINT16_MAX + 1); + +static int test_tc_bpf_basic(const struct bpf_tc_hook *hook, int fd) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = 1, .priority = 1, .prog_fd = fd); + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int ret; + + ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd")) + return ret; + + ret = bpf_tc_attach(hook, &opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + return ret; + + if (!ASSERT_EQ(opts.handle, 1, "handle set") || + !ASSERT_EQ(opts.priority, 1, "priority set") || + !ASSERT_EQ(opts.prog_id, info.id, "prog_id set")) + goto end; + + opts.prog_id = 0; + opts.flags = BPF_TC_F_REPLACE; + ret = bpf_tc_attach(hook, &opts); + if (!ASSERT_OK(ret, "bpf_tc_attach replace mode")) + goto end; + + opts.flags = opts.prog_fd = opts.prog_id = 0; + ret = bpf_tc_query(hook, &opts); + if (!ASSERT_OK(ret, "bpf_tc_query")) + goto end; + + if (!ASSERT_EQ(opts.handle, 1, "handle set") || + !ASSERT_EQ(opts.priority, 1, "priority set") || + !ASSERT_EQ(opts.prog_id, info.id, "prog_id set")) + goto end; + +end: + opts.flags = opts.prog_fd = opts.prog_id = 0; + ret = bpf_tc_detach(hook, &opts); + ASSERT_OK(ret, "bpf_tc_detach"); + return ret; +} + +static int test_tc_bpf_api(struct bpf_tc_hook *hook, int fd) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_opts, .handle = 1, .priority = 1, .prog_fd = fd); + DECLARE_LIBBPF_OPTS(bpf_tc_hook, inv_hook, .attach_point = BPF_TC_INGRESS); + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = 1, .priority = 1); + int ret; + + ret = bpf_tc_hook_create(NULL); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_create invalid hook = NULL")) + return -EINVAL; + + /* hook ifindex = 0 */ + ret = bpf_tc_hook_create(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_create invalid hook ifindex == 0")) + return -EINVAL; + + ret = bpf_tc_hook_destroy(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_destroy invalid hook ifindex == 0")) + return -EINVAL; + + ret = bpf_tc_attach(&inv_hook, &attach_opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook ifindex == 0")) + return -EINVAL; + attach_opts.prog_id = 0; + + ret = bpf_tc_detach(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook ifindex == 0")) + return -EINVAL; + + ret = bpf_tc_query(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook ifindex == 0")) + return -EINVAL; + + /* hook ifindex < 0 */ + inv_hook.ifindex = -1; + + ret = bpf_tc_hook_create(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_create invalid hook ifindex < 0")) + return -EINVAL; + + ret = bpf_tc_hook_destroy(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_destroy invalid hook ifindex < 0")) + return -EINVAL; + + ret = bpf_tc_attach(&inv_hook, &attach_opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook ifindex < 0")) + return -EINVAL; + attach_opts.prog_id = 0; + + ret = bpf_tc_detach(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook ifindex < 0")) + return -EINVAL; + + ret = bpf_tc_query(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook ifindex < 0")) + return -EINVAL; + + inv_hook.ifindex = LO_IFINDEX; + + /* hook.attach_point invalid */ + inv_hook.attach_point = 0xabcd; + ret = bpf_tc_hook_create(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_create invalid hook.attach_point")) + return -EINVAL; + + ret = bpf_tc_hook_destroy(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_destroy invalid hook.attach_point")) + return -EINVAL; + + ret = bpf_tc_attach(&inv_hook, &attach_opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook.attach_point")) + return -EINVAL; + + ret = bpf_tc_detach(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook.attach_point")) + return -EINVAL; + + ret = bpf_tc_query(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook.attach_point")) + return -EINVAL; + + inv_hook.attach_point = BPF_TC_INGRESS; + + /* hook.attach_point valid, but parent invalid */ + inv_hook.parent = TC_H_MAKE(1UL << 16, 10); + ret = bpf_tc_hook_create(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_create invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_hook_destroy(&inv_hook); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_hook_destroy invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_attach(&inv_hook, &attach_opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_detach(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_query(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook parent")) + return -EINVAL; + + inv_hook.attach_point = BPF_TC_CUSTOM; + inv_hook.parent = 0; + /* These return EOPNOTSUPP instead of EINVAL as parent is checked after + * attach_point of the hook. + */ + ret = bpf_tc_hook_create(&inv_hook); + if (!ASSERT_EQ(ret, -EOPNOTSUPP, "bpf_tc_hook_create invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_hook_destroy(&inv_hook); + if (!ASSERT_EQ(ret, -EOPNOTSUPP, "bpf_tc_hook_destroy invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_attach(&inv_hook, &attach_opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_detach(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook parent")) + return -EINVAL; + + ret = bpf_tc_query(&inv_hook, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook parent")) + return -EINVAL; + + inv_hook.attach_point = BPF_TC_INGRESS; + + /* detach */ + { + TEST_DECLARE_OPTS(fd); + + ret = bpf_tc_detach(NULL, &opts_hp); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid hook = NULL")) + return -EINVAL; + + ret = bpf_tc_detach(hook, NULL); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid opts = NULL")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_hpr); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid flags set")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_hpf); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid prog_fd set")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_hpi); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid prog_id set")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_p); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid handle unset")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_h); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid priority unset")) + return -EINVAL; + + ret = bpf_tc_detach(hook, &opts_prio_max); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_detach invalid priority > UINT16_MAX")) + return -EINVAL; + } + + /* query */ + { + TEST_DECLARE_OPTS(fd); + + ret = bpf_tc_query(NULL, &opts); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid hook = NULL")) + return -EINVAL; + + ret = bpf_tc_query(hook, NULL); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid opts = NULL")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_hpr); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid flags set")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_hpf); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid prog_fd set")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_hpi); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid prog_id set")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_p); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid handle unset")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_h); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid priority unset")) + return -EINVAL; + + ret = bpf_tc_query(hook, &opts_prio_max); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query invalid priority > UINT16_MAX")) + return -EINVAL; + + /* when chain is not present, kernel returns -EINVAL */ + ret = bpf_tc_query(hook, &opts_hp); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_query valid handle, priority set")) + return -EINVAL; + } + + /* attach */ + { + TEST_DECLARE_OPTS(fd); + + ret = bpf_tc_attach(NULL, &opts_hp); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid hook = NULL")) + return -EINVAL; + + ret = bpf_tc_attach(hook, NULL); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid opts = NULL")) + return -EINVAL; + + opts_hp.flags = 42; + ret = bpf_tc_attach(hook, &opts_hp); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid flags")) + return -EINVAL; + + ret = bpf_tc_attach(hook, NULL); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid prog_fd unset")) + return -EINVAL; + + ret = bpf_tc_attach(hook, &opts_hpi); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid prog_id set")) + return -EINVAL; + + ret = bpf_tc_attach(hook, &opts_pf); + if (!ASSERT_OK(ret, "bpf_tc_attach valid handle unset")) + return -EINVAL; + opts_pf.prog_fd = opts_pf.prog_id = 0; + ASSERT_OK(bpf_tc_detach(hook, &opts_pf), "bpf_tc_detach"); + + ret = bpf_tc_attach(hook, &opts_hf); + if (!ASSERT_OK(ret, "bpf_tc_attach valid priority unset")) + return -EINVAL; + opts_hf.prog_fd = opts_hf.prog_id = 0; + ASSERT_OK(bpf_tc_detach(hook, &opts_hf), "bpf_tc_detach"); + + ret = bpf_tc_attach(hook, &opts_prio_max); + if (!ASSERT_EQ(ret, -EINVAL, "bpf_tc_attach invalid priority > UINT16_MAX")) + return -EINVAL; + + ret = bpf_tc_attach(hook, &opts_f); + if (!ASSERT_OK(ret, "bpf_tc_attach valid both handle and priority unset")) + return -EINVAL; + opts_f.prog_fd = opts_f.prog_id = 0; + ASSERT_OK(bpf_tc_detach(hook, &opts_f), "bpf_tc_detach"); + } + + return 0; +} + +void test_tc_bpf(void) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX, + .attach_point = BPF_TC_INGRESS); + struct test_tc_bpf *skel = NULL; + bool hook_created = false; + int cls_fd, ret; + + skel = test_tc_bpf__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tc_bpf__open_and_load")) + return; + + cls_fd = bpf_program__fd(skel->progs.cls); + + ret = bpf_tc_hook_create(&hook); + if (ret == 0) + hook_created = true; + + ret = ret == -EEXIST ? 0 : ret; + if (!ASSERT_OK(ret, "bpf_tc_hook_create(BPF_TC_INGRESS)")) + goto end; + + hook.attach_point = BPF_TC_CUSTOM; + hook.parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + ret = bpf_tc_hook_create(&hook); + if (!ASSERT_EQ(ret, -EOPNOTSUPP, "bpf_tc_hook_create invalid hook.attach_point")) + goto end; + + ret = test_tc_bpf_basic(&hook, cls_fd); + if (!ASSERT_OK(ret, "test_tc_internal ingress")) + goto end; + + ret = bpf_tc_hook_destroy(&hook); + if (!ASSERT_EQ(ret, -EOPNOTSUPP, "bpf_tc_hook_destroy invalid hook.attach_point")) + goto end; + + hook.attach_point = BPF_TC_INGRESS; + hook.parent = 0; + bpf_tc_hook_destroy(&hook); + + ret = test_tc_bpf_basic(&hook, cls_fd); + if (!ASSERT_OK(ret, "test_tc_internal ingress")) + goto end; + + bpf_tc_hook_destroy(&hook); + + hook.attach_point = BPF_TC_EGRESS; + ret = test_tc_bpf_basic(&hook, cls_fd); + if (!ASSERT_OK(ret, "test_tc_internal egress")) + goto end; + + bpf_tc_hook_destroy(&hook); + + ret = test_tc_bpf_api(&hook, cls_fd); + if (!ASSERT_OK(ret, "test_tc_bpf_api")) + goto end; + + bpf_tc_hook_destroy(&hook); + +end: + if (hook_created) { + hook.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS; + bpf_tc_hook_destroy(&hook); + } + test_tc_bpf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 5703c918812b..e7201ba29ccd 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -13,15 +13,16 @@ #define _GNU_SOURCE #include <arpa/inet.h> +#include <linux/if.h> +#include <linux/if_tun.h> #include <linux/limits.h> #include <linux/sysctl.h> -#include <linux/if_tun.h> -#include <linux/if.h> #include <sched.h> #include <stdbool.h> #include <stdio.h> -#include <sys/stat.h> #include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> #include "test_progs.h" #include "network_helpers.h" @@ -391,9 +392,7 @@ done: static int test_ping(int family, const char *addr) { - const char *ping = family == AF_INET6 ? "ping6" : "ping"; - - SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping, addr); + SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr); return 0; fail: return -1; diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 08d19cafd5e8..1fa772079967 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -353,8 +353,7 @@ static void fastopen_estab(void) return; link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd); - if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)")) return; if (sk_fds_connect(&sk_fds, true)) { @@ -398,8 +397,7 @@ static void syncookie_estab(void) return; link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd); - if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)")) return; if (sk_fds_connect(&sk_fds, false)) { @@ -431,8 +429,7 @@ static void fin(void) return; link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd); - if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)")) return; if (sk_fds_connect(&sk_fds, false)) { @@ -471,8 +468,7 @@ static void __simple_estab(bool exprm) return; link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd); - if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_cgroup(estab)")) return; if (sk_fds_connect(&sk_fds, false)) { @@ -509,8 +505,7 @@ static void misc(void) return; link = bpf_program__attach_cgroup(misc_skel->progs.misc_estab, cg_fd); - if (CHECK(IS_ERR(link), "attach_cgroup(misc_estab)", "err: %ld\n", - PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_cgroup(misc_estab)")) return; if (sk_fds_connect(&sk_fds, false)) { diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index 9966685866fd..123c68c1917d 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -73,7 +73,7 @@ void test_test_overhead(void) return; obj = bpf_object__open_file("./test_overhead.o", NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open_file")) return; kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name); @@ -108,7 +108,7 @@ void test_test_overhead(void) /* attach kprobe */ link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */, kprobe_func); - if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_kprobe")) goto cleanup; test_run("kprobe"); bpf_link__destroy(link); @@ -116,28 +116,28 @@ void test_test_overhead(void) /* attach kretprobe */ link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */, kprobe_func); - if (CHECK(IS_ERR(link), "attach kretprobe", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_kretprobe")) goto cleanup; test_run("kretprobe"); bpf_link__destroy(link); /* attach raw_tp */ link = bpf_program__attach_raw_tracepoint(raw_tp_prog, "task_rename"); - if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto cleanup; test_run("raw_tp"); bpf_link__destroy(link); /* attach fentry */ link = bpf_program__attach_trace(fentry_prog); - if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_fentry")) goto cleanup; test_run("fentry"); bpf_link__destroy(link); /* attach fexit */ link = bpf_program__attach_trace(fexit_prog); - if (CHECK(IS_ERR(link), "attach fexit", "err %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "attach_fexit")) goto cleanup; test_run("fexit"); bpf_link__destroy(link); diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c new file mode 100644 index 000000000000..25f40e1b9967 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "timer.skel.h" + +static int timer(struct timer *timer_skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + + err = timer__attach(timer_skel); + if (!ASSERT_OK(err, "timer_attach")) + return err; + + ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1"); + ASSERT_EQ(timer_skel->data->callback2_check, 52, "callback2_check1"); + + prog_fd = bpf_program__fd(timer_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); + timer__detach(timer_skel); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + /* check that timer_cb1() was executed 10+10 times */ + ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2"); + ASSERT_EQ(timer_skel->data->callback2_check, 42, "callback2_check2"); + + /* check that timer_cb2() was executed twice */ + ASSERT_EQ(timer_skel->bss->bss_data, 10, "bss_data"); + + /* check that there were no errors in timer execution */ + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + + return 0; +} + +void test_timer(void) +{ + struct timer *timer_skel = NULL; + int err; + + timer_skel = timer__open_and_load(); + if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) + goto cleanup; + + err = timer(timer_skel); + ASSERT_OK(err, "timer"); +cleanup: + timer__destroy(timer_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c new file mode 100644 index 000000000000..ced8f6cf347c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "timer_mim.skel.h" +#include "timer_mim_reject.skel.h" + +static int timer_mim(struct timer_mim *timer_skel) +{ + __u32 duration = 0, retval; + __u64 cnt1, cnt2; + int err, prog_fd, key1 = 1; + + err = timer_mim__attach(timer_skel); + if (!ASSERT_OK(err, "timer_attach")) + return err; + + prog_fd = bpf_program__fd(timer_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); + timer_mim__detach(timer_skel); + + /* check that timer_cb[12] are incrementing 'cnt' */ + cnt1 = READ_ONCE(timer_skel->bss->cnt); + for (int i = 0; i < 100; i++) { + cnt2 = READ_ONCE(timer_skel->bss->cnt); + if (cnt2 != cnt1) + break; + usleep(200); /* 100 times more than interval */ + } + ASSERT_GT(cnt2, cnt1, "cnt"); + + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2, "ok"); + + close(bpf_map__fd(timer_skel->maps.inner_htab)); + err = bpf_map_delete_elem(bpf_map__fd(timer_skel->maps.outer_arr), &key1); + ASSERT_EQ(err, 0, "delete inner map"); + + /* check that timer_cb[12] are no longer running */ + cnt1 = READ_ONCE(timer_skel->bss->cnt); + for (int i = 0; i < 100; i++) { + usleep(200); /* 100 times more than interval */ + cnt2 = READ_ONCE(timer_skel->bss->cnt); + if (cnt2 == cnt1) + break; + } + ASSERT_EQ(cnt2, cnt1, "cnt"); + + return 0; +} + +void test_timer_mim(void) +{ + struct timer_mim_reject *timer_reject_skel = NULL; + libbpf_print_fn_t old_print_fn = NULL; + struct timer_mim *timer_skel = NULL; + int err; + + old_print_fn = libbpf_set_print(NULL); + timer_reject_skel = timer_mim_reject__open_and_load(); + libbpf_set_print(old_print_fn); + if (!ASSERT_ERR_PTR(timer_reject_skel, "timer_reject_skel_load")) + goto cleanup; + + timer_skel = timer_mim__open_and_load(); + if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) + goto cleanup; + + err = timer_mim(timer_skel); + ASSERT_OK(err, "timer_mim"); +cleanup: + timer_mim__destroy(timer_skel); + timer_mim_reject__destroy(timer_reject_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 39b0decb1bb2..d39bc00feb45 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -3,7 +3,7 @@ #include <test_progs.h> -#include "trace_printk.skel.h" +#include "trace_printk.lskel.h" #define TRACEBUF "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" @@ -21,6 +21,9 @@ void test_trace_printk(void) if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; + ASSERT_EQ(skel->rodata->fmt[0], 'T', "invalid printk fmt string"); + skel->rodata->fmt[0] = 't'; + err = trace_printk__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index f3022d934e2d..d7f5a931d7f3 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -55,7 +55,7 @@ void test_trampoline_count(void) /* attach 'allowed' trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + if (!ASSERT_OK_PTR(obj, "obj_open_file")) { obj = NULL; goto cleanup; } @@ -68,14 +68,14 @@ void test_trampoline_count(void) if (rand() % 2) { link = load(inst[i].obj, fentry_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + if (!ASSERT_OK_PTR(link, "attach_prog")) { link = NULL; goto cleanup; } inst[i].link_fentry = link; } else { link = load(inst[i].obj, fexit_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + if (!ASSERT_OK_PTR(link, "attach_prog")) { link = NULL; goto cleanup; } @@ -85,7 +85,7 @@ void test_trampoline_count(void) /* and try 1 extra.. */ obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + if (!ASSERT_OK_PTR(obj, "obj_open_file")) { obj = NULL; goto cleanup; } @@ -96,13 +96,15 @@ void test_trampoline_count(void) /* ..that needs to fail */ link = load(obj, fentry_name); - if (CHECK(!IS_ERR(link), "cannot attach over the limit", "err %ld\n", PTR_ERR(link))) { + err = libbpf_get_error(link); + if (!ASSERT_ERR_PTR(link, "cannot attach over the limit")) { bpf_link__destroy(link); goto cleanup_extra; } /* with E2BIG error */ - CHECK(PTR_ERR(link) != -E2BIG, "proper error check", "err %ld\n", PTR_ERR(link)); + ASSERT_EQ(err, -E2BIG, "proper error check"); + ASSERT_EQ(link, NULL, "ptr_is_null"); /* and finaly execute the probe */ if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L))) diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c index 2aba09d4d01b..56c9d6bd38a3 100644 --- a/tools/testing/selftests/bpf/prog_tests/udp_limit.c +++ b/tools/testing/selftests/bpf/prog_tests/udp_limit.c @@ -22,11 +22,10 @@ void test_udp_limit(void) goto close_cgroup_fd; skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.sock, "cg_attach_sock")) + goto close_skeleton; skel->links.sock_release = bpf_program__attach_cgroup(skel->progs.sock_release, cgroup_fd); - if (CHECK(IS_ERR(skel->links.sock) || IS_ERR(skel->links.sock_release), - "cg-attach", "sock %ld sock_release %ld", - PTR_ERR(skel->links.sock), - PTR_ERR(skel->links.sock_release))) + if (!ASSERT_OK_PTR(skel->links.sock_release, "cg_attach_sock_release")) goto close_skeleton; /* BPF program enforces a single UDP socket per cgroup, diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c new file mode 100644 index 000000000000..ad3ba81b4048 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -0,0 +1,574 @@ +// SPDX-License-Identifier: GPL-2.0 + +/** + * Test XDP bonding support + * + * Sets up two bonded veth pairs between two fresh namespaces + * and verifies that XDP_TX program loaded on a bond device + * are correctly loaded onto the slave devices and XDP_TX'd + * packets are balanced using bonding. + */ + +#define _GNU_SOURCE +#include <sched.h> +#include <net/if.h> +#include <linux/if_link.h> +#include "test_progs.h" +#include "network_helpers.h" +#include <linux/if_bonding.h> +#include <linux/limits.h> +#include <linux/udp.h> + +#include "xdp_dummy.skel.h" +#include "xdp_redirect_multi_kern.skel.h" +#include "xdp_tx.skel.h" + +#define BOND1_MAC {0x00, 0x11, 0x22, 0x33, 0x44, 0x55} +#define BOND1_MAC_STR "00:11:22:33:44:55" +#define BOND2_MAC {0x00, 0x22, 0x33, 0x44, 0x55, 0x66} +#define BOND2_MAC_STR "00:22:33:44:55:66" +#define NPACKETS 100 + +static int root_netns_fd = -1; + +static void restore_root_netns(void) +{ + ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "restore_root_netns"); +} + +static int setns_by_name(char *name) +{ + int nsfd, err; + char nspath[PATH_MAX]; + + snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); + nsfd = open(nspath, O_RDONLY | O_CLOEXEC); + if (nsfd < 0) + return -1; + + err = setns(nsfd, CLONE_NEWNET); + close(nsfd); + return err; +} + +static int get_rx_packets(const char *iface) +{ + FILE *f; + char line[512]; + int iface_len = strlen(iface); + + f = fopen("/proc/net/dev", "r"); + if (!f) + return -1; + + while (fgets(line, sizeof(line), f)) { + char *p = line; + + while (*p == ' ') + p++; /* skip whitespace */ + if (!strncmp(p, iface, iface_len)) { + p += iface_len; + if (*p++ != ':') + continue; + while (*p == ' ') + p++; /* skip whitespace */ + while (*p && *p != ' ') + p++; /* skip rx bytes */ + while (*p == ' ') + p++; /* skip whitespace */ + fclose(f); + return atoi(p); + } + } + fclose(f); + return -1; +} + +#define MAX_BPF_LINKS 8 + +struct skeletons { + struct xdp_dummy *xdp_dummy; + struct xdp_tx *xdp_tx; + struct xdp_redirect_multi_kern *xdp_redirect_multi_kern; + + int nlinks; + struct bpf_link *links[MAX_BPF_LINKS]; +}; + +static int xdp_attach(struct skeletons *skeletons, struct bpf_program *prog, char *iface) +{ + struct bpf_link *link; + int ifindex; + + ifindex = if_nametoindex(iface); + if (!ASSERT_GT(ifindex, 0, "get ifindex")) + return -1; + + if (!ASSERT_LE(skeletons->nlinks+1, MAX_BPF_LINKS, "too many XDP programs attached")) + return -1; + + link = bpf_program__attach_xdp(prog, ifindex); + if (!ASSERT_OK_PTR(link, "attach xdp program")) + return -1; + + skeletons->links[skeletons->nlinks++] = link; + return 0; +} + +enum { + BOND_ONE_NO_ATTACH = 0, + BOND_BOTH_AND_ATTACH, +}; + +static const char * const mode_names[] = { + [BOND_MODE_ROUNDROBIN] = "balance-rr", + [BOND_MODE_ACTIVEBACKUP] = "active-backup", + [BOND_MODE_XOR] = "balance-xor", + [BOND_MODE_BROADCAST] = "broadcast", + [BOND_MODE_8023AD] = "802.3ad", + [BOND_MODE_TLB] = "balance-tlb", + [BOND_MODE_ALB] = "balance-alb", +}; + +static const char * const xmit_policy_names[] = { + [BOND_XMIT_POLICY_LAYER2] = "layer2", + [BOND_XMIT_POLICY_LAYER34] = "layer3+4", + [BOND_XMIT_POLICY_LAYER23] = "layer2+3", + [BOND_XMIT_POLICY_ENCAP23] = "encap2+3", + [BOND_XMIT_POLICY_ENCAP34] = "encap3+4", +}; + +static int bonding_setup(struct skeletons *skeletons, int mode, int xmit_policy, + int bond_both_attach) +{ +#define SYS(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + return -1; \ + }) + + SYS("ip netns add ns_dst"); + SYS("ip link add veth1_1 type veth peer name veth2_1 netns ns_dst"); + SYS("ip link add veth1_2 type veth peer name veth2_2 netns ns_dst"); + + SYS("ip link add bond1 type bond mode %s xmit_hash_policy %s", + mode_names[mode], xmit_policy_names[xmit_policy]); + SYS("ip link set bond1 up address " BOND1_MAC_STR " addrgenmode none"); + SYS("ip -netns ns_dst link add bond2 type bond mode %s xmit_hash_policy %s", + mode_names[mode], xmit_policy_names[xmit_policy]); + SYS("ip -netns ns_dst link set bond2 up address " BOND2_MAC_STR " addrgenmode none"); + + SYS("ip link set veth1_1 master bond1"); + if (bond_both_attach == BOND_BOTH_AND_ATTACH) { + SYS("ip link set veth1_2 master bond1"); + } else { + SYS("ip link set veth1_2 up addrgenmode none"); + + if (xdp_attach(skeletons, skeletons->xdp_dummy->progs.xdp_dummy_prog, "veth1_2")) + return -1; + } + + SYS("ip -netns ns_dst link set veth2_1 master bond2"); + + if (bond_both_attach == BOND_BOTH_AND_ATTACH) + SYS("ip -netns ns_dst link set veth2_2 master bond2"); + else + SYS("ip -netns ns_dst link set veth2_2 up addrgenmode none"); + + /* Load a dummy program on sending side as with veth peer needs to have a + * XDP program loaded as well. + */ + if (xdp_attach(skeletons, skeletons->xdp_dummy->progs.xdp_dummy_prog, "bond1")) + return -1; + + if (bond_both_attach == BOND_BOTH_AND_ATTACH) { + if (!ASSERT_OK(setns_by_name("ns_dst"), "set netns to ns_dst")) + return -1; + + if (xdp_attach(skeletons, skeletons->xdp_tx->progs.xdp_tx, "bond2")) + return -1; + + restore_root_netns(); + } + + return 0; + +#undef SYS +} + +static void bonding_cleanup(struct skeletons *skeletons) +{ + restore_root_netns(); + while (skeletons->nlinks) { + skeletons->nlinks--; + bpf_link__destroy(skeletons->links[skeletons->nlinks]); + } + ASSERT_OK(system("ip link delete bond1"), "delete bond1"); + ASSERT_OK(system("ip link delete veth1_1"), "delete veth1_1"); + ASSERT_OK(system("ip link delete veth1_2"), "delete veth1_2"); + ASSERT_OK(system("ip netns delete ns_dst"), "delete ns_dst"); +} + +static int send_udp_packets(int vary_dst_ip) +{ + struct ethhdr eh = { + .h_source = BOND1_MAC, + .h_dest = BOND2_MAC, + .h_proto = htons(ETH_P_IP), + }; + uint8_t buf[128] = {}; + struct iphdr *iph = (struct iphdr *)(buf + sizeof(eh)); + struct udphdr *uh = (struct udphdr *)(buf + sizeof(eh) + sizeof(*iph)); + int i, s = -1; + int ifindex; + + s = socket(AF_PACKET, SOCK_RAW, IPPROTO_RAW); + if (!ASSERT_GE(s, 0, "socket")) + goto err; + + ifindex = if_nametoindex("bond1"); + if (!ASSERT_GT(ifindex, 0, "get bond1 ifindex")) + goto err; + + memcpy(buf, &eh, sizeof(eh)); + iph->ihl = 5; + iph->version = 4; + iph->tos = 16; + iph->id = 1; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->saddr = 1; + iph->daddr = 2; + iph->tot_len = htons(sizeof(buf) - ETH_HLEN); + iph->check = 0; + + for (i = 1; i <= NPACKETS; i++) { + int n; + struct sockaddr_ll saddr_ll = { + .sll_ifindex = ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = BOND2_MAC, + }; + + /* vary the UDP destination port for even distribution with roundrobin/xor modes */ + uh->dest++; + + if (vary_dst_ip) + iph->daddr++; + + n = sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&saddr_ll, sizeof(saddr_ll)); + if (!ASSERT_EQ(n, sizeof(buf), "sendto")) + goto err; + } + + return 0; + +err: + if (s >= 0) + close(s); + return -1; +} + +static void test_xdp_bonding_with_mode(struct skeletons *skeletons, int mode, int xmit_policy) +{ + int bond1_rx; + + if (bonding_setup(skeletons, mode, xmit_policy, BOND_BOTH_AND_ATTACH)) + goto out; + + if (send_udp_packets(xmit_policy != BOND_XMIT_POLICY_LAYER34)) + goto out; + + bond1_rx = get_rx_packets("bond1"); + ASSERT_EQ(bond1_rx, NPACKETS, "expected more received packets"); + + switch (mode) { + case BOND_MODE_ROUNDROBIN: + case BOND_MODE_XOR: { + int veth1_rx = get_rx_packets("veth1_1"); + int veth2_rx = get_rx_packets("veth1_2"); + int diff = abs(veth1_rx - veth2_rx); + + ASSERT_GE(veth1_rx + veth2_rx, NPACKETS, "expected more packets"); + + switch (xmit_policy) { + case BOND_XMIT_POLICY_LAYER2: + ASSERT_GE(diff, NPACKETS, + "expected packets on only one of the interfaces"); + break; + case BOND_XMIT_POLICY_LAYER23: + case BOND_XMIT_POLICY_LAYER34: + ASSERT_LT(diff, NPACKETS/2, + "expected even distribution of packets"); + break; + default: + PRINT_FAIL("Unimplemented xmit_policy=%d\n", xmit_policy); + break; + } + break; + } + case BOND_MODE_ACTIVEBACKUP: { + int veth1_rx = get_rx_packets("veth1_1"); + int veth2_rx = get_rx_packets("veth1_2"); + int diff = abs(veth1_rx - veth2_rx); + + ASSERT_GE(diff, NPACKETS, + "expected packets on only one of the interfaces"); + break; + } + default: + PRINT_FAIL("Unimplemented xmit_policy=%d\n", xmit_policy); + break; + } + +out: + bonding_cleanup(skeletons); +} + +/* Test the broadcast redirection using xdp_redirect_map_multi_prog and adding + * all the interfaces to it and checking that broadcasting won't send the packet + * to neither the ingress bond device (bond2) or its slave (veth2_1). + */ +static void test_xdp_bonding_redirect_multi(struct skeletons *skeletons) +{ + static const char * const ifaces[] = {"bond2", "veth2_1", "veth2_2"}; + int veth1_1_rx, veth1_2_rx; + int err; + + if (bonding_setup(skeletons, BOND_MODE_ROUNDROBIN, BOND_XMIT_POLICY_LAYER23, + BOND_ONE_NO_ATTACH)) + goto out; + + + if (!ASSERT_OK(setns_by_name("ns_dst"), "could not set netns to ns_dst")) + goto out; + + /* populate the devmap with the relevant interfaces */ + for (int i = 0; i < ARRAY_SIZE(ifaces); i++) { + int ifindex = if_nametoindex(ifaces[i]); + int map_fd = bpf_map__fd(skeletons->xdp_redirect_multi_kern->maps.map_all); + + if (!ASSERT_GT(ifindex, 0, "could not get interface index")) + goto out; + + err = bpf_map_update_elem(map_fd, &ifindex, &ifindex, 0); + if (!ASSERT_OK(err, "add interface to map_all")) + goto out; + } + + if (xdp_attach(skeletons, + skeletons->xdp_redirect_multi_kern->progs.xdp_redirect_map_multi_prog, + "bond2")) + goto out; + + restore_root_netns(); + + if (send_udp_packets(BOND_MODE_ROUNDROBIN)) + goto out; + + veth1_1_rx = get_rx_packets("veth1_1"); + veth1_2_rx = get_rx_packets("veth1_2"); + + ASSERT_EQ(veth1_1_rx, 0, "expected no packets on veth1_1"); + ASSERT_GE(veth1_2_rx, NPACKETS, "expected packets on veth1_2"); + +out: + restore_root_netns(); + bonding_cleanup(skeletons); +} + +/* Test that XDP programs cannot be attached to both the bond master and slaves simultaneously */ +static void test_xdp_bonding_attach(struct skeletons *skeletons) +{ + struct bpf_link *link = NULL; + struct bpf_link *link2 = NULL; + int veth, bond, err; + + if (!ASSERT_OK(system("ip link add veth type veth"), "add veth")) + goto out; + if (!ASSERT_OK(system("ip link add bond type bond"), "add bond")) + goto out; + + veth = if_nametoindex("veth"); + if (!ASSERT_GE(veth, 0, "if_nametoindex veth")) + goto out; + bond = if_nametoindex("bond"); + if (!ASSERT_GE(bond, 0, "if_nametoindex bond")) + goto out; + + /* enslaving with a XDP program loaded is allowed */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_OK_PTR(link, "attach program to veth")) + goto out; + + err = system("ip link set veth master bond"); + if (!ASSERT_OK(err, "set veth master")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + /* attaching to slave when master has no program is allowed */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_OK_PTR(link, "attach program to slave when enslaved")) + goto out; + + /* attaching to master not allowed when slave has program loaded */ + link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_ERR_PTR(link2, "attach program to master when slave has program")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + /* attaching XDP program to master allowed when slave has no program */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_OK_PTR(link, "attach program to master")) + goto out; + + /* attaching to slave not allowed when master has program loaded */ + link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_ERR_PTR(link2, "attach program to slave when master has program")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + /* test program unwinding with a non-XDP slave */ + if (!ASSERT_OK(system("ip link add vxlan type vxlan id 1 remote 1.2.3.4 dstport 0 dev lo"), + "add vxlan")) + goto out; + + err = system("ip link set vxlan master bond"); + if (!ASSERT_OK(err, "set vxlan master")) + goto out; + + /* attaching not allowed when one slave does not support XDP */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_ERR_PTR(link, "attach program to master when slave does not support XDP")) + goto out; + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + + system("ip link del veth"); + system("ip link del bond"); + system("ip link del vxlan"); +} + +/* Test with nested bonding devices to catch issue with negative jump label count */ +static void test_xdp_bonding_nested(struct skeletons *skeletons) +{ + struct bpf_link *link = NULL; + int bond, err; + + if (!ASSERT_OK(system("ip link add bond type bond"), "add bond")) + goto out; + + bond = if_nametoindex("bond"); + if (!ASSERT_GE(bond, 0, "if_nametoindex bond")) + goto out; + + if (!ASSERT_OK(system("ip link add bond_nest1 type bond"), "add bond_nest1")) + goto out; + + err = system("ip link set bond_nest1 master bond"); + if (!ASSERT_OK(err, "set bond_nest1 master")) + goto out; + + if (!ASSERT_OK(system("ip link add bond_nest2 type bond"), "add bond_nest1")) + goto out; + + err = system("ip link set bond_nest2 master bond_nest1"); + if (!ASSERT_OK(err, "set bond_nest2 master")) + goto out; + + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + ASSERT_OK_PTR(link, "attach program to master"); + +out: + bpf_link__destroy(link); + system("ip link del bond"); + system("ip link del bond_nest1"); + system("ip link del bond_nest2"); +} + +static int libbpf_debug_print(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level != LIBBPF_WARN) + vprintf(format, args); + return 0; +} + +struct bond_test_case { + char *name; + int mode; + int xmit_policy; +}; + +static struct bond_test_case bond_test_cases[] = { + { "xdp_bonding_roundrobin", BOND_MODE_ROUNDROBIN, BOND_XMIT_POLICY_LAYER23, }, + { "xdp_bonding_activebackup", BOND_MODE_ACTIVEBACKUP, BOND_XMIT_POLICY_LAYER23 }, + + { "xdp_bonding_xor_layer2", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER2, }, + { "xdp_bonding_xor_layer23", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER23, }, + { "xdp_bonding_xor_layer34", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER34, }, +}; + +void test_xdp_bonding(void) +{ + libbpf_print_fn_t old_print_fn; + struct skeletons skeletons = {}; + int i; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + + root_netns_fd = open("/proc/self/ns/net", O_RDONLY); + if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net")) + goto out; + + skeletons.xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_dummy, "xdp_dummy__open_and_load")) + goto out; + + skeletons.xdp_tx = xdp_tx__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_tx, "xdp_tx__open_and_load")) + goto out; + + skeletons.xdp_redirect_multi_kern = xdp_redirect_multi_kern__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_redirect_multi_kern, + "xdp_redirect_multi_kern__open_and_load")) + goto out; + + if (test__start_subtest("xdp_bonding_attach")) + test_xdp_bonding_attach(&skeletons); + + if (test__start_subtest("xdp_bonding_nested")) + test_xdp_bonding_nested(&skeletons); + + for (i = 0; i < ARRAY_SIZE(bond_test_cases); i++) { + struct bond_test_case *test_case = &bond_test_cases[i]; + + if (test__start_subtest(test_case->name)) + test_xdp_bonding_with_mode( + &skeletons, + test_case->mode, + test_case->xmit_policy); + } + + if (test__start_subtest("xdp_bonding_redirect_multi")) + test_xdp_bonding_redirect_multi(&skeletons); + +out: + xdp_dummy__destroy(skeletons.xdp_dummy); + xdp_tx__destroy(skeletons.xdp_tx); + xdp_redirect_multi_kern__destroy(skeletons.xdp_redirect_multi_kern); + + libbpf_set_print(old_print_fn); + if (root_netns_fd >= 0) + close(root_netns_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 2c6c570b21f8..3bd5904b4db5 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -90,7 +90,7 @@ void test_xdp_bpf2bpf(void) pb_opts.ctx = &passed; pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 1, &pb_opts); - if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto out; /* Run test program */ diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c new file mode 100644 index 000000000000..ab4952b9fb1d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> +#include "test_xdp_context_test_run.skel.h" + +void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, + __u32 data_meta, __u32 data, __u32 data_end, + __u32 ingress_ifindex, __u32 rx_queue_index, + __u32 egress_ifindex) +{ + struct xdp_md ctx = { + .data = data, + .data_end = data_end, + .data_meta = data_meta, + .ingress_ifindex = ingress_ifindex, + .rx_queue_index = rx_queue_index, + .egress_ifindex = egress_ifindex, + }; + int err; + + opts.ctx_in = &ctx; + opts.ctx_size_in = sizeof(ctx); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(errno, EINVAL, "errno-EINVAL"); + ASSERT_ERR(err, "bpf_prog_test_run"); +} + +void test_xdp_context_test_run(void) +{ + struct test_xdp_context_test_run *skel = NULL; + char data[sizeof(pkt_v4) + sizeof(__u32)]; + char bad_ctx[sizeof(struct xdp_md) + 1]; + struct xdp_md ctx_in, ctx_out; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data, + .data_size_in = sizeof(data), + .ctx_out = &ctx_out, + .ctx_size_out = sizeof(ctx_out), + .repeat = 1, + ); + int err, prog_fd; + + skel = test_xdp_context_test_run__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + prog_fd = bpf_program__fd(skel->progs.xdp_context); + + /* Data past the end of the kernel's struct xdp_md must be 0 */ + bad_ctx[sizeof(bad_ctx) - 1] = 1; + opts.ctx_in = bad_ctx; + opts.ctx_size_in = sizeof(bad_ctx); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(errno, E2BIG, "extradata-errno"); + ASSERT_ERR(err, "bpf_prog_test_run(extradata)"); + + *(__u32 *)data = XDP_PASS; + *(struct ipv4_packet *)(data + sizeof(__u32)) = pkt_v4; + opts.ctx_in = &ctx_in; + opts.ctx_size_in = sizeof(ctx_in); + memset(&ctx_in, 0, sizeof(ctx_in)); + ctx_in.data_meta = 0; + ctx_in.data = sizeof(__u32); + ctx_in.data_end = ctx_in.data + sizeof(pkt_v4); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "bpf_prog_test_run(valid)"); + ASSERT_EQ(opts.retval, XDP_PASS, "valid-retval"); + ASSERT_EQ(opts.data_size_out, sizeof(pkt_v4), "valid-datasize"); + ASSERT_EQ(opts.ctx_size_out, opts.ctx_size_in, "valid-ctxsize"); + ASSERT_EQ(ctx_out.data_meta, 0, "valid-datameta"); + ASSERT_EQ(ctx_out.data, 0, "valid-data"); + ASSERT_EQ(ctx_out.data_end, sizeof(pkt_v4), "valid-dataend"); + + /* Meta data's size must be a multiple of 4 */ + test_xdp_context_error(prog_fd, opts, 0, 1, sizeof(data), 0, 0, 0); + + /* data_meta must reference the start of data */ + test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data), + 0, 0, 0); + + /* Meta data must be 32 bytes or smaller */ + test_xdp_context_error(prog_fd, opts, 0, 36, sizeof(data), 0, 0, 0); + + /* Total size of data must match data_end - data_meta */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), + sizeof(data) - 1, 0, 0, 0); + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), + sizeof(data) + 1, 0, 0, 0); + + /* RX queue cannot be specified without specifying an ingress */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 0, 1, 0); + + /* Interface 1 is always the loopback interface which always has only + * one RX queue (index 0). This makes index 1 an invalid rx queue index + * for interface 1. + */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 1, 1, 0); + + /* The egress cannot be specified */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 0, 0, 1); + + test_xdp_context_test_run__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index 0176573fe4e7..8755effd80b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -7,64 +7,53 @@ #define IFINDEX_LO 1 -void test_xdp_with_cpumap_helpers(void) +void test_xdp_cpumap_attach(void) { struct test_xdp_with_cpumap_helpers *skel; struct bpf_prog_info info = {}; + __u32 len = sizeof(info); struct bpf_cpumap_val val = { .qsize = 192, }; - __u32 duration = 0, idx = 0; - __u32 len = sizeof(info); int err, prog_fd, map_fd; + __u32 idx = 0; skel = test_xdp_with_cpumap_helpers__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_xdp_with_cpumap_helpers__open_and_load"); + if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load")) return; - } - /* can not attach program with cpumaps that allow programs - * as xdp generic - */ prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog); err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Generic attach of program with 8-byte CPUMAP", - "should have failed\n"); + if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP")) + goto out_close; + + err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); + ASSERT_OK(err, "XDP program detach"); prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm); map_fd = bpf_map__fd(skel->maps.cpu_map); err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd")) goto out_close; val.bpf_prog.fd = prog_fd; err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err, "Add program to cpumap entry", "err %d errno %d\n", - err, errno); + ASSERT_OK(err, "Add program to cpumap entry"); err = bpf_map_lookup_elem(map_fd, &idx, &val); - CHECK(err, "Read cpumap entry", "err %d errno %d\n", err, errno); - CHECK(info.id != val.bpf_prog.id, "Expected program id in cpumap entry", - "expected %u read %u\n", info.id, val.bpf_prog.id); + ASSERT_OK(err, "Read cpumap entry"); + ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id"); /* can not attach BPF_XDP_CPUMAP program to a device */ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Attach of BPF_XDP_CPUMAP program", - "should have failed\n"); + if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program")) + bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err == 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry", - "should have failed\n"); + ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); out_close: test_xdp_with_cpumap_helpers__destroy(skel); } - -void test_xdp_cpumap_attach(void) -{ - if (test__start_subtest("cpumap_with_progs")) - test_xdp_with_cpumap_helpers(); -} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 88ef3ec8ac4c..c72af030ff10 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -16,50 +16,45 @@ void test_xdp_with_devmap_helpers(void) .ifindex = IFINDEX_LO, }; __u32 len = sizeof(info); - __u32 duration = 0, idx = 0; int err, dm_fd, map_fd; + __u32 idx = 0; skel = test_xdp_with_devmap_helpers__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_xdp_with_devmap_helpers__open_and_load"); + if (!ASSERT_OK_PTR(skel, "test_xdp_with_devmap_helpers__open_and_load")) return; - } - /* can not attach program with DEVMAPs that allow programs - * as xdp generic - */ dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Generic attach of program with 8-byte devmap", - "should have failed\n"); + if (!ASSERT_OK(err, "Generic attach of program with 8-byte devmap")) + goto out_close; + + err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); + ASSERT_OK(err, "XDP program detach"); dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); map_fd = bpf_map__fd(skel->maps.dm_ports); err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd")) goto out_close; val.bpf_prog.fd = dm_fd; err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err, "Add program to devmap entry", - "err %d errno %d\n", err, errno); + ASSERT_OK(err, "Add program to devmap entry"); err = bpf_map_lookup_elem(map_fd, &idx, &val); - CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); - CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", - "expected %u read %u\n", info.id, val.bpf_prog.id); + ASSERT_OK(err, "Read devmap entry"); + ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to devmap entry prog_id"); /* can not attach BPF_XDP_DEVMAP program to a device */ err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", - "should have failed\n"); + if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_DEVMAP program")) + bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); val.ifindex = 1; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", - "should have failed\n"); + ASSERT_NEQ(err, 0, "Add non-BPF_XDP_DEVMAP program to devmap entry"); out_close: test_xdp_with_devmap_helpers__destroy(skel); @@ -68,12 +63,10 @@ out_close: void test_neg_xdp_devmap_helpers(void) { struct test_xdp_devmap_helpers *skel; - __u32 duration = 0; skel = test_xdp_devmap_helpers__open_and_load(); - if (CHECK(skel, - "Load of XDP program accessing egress ifindex without attach type", - "should have failed\n")) { + if (!ASSERT_EQ(skel, NULL, + "Load of XDP program accessing egress ifindex without attach type")) { test_xdp_devmap_helpers__destroy(skel); } } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c index 6f814999b395..46eed0a33c23 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c @@ -51,7 +51,7 @@ void test_xdp_link(void) /* BPF link is not allowed to replace prog attachment */ link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO); - if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) { + if (!ASSERT_ERR_PTR(link, "link_attach_should_fail")) { bpf_link__destroy(link); /* best-effort detach prog */ opts.old_fd = prog_fd1; @@ -67,7 +67,7 @@ void test_xdp_link(void) /* now BPF link should attach successfully */ link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO); - if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "link_attach")) goto cleanup; skel1->links.xdp_handler = link; @@ -95,7 +95,7 @@ void test_xdp_link(void) /* BPF link is not allowed to replace another BPF link */ link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO); - if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) { + if (!ASSERT_ERR_PTR(link, "link_attach_should_fail")) { bpf_link__destroy(link); goto cleanup; } @@ -105,7 +105,7 @@ void test_xdp_link(void) /* new link attach should succeed */ link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO); - if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + if (!ASSERT_OK_PTR(link, "link_attach")) goto cleanup; skel2->links.xdp_handler = link; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index fd42247da8b4..9573be6122be 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -17,6 +17,11 @@ char _license[] SEC("license") = "GPL"; +volatile const char fallback[TCP_CA_NAME_MAX]; +const char bpf_dctcp[] = "bpf_dctcp"; +const char tcp_cdg[] = "cdg"; +char cc_res[TCP_CA_NAME_MAX]; +int tcp_cdg_res = 0; int stg_result = 0; struct { @@ -57,6 +62,26 @@ void BPF_PROG(dctcp_init, struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); int *stg; + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { + /* Switch to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + /* Switch back to myself which the bpf trampoline + * stopped calling dctcp_init recursively. + */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)bpf_dctcp, sizeof(bpf_dctcp)); + /* Switch back to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + /* Expecting -ENOTSUPP for tcp_cdg_res */ + tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)tcp_cdg, sizeof(tcp_cdg)); + bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cc_res, sizeof(cc_res)); + return; + } + ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c new file mode 100644 index 000000000000..d836f7c372f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <stddef.h> +#include <linux/bpf.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/tcp.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +const char cubic[] = "cubic"; + +void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +{ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cubic, sizeof(cubic)); +} + +SEC(".struct_ops") +struct tcp_congestion_ops dctcp_rel = { + .release = (void *)dctcp_nouse_release, + .name = "bpf_dctcp_rel", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index 3d83b185c4bc..8cfaeba1ddbf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -12,6 +12,7 @@ #define tcp6_sock tcp6_sock___not_used #define bpf_iter__udp bpf_iter__udp___not_used #define udp6_sock udp6_sock___not_used +#define bpf_iter__unix bpf_iter__unix___not_used #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used #define bpf_iter__sockmap bpf_iter__sockmap___not_used @@ -32,6 +33,7 @@ #undef tcp6_sock #undef bpf_iter__udp #undef udp6_sock +#undef bpf_iter__unix #undef bpf_iter__bpf_map_elem #undef bpf_iter__bpf_sk_storage_map #undef bpf_iter__sockmap @@ -103,6 +105,12 @@ struct udp6_sock { struct ipv6_pinfo inet6; } __attribute__((preserve_access_index)); +struct bpf_iter__unix { + struct bpf_iter_meta *meta; + struct unix_sock *unix_sk; + uid_t uid; +} __attribute__((preserve_access_index)); + struct bpf_iter__bpf_map_elem { struct bpf_iter_meta *meta; struct bpf_map *map; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c index 6dfce3fd68bc..0aa3cd34cbe3 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index b83b5d2e17dc..6c39e86b666f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c index d58d9f1642b5..784a610ce039 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c index 95989f4c99b5..a28e51e2dcee 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..b77adfd55d73 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define bpf_tcp_sk(skc) ({ \ + struct sock_common *_skc = skc; \ + sk = NULL; \ + tp = NULL; \ + if (_skc) { \ + tp = bpf_skc_to_tcp_sock(_skc); \ + sk = (struct sock *)tp; \ + } \ + tp; \ +}) + +unsigned short reuse_listen_hport = 0; +unsigned short listen_hport = 0; +char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic"; +char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp"; +bool random_retry = false; + +static bool tcp_cc_eq(const char *a, const char *b) +{ + int i; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (a[i] != b[i]) + return false; + if (!a[i]) + break; + } + + return true; +} + +SEC("iter/tcp") +int change_tcp_cc(struct bpf_iter__tcp *ctx) +{ + char cur_cc[TCP_CA_NAME_MAX]; + struct tcp_sock *tp; + struct sock *sk; + int ret; + + if (!bpf_tcp_sk(ctx->sk_common)) + return 0; + + if (sk->sk_family != AF_INET6 || + (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_ESTABLISHED) || + (sk->sk_num != reuse_listen_hport && + sk->sk_num != listen_hport && + bpf_ntohs(sk->sk_dport) != listen_hport)) + return 0; + + if (bpf_getsockopt(tp, SOL_TCP, TCP_CONGESTION, + cur_cc, sizeof(cur_cc))) + return 0; + + if (!tcp_cc_eq(cur_cc, cubic_cc)) + return 0; + + if (random_retry && bpf_get_prandom_u32() % 4 == 1) + return 1; + + bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, dctcp_cc, sizeof(dctcp_cc)); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index b7f32c160f4e..c86b93f33b32 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c index a1ddc36f13ec..bca8b889cb10 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020, Oracle and/or its affiliates. */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_core_read.h> #include <errno.h> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index b2f7c7c5f952..6e7b400888fe 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c index 43c36f5f7649..f2b8167b72a8 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c index 11d1aa37cf11..4ea6a37d1345 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include "bpf_iter.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c index 54380c5e1069..92267abb462f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_endian.h> char _license[] SEC("license") = "GPL"; @@ -122,7 +121,7 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp, } BPF_SEQ_PRINTF(seq, "%4d: %08X:%04X %08X:%04X ", - seq_num, src, srcp, destp, destp); + seq_num, src, srcp, dest, destp); BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d ", state, tp->write_seq - tp->snd_una, rx_queue, diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c index b4fbddfa4e10..943f7bba180e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_endian.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c index ee49493dc125..400fdf8d6233 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -9,8 +9,8 @@ __u32 map1_id = 0, map2_id = 0; __u32 map1_accessed = 0, map2_accessed = 0; __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; -static volatile const __u32 print_len; -static volatile const __u32 ret1; +volatile const __u32 print_len; +volatile const __u32 ret1; SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c index f258583afbbd..cf0c485b1ed7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_endian.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c index 65f93bb03f0f..5031e21c433f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c @@ -3,7 +3,6 @@ #include "bpf_iter.h" #include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_endian.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c new file mode 100644 index 000000000000..94423902685d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; + +static long sock_i_ino(const struct sock *sk) +{ + const struct socket *sk_socket = sk->sk_socket; + const struct inode *inode; + unsigned long ino; + + if (!sk_socket) + return 0; + + inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode; + bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino); + return ino; +} + +SEC("iter/unix") +int dump_unix(struct bpf_iter__unix *ctx) +{ + struct unix_sock *unix_sk = ctx->unix_sk; + struct sock *sk = (struct sock *)unix_sk; + struct seq_file *seq; + __u32 seq_num; + + if (!unix_sk) + return 0; + + seq = ctx->meta->seq; + seq_num = ctx->meta->seq_num; + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, "Num RefCount Protocol Flags Type St Inode Path\n"); + + BPF_SEQ_PRINTF(seq, "%pK: %08X %08X %08X %04X %02X %8lu", + unix_sk, + sk->sk_refcnt.refs.counter, + 0, + sk->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + sk->sk_type, + sk->sk_socket ? + (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), + sock_i_ino(sk)); + + if (unix_sk->addr) { + if (!UNIX_ABSTRACT(unix_sk)) { + BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path); + } else { + /* The name of the abstract UNIX domain socket starts + * with '\0' and can contain '\0'. The null bytes + * should be escaped as done in unix_seq_show(). + */ + __u64 i, len; + + len = unix_sk->addr->len - sizeof(short); + + BPF_SEQ_PRINTF(seq, " @"); + + for (i = 1; i < len; i++) { + /* unix_mkname() tests this upper bound. */ + if (i >= sizeof(struct sockaddr_un)) + break; + + BPF_SEQ_PRINTF(seq, "%c", + unix_sk->addr->name->sun_path[i] ?: + '@'); + } + } + } + + BPF_SEQ_PRINTF(seq, "\n"); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 01378911252b..eef5646ddb19 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -5,6 +5,14 @@ #define AF_INET 2 #define AF_INET6 10 +#define __SO_ACCEPTCON (1 << 16) +#define UNIX_HASH_SIZE 256 +#define UNIX_ABSTRACT(unix_sk) (unix_sk->addr->hash < UNIX_HASH_SIZE) + +#define SOL_TCP 6 +#define TCP_CONGESTION 13 +#define TCP_CA_NAME_MAX 16 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -32,6 +40,8 @@ #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr +#define sk_num __sk_common.skc_num +#define sk_dport __sk_common.skc_dport #define sk_family __sk_common.skc_family #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt diff --git a/tools/testing/selftests/bpf/progs/connect4_dropper.c b/tools/testing/selftests/bpf/progs/connect4_dropper.c new file mode 100644 index 000000000000..b565d997810a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect4_dropper.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <string.h> + +#include <linux/stddef.h> +#include <linux/bpf.h> + +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define VERDICT_REJECT 0 +#define VERDICT_PROCEED 1 + +SEC("cgroup/connect4") +int connect_v4_dropper(struct bpf_sock_addr *ctx) +{ + if (ctx->type != SOCK_STREAM) + return VERDICT_PROCEED; + if (ctx->user_port == bpf_htons(60123)) + return VERDICT_REJECT; + return VERDICT_PROCEED; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c new file mode 100644 index 000000000000..a587aeca5ae0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_modify_return_test __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; + +__u64 test1_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test1_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test2_result = 0; +SEC("fexit/bpf_fentry_test2") +int BPF_PROG(test2, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test2_result = (const void *) addr == &bpf_fentry_test2; + return 0; +} + +__u64 test3_result = 0; +SEC("kprobe/bpf_fentry_test3") +int test3(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test3_result = (const void *) addr == &bpf_fentry_test3; + return 0; +} + +__u64 test4_result = 0; +SEC("kretprobe/bpf_fentry_test4") +int BPF_KRETPROBE(test4) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test4_result = (const void *) addr == &bpf_fentry_test4; + return 0; +} + +__u64 test5_result = 0; +SEC("fmod_ret/bpf_modify_return_test") +int BPF_PROG(test5, int a, int *b, int ret) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test5_result = (const void *) addr == &bpf_modify_return_test; + return ret; +} + +__u64 test6_result = 0; +SEC("kprobe/bpf_fentry_test6+0x5") +int test6(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test6_result = (const void *) addr == &bpf_fentry_test6 + 5; + return 0; +} + +__u64 test7_result = 0; +SEC("kprobe/bpf_fentry_test7+5") +int test7(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test7_result = (const void *) addr == &bpf_fentry_test7 + 5; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c index a46a264ce24e..55e283050cab 100644 --- a/tools/testing/selftests/bpf/progs/kfree_skb.c +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c @@ -109,10 +109,10 @@ int BPF_PROG(trace_kfree_skb, struct sk_buff *skb, void *location) return 0; } -static volatile struct { +struct { bool fentry_test_ok; bool fexit_test_ok; -} result; +} result = {}; SEC("fentry/eth_type_trans") int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb, struct net_device *dev, diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c index b2dcb7d9cb03..5fbd9e232d44 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -9,7 +9,7 @@ extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, __u32 c, __u64 d) __ksym; extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; int active_res = -1; -int sk_state = -1; +int sk_state_res = -1; int __noinline f1(struct __sk_buff *skb) { @@ -28,7 +28,7 @@ int __noinline f1(struct __sk_buff *skb) if (active) active_res = *active; - sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + sk_state_res = bpf_kfunc_call_test3((struct sock *)sk)->sk_state; return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); } diff --git a/tools/testing/selftests/bpf/progs/linked_maps1.c b/tools/testing/selftests/bpf/progs/linked_maps1.c index 52291515cc72..00bf1ca95986 100644 --- a/tools/testing/selftests/bpf/progs/linked_maps1.c +++ b/tools/testing/selftests/bpf/progs/linked_maps1.c @@ -75,7 +75,7 @@ int BPF_PROG(handler_exit1) val = bpf_map_lookup_elem(&map_weak, &key); if (val) output_weak1 = *val; - + return 0; } diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c index d071adf178bd..43649bce4c54 100644 --- a/tools/testing/selftests/bpf/progs/netcnt_prog.c +++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c @@ -13,21 +13,21 @@ struct { __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct percpu_net_cnt); + __type(value, union percpu_net_cnt); } percpu_netcnt SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct net_cnt); + __type(value, union net_cnt); } netcnt SEC(".maps"); SEC("cgroup/skb") int bpf_nextcnt(struct __sk_buff *skb) { - struct percpu_net_cnt *percpu_cnt; + union percpu_net_cnt *percpu_cnt; char fmt[] = "%d %llu %llu\n"; - struct net_cnt *cnt; + union net_cnt *cnt; __u64 ts, dt; int ret; diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c new file mode 100644 index 000000000000..aeff3a4f9287 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +#define AF_INET6 10 + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sockops_netns_cookies SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_msg_netns_cookies SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +SEC("sockops") +int get_netns_cookie_sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_sock *sk = ctx->sk; + int *cookie; + __u32 key = 0; + + if (ctx->family != AF_INET6) + return 1; + + if (!sk) + return 1; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + cookie = bpf_sk_storage_get(&sockops_netns_cookies, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!cookie) + return 1; + + *cookie = bpf_get_netns_cookie(ctx); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + bpf_sock_map_update(ctx, &sock_map, &key, BPF_NOEXIST); + break; + default: + break; + } + + return 1; +} + +SEC("sk_msg") +int get_netns_cookie_sk_msg(struct sk_msg_md *msg) +{ + struct bpf_sock *sk = msg->sk; + int *cookie; + + if (msg->family != AF_INET6) + return 1; + + if (!sk) + return 1; + + cookie = bpf_sk_storage_get(&sk_msg_netns_cookies, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!cookie) + return 1; + + *cookie = bpf_get_netns_cookie(msg); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c new file mode 100644 index 000000000000..1bce83b6e3a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <string.h> +#include <linux/tcp.h> +#include <netinet/in.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +SEC("cgroup/setsockopt") +int sockopt_qos_to_cc(struct bpf_sockopt *ctx) +{ + void *optval_end = ctx->optval_end; + int *optval = ctx->optval; + char buf[TCP_CA_NAME_MAX]; + char cc_reno[TCP_CA_NAME_MAX] = "reno"; + char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS) + return 1; + + if (optval + 1 > optval_end) + return 0; /* EPERM, bounds check */ + + if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 0; + + if (!tcp_cc_eq(buf, cc_cubic)) + return 0; + + if (*optval == 0x2d) { + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + sizeof(cc_reno))) + return 0; + } + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 8acdb99b5959..79c8139b63b8 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -33,6 +33,14 @@ int _getsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; + /* Make sure bpf_get_netns_cookie is callable. + */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel @@ -123,6 +131,14 @@ int _setsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; + /* Make sure bpf_get_netns_cookie is callable. + */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel diff --git a/tools/testing/selftests/bpf/progs/syscall.c b/tools/testing/selftests/bpf/progs/syscall.c new file mode 100644 index 000000000000..e550f728962d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <../../../tools/include/linux/filter.h> +#include <linux/btf.h> + +char _license[] SEC("license") = "GPL"; + +struct args { + __u64 log_buf; + __u32 log_size; + int max_entries; + int map_fd; + int prog_fd; + int btf_fd; +}; + +#define BTF_INFO_ENC(kind, kind_flag, vlen) \ + ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) +#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) +#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \ + ((encoding) << 24 | (bits_offset) << 16 | (nr_bits)) +#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ + BTF_INT_ENC(encoding, bits_offset, bits) + +static int btf_load(void) +{ + struct btf_blob { + struct btf_header btf_hdr; + __u32 types[8]; + __u32 str; + } raw_btf = { + .btf_hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_len = sizeof(__u32) * 8, + .str_off = sizeof(__u32) * 8, + .str_len = sizeof(__u32), + }, + .types = { + /* long */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8), /* [1] */ + /* unsigned long */ + BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */ + }, + }; + static union bpf_attr btf_load_attr = { + .btf_size = sizeof(raw_btf), + }; + + btf_load_attr.btf = (long)&raw_btf; + return bpf_sys_bpf(BPF_BTF_LOAD, &btf_load_attr, sizeof(btf_load_attr)); +} + +SEC("syscall") +int bpf_prog(struct args *ctx) +{ + static char license[] = "GPL"; + static struct bpf_insn insns[] = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + static union bpf_attr map_create_attr = { + .map_type = BPF_MAP_TYPE_HASH, + .key_size = 8, + .value_size = 8, + .btf_key_type_id = 1, + .btf_value_type_id = 2, + }; + static union bpf_attr map_update_attr = { .map_fd = 1, }; + static __u64 key = 12; + static __u64 value = 34; + static union bpf_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + .insn_cnt = sizeof(insns) / sizeof(insns[0]), + }; + int ret; + + ret = btf_load(); + if (ret <= 0) + return ret; + + ctx->btf_fd = ret; + map_create_attr.max_entries = ctx->max_entries; + map_create_attr.btf_fd = ret; + + prog_load_attr.license = (long) license; + prog_load_attr.insns = (long) insns; + prog_load_attr.log_buf = ctx->log_buf; + prog_load_attr.log_size = ctx->log_size; + prog_load_attr.log_level = 1; + + ret = bpf_sys_bpf(BPF_MAP_CREATE, &map_create_attr, sizeof(map_create_attr)); + if (ret <= 0) + return ret; + ctx->map_fd = ret; + insns[3].imm = ret; + + map_update_attr.map_fd = ret; + map_update_attr.key = (long) &key; + map_update_attr.value = (long) &value; + ret = bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, &map_update_attr, sizeof(map_update_attr)); + if (ret < 0) + return ret; + + ret = bpf_sys_bpf(BPF_PROG_LOAD, &prog_load_attr, sizeof(prog_load_attr)); + if (ret <= 0) + return ret; + ctx->prog_fd = ret; + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/tailcall3.c b/tools/testing/selftests/bpf/progs/tailcall3.c index 739dc2a51e74..910858fe078a 100644 --- a/tools/testing/selftests/bpf/progs/tailcall3.c +++ b/tools/testing/selftests/bpf/progs/tailcall3.c @@ -10,7 +10,7 @@ struct { __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -static volatile int count; +int count = 0; SEC("classifier/0") int bpf_func_0(struct __sk_buff *skb) diff --git a/tools/testing/selftests/bpf/progs/tailcall4.c b/tools/testing/selftests/bpf/progs/tailcall4.c index f82075b47d7d..bd4be135c39d 100644 --- a/tools/testing/selftests/bpf/progs/tailcall4.c +++ b/tools/testing/selftests/bpf/progs/tailcall4.c @@ -10,7 +10,7 @@ struct { __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -static volatile int selector; +int selector = 0; #define TAIL_FUNC(x) \ SEC("classifier/" #x) \ diff --git a/tools/testing/selftests/bpf/progs/tailcall5.c b/tools/testing/selftests/bpf/progs/tailcall5.c index ce5450744fd4..adf30a33064e 100644 --- a/tools/testing/selftests/bpf/progs/tailcall5.c +++ b/tools/testing/selftests/bpf/progs/tailcall5.c @@ -10,7 +10,7 @@ struct { __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -static volatile int selector; +int selector = 0; #define TAIL_FUNC(x) \ SEC("classifier/" #x) \ diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c index 7b1c04183824..3cc4c12817b5 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c @@ -20,7 +20,7 @@ int subprog_tail(struct __sk_buff *skb) return 1; } -static volatile int count; +int count = 0; SEC("classifier/0") int bpf_func_0(struct __sk_buff *skb) diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c index 9a1b166b7fbe..e89368a50b97 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -3,17 +3,35 @@ #include <bpf/bpf_helpers.h> struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} nop_table SEC(".maps"); + +struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); __uint(max_entries, 3); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -static volatile int count; +int count = 0; +int noise = 0; + +__always_inline int subprog_noise(void) +{ + __u32 key = 0; + + bpf_map_lookup_elem(&nop_table, &key); + return 0; +} __noinline int subprog_tail_2(struct __sk_buff *skb) { + if (noise) + subprog_noise(); bpf_tail_call_static(skb, &jmp_table, 2); return skb->len * 3; } diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c new file mode 100644 index 000000000000..2d3a7710e2ce --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int my_tid; + +int kprobe_res; +int kprobe_multi_res; +int kretprobe_res; +int uprobe_res; +int uretprobe_res; +int tp_res; +int pe_res; + +static void update(void *ctx, int *res) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid()) + return; + + *res |= bpf_get_attach_cookie(ctx); +} + +SEC("kprobe/sys_nanosleep") +int handle_kprobe(struct pt_regs *ctx) +{ + update(ctx, &kprobe_res); + return 0; +} + +SEC("kretprobe/sys_nanosleep") +int handle_kretprobe(struct pt_regs *ctx) +{ + update(ctx, &kretprobe_res); + return 0; +} + +SEC("uprobe/trigger_func") +int handle_uprobe(struct pt_regs *ctx) +{ + update(ctx, &uprobe_res); + return 0; +} + +SEC("uretprobe/trigger_func") +int handle_uretprobe(struct pt_regs *ctx) +{ + update(ctx, &uretprobe_res); + return 0; +} + +/* bpf_prog_array, used by kernel internally to keep track of attached BPF + * programs to a given BPF hook (e.g., for tracepoints) doesn't allow the same + * BPF program to be attached multiple times. So have three identical copies + * ready to attach to the same tracepoint. + */ +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp1(struct pt_regs *ctx) +{ + update(ctx, &tp_res); + return 0; +} +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp2(struct pt_regs *ctx) +{ + update(ctx, &tp_res); + return 0; +} +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp3(void *ctx) +{ + update(ctx, &tp_res); + return 1; +} + +SEC("perf_event") +int handle_pe(struct pt_regs *ctx) +{ + update(ctx, &pe_res); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index c4a9bae96e75..71184af57749 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -11,8 +11,8 @@ char _license[] SEC("license") = "GPL"; /* Userspace will update with MTU it can see on device */ -static volatile const int GLOBAL_USER_MTU; -static volatile const __u32 GLOBAL_USER_IFINDEX; +volatile const int GLOBAL_USER_MTU; +volatile const __u32 GLOBAL_USER_IFINDEX; /* BPF-prog will update these with MTU values it can see */ __u32 global_bpf_mtu_xdp = 0; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index 3c1e042962e6..e2a5acc4785c 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -39,8 +39,8 @@ char _license[] SEC("license") = "Dual BSD/GPL"; /** * Destination port and IP used for UDP encapsulation. */ -static volatile const __be16 ENCAPSULATION_PORT; -static volatile const __be32 ENCAPSULATION_IP; +volatile const __be16 ENCAPSULATION_PORT; +volatile const __be32 ENCAPSULATION_IP; typedef struct { uint64_t processed_packets_total; diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c index 44f5aa2e8956..9a7829c5e4a7 100644 --- a/tools/testing/selftests/bpf/progs/test_core_autosize.c +++ b/tools/testing/selftests/bpf/progs/test_core_autosize.c @@ -125,6 +125,16 @@ int handle_downsize(void *ctx) return 0; } +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define bpf_core_read_int bpf_core_read +#else +#define bpf_core_read_int(dst, sz, src) ({ \ + /* Prevent "subtraction from stack pointer prohibited" */ \ + volatile long __off = sizeof(*dst) - (sz); \ + bpf_core_read((char *)(dst) + __off, sz, src); \ +}) +#endif + SEC("raw_tp/sys_enter") int handle_probed(void *ctx) { @@ -132,23 +142,23 @@ int handle_probed(void *ctx) __u64 tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr); + bpf_core_read_int(&tmp, bpf_core_field_size(in->ptr), &in->ptr); ptr_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val1), &in->val1); val1_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val2), &in->val2); val2_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val3), &in->val3); val3_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val4), &in->val4); val4_probed = tmp; return 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_func_args.c b/tools/testing/selftests/bpf/progs/test_global_func_args.c index cae309538a9e..e712bf77daae 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func_args.c +++ b/tools/testing/selftests/bpf/progs/test_global_func_args.c @@ -8,7 +8,7 @@ struct S { int v; }; -static volatile struct S global_variable; +struct S global_variable = {}; struct { __uint(type, BPF_MAP_TYPE_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c new file mode 100644 index 000000000000..5f8379aadb29 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test weak ksyms. + * + * Copyright (c) 2021 Google + */ + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +int out__existing_typed = -1; +__u64 out__existing_typeless = -1; + +__u64 out__non_existent_typeless = -1; +__u64 out__non_existent_typed = -1; + +/* existing weak symbols */ + +/* test existing weak symbols can be resolved. */ +extern const struct rq runqueues __ksym __weak; /* typed */ +extern const void bpf_prog_active __ksym __weak; /* typeless */ + + +/* non-existent weak symbols. */ + +/* typeless symbols, default to zero. */ +extern const void bpf_link_fops1 __ksym __weak; + +/* typed symbols, default to zero. */ +extern const int bpf_link_fops2 __ksym __weak; + +SEC("raw_tp/sys_enter") +int pass_handler(const void *ctx) +{ + struct rq *rq; + + /* tests existing symbols. */ + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); + if (rq) + out__existing_typed = rq->cpu; + out__existing_typeless = (__u64)&bpf_prog_active; + + /* tests non-existent symbols. */ + out__non_existent_typeless = (__u64)&bpf_link_fops1; + + /* tests non-existent symbols. */ + out__non_existent_typed = (__u64)&bpf_link_fops2; + + if (&bpf_link_fops2) /* can't happen */ + out__non_existent_typed = (__u64)bpf_per_cpu_ptr(&bpf_link_fops2, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c new file mode 100644 index 000000000000..3a193f42c7e7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +__u32 set_pid = 0; +__u64 set_key = 0; +__u64 set_value = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hash_map SEC(".maps"); + +SEC("tp/syscalls/sys_enter_getpgid") +int bpf_lookup_and_delete_test(const void *ctx) +{ + if (set_pid == bpf_get_current_pid_tgid() >> 32) + bpf_map_update_elem(&hash_map, &set_key, &set_value, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c new file mode 100644 index 000000000000..703c08e06442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Isovalent, Inc. */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct inner { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, int); + __uint(max_entries, 4); +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 0); /* This will make map creation to fail */ + __uint(key_size, sizeof(__u32)); + __array(values, struct inner); +} mim SEC(".maps"); + +SEC("xdp") +int xdp_noop0(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c new file mode 100644 index 000000000000..27df571abf5b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. If reuse_md->migrating_sk is NULL (SYN packet), + * return SK_PASS without selecting a listener. + * 2. If reuse_md->migrating_sk is not NULL (socket migration), + * select a listener (reuseport_map[migrate_map[cookie]]) + * + * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + */ + +#include <stddef.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/in.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); + __uint(max_entries, 256); + __type(key, int); + __type(value, __u64); +} reuseport_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 256); + __type(key, __u64); + __type(value, int); +} migrate_map SEC(".maps"); + +int migrated_at_close = 0; +int migrated_at_close_fastopen = 0; +int migrated_at_send_synack = 0; +int migrated_at_recv_ack = 0; +__be16 server_port; + +SEC("xdp") +int drop_ack(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + struct tcphdr *tcp = NULL; + + if (eth + 1 > data_end) + goto pass; + + switch (bpf_ntohs(eth->h_proto)) { + case ETH_P_IP: { + struct iphdr *ip = (struct iphdr *)(eth + 1); + + if (ip + 1 > data_end) + goto pass; + + if (ip->protocol != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)((void *)ip + ip->ihl * 4); + break; + } + case ETH_P_IPV6: { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)(eth + 1); + + if (ipv6 + 1 > data_end) + goto pass; + + if (ipv6->nexthdr != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)(ipv6 + 1); + break; + } + default: + goto pass; + } + + if (tcp + 1 > data_end) + goto pass; + + if (tcp->dest != server_port) + goto pass; + + if (!tcp->syn && tcp->ack) + return XDP_DROP; + +pass: + return XDP_PASS; +} + +SEC("sk_reuseport/migrate") +int migrate_reuseport(struct sk_reuseport_md *reuse_md) +{ + int *key, flags = 0, state, err; + __u64 cookie; + + if (!reuse_md->migrating_sk) + return SK_PASS; + + state = reuse_md->migrating_sk->state; + cookie = bpf_get_socket_cookie(reuse_md->sk); + + key = bpf_map_lookup_elem(&migrate_map, &cookie); + if (!key) + return SK_DROP; + + err = bpf_sk_select_reuseport(reuse_md, &reuseport_map, key, flags); + if (err) + return SK_PASS; + + switch (state) { + case BPF_TCP_ESTABLISHED: + __sync_fetch_and_add(&migrated_at_close, 1); + break; + case BPF_TCP_SYN_RECV: + __sync_fetch_and_add(&migrated_at_close_fastopen, 1); + break; + case BPF_TCP_NEW_SYN_RECV: + if (!reuse_md->len) + __sync_fetch_and_add(&migrated_at_send_synack, 1); + else + __sync_fetch_and_add(&migrated_at_recv_ack, 1); + break; + } + + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_link.c b/tools/testing/selftests/bpf/progs/test_perf_link.c new file mode 100644 index 000000000000..c1db9fd98d0c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_link.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int run_cnt = 0; + +SEC("perf_event") +int handler(struct pt_regs *ctx) +{ + __sync_fetch_and_add(&run_cnt, 1); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c index ecbeea2df259..fc8e8a34a3db 100644 --- a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c +++ b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c @@ -5,7 +5,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -static volatile const struct { +const struct { unsigned a[4]; /* * if the struct's size is multiple of 16, compiler will put it into @@ -15,11 +15,11 @@ static volatile const struct { char _y; } rdonly_values = { .a = {2, 3, 4, 5} }; -static volatile struct { +struct { unsigned did_run; unsigned iters; unsigned sum; -} res; +} res = {}; SEC("raw_tracepoint/sys_enter:skip_loop") int skip_loop(struct pt_regs *ctx) diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 6b3f288b7c63..eaa7d9dba0be 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -35,7 +35,7 @@ long prod_pos = 0; /* inner state */ long seq = 0; -SEC("tp/syscalls/sys_enter_getpgid") +SEC("fentry/__x64_sys_getpgid") int test_ringbuf(void *ctx) { int cur_pid = bpf_get_current_pid_tgid() >> 32; @@ -48,7 +48,7 @@ int test_ringbuf(void *ctx) sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); if (!sample) { __sync_fetch_and_add(&dropped, 1); - return 1; + return 0; } sample->pid = pid; diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index e83d0b48d80c..8249075f088f 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -91,7 +91,7 @@ int bpf_sk_lookup_test1(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_use_after_free") +SEC("classifier/err_use_after_free") int bpf_sk_lookup_uaf(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -106,7 +106,7 @@ int bpf_sk_lookup_uaf(struct __sk_buff *skb) return family; } -SEC("classifier/fail_modify_sk_pointer") +SEC("classifier/err_modify_sk_pointer") int bpf_sk_lookup_modptr(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -121,7 +121,7 @@ int bpf_sk_lookup_modptr(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_modify_sk_or_null_pointer") +SEC("classifier/err_modify_sk_or_null_pointer") int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -135,7 +135,7 @@ int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_no_release") +SEC("classifier/err_no_release") int bpf_sk_lookup_test2(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -144,7 +144,7 @@ int bpf_sk_lookup_test2(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_release_twice") +SEC("classifier/err_release_twice") int bpf_sk_lookup_test3(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -156,7 +156,7 @@ int bpf_sk_lookup_test3(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_release_unchecked") +SEC("classifier/err_release_unchecked") int bpf_sk_lookup_test4(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -173,7 +173,7 @@ void lookup_no_release(struct __sk_buff *skb) bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); } -SEC("classifier/fail_no_release_subcall") +SEC("classifier/err_no_release_subcall") int bpf_sk_lookup_test5(struct __sk_buff *skb) { lookup_no_release(skb); diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index 374ccef704e1..441fa1c552c8 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -38,11 +38,11 @@ extern int LINUX_KERNEL_VERSION __kconfig; bool bpf_syscall = 0; int kern_ver = 0; +struct s out5 = {}; + SEC("raw_tp/sys_enter") int handler(const void *ctx) { - static volatile struct s out5; - out1 = in1; out2 = in2; out3 = in3; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c index e35129bea0a0..8fda07544023 100644 --- a/tools/testing/selftests/bpf/progs/test_snprintf.c +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -3,7 +3,6 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> __u32 pid = 0; @@ -60,9 +59,9 @@ int handler(const void *ctx) /* Kernel pointers */ addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); - /* Strings embedding */ - str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", - str1, longstr); + /* Strings and single-byte character embedding */ + str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s % 9c %+2c %-3c %04c %0c %+05s", + str1, 'a', 'b', 'c', 'd', 'e', longstr); /* Overflow */ over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); /* Padding of fixed width numbers */ diff --git a/tools/testing/selftests/bpf/progs/test_snprintf_single.c b/tools/testing/selftests/bpf/progs/test_snprintf_single.c index 402adaf344f9..3095837334d3 100644 --- a/tools/testing/selftests/bpf/progs/test_snprintf_single.c +++ b/tools/testing/selftests/bpf/progs/test_snprintf_single.c @@ -5,7 +5,7 @@ #include <bpf/bpf_helpers.h> /* The format string is filled from the userspace such that loading fails */ -static const char fmt[10]; +const char fmt[10]; SEC("raw_tp/sys_enter") int handler(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a39eba9f5201..a1cc58b10c7c 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -28,8 +28,8 @@ struct { __type(value, unsigned int); } verdict_map SEC(".maps"); -static volatile bool test_sockmap; /* toggled by user-space */ -static volatile bool test_ingress; /* toggled by user-space */ +bool test_sockmap = false; /* toggled by user-space */ +bool test_ingress = false; /* toggled by user-space */ SEC("sk_skb/stream_parser") int prog_stream_parser(struct __sk_buff *skb) diff --git a/tools/testing/selftests/bpf/progs/test_static_linked1.c b/tools/testing/selftests/bpf/progs/test_static_linked1.c index ea1a6c4c7172..4f0b612e1661 100644 --- a/tools/testing/selftests/bpf/progs/test_static_linked1.c +++ b/tools/testing/selftests/bpf/progs/test_static_linked1.c @@ -4,10 +4,10 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -/* 8-byte aligned .bss */ -static volatile long static_var1; -static volatile int static_var11; -int var1 = 0; +/* 8-byte aligned .data */ +static volatile long static_var1 = 2; +static volatile int static_var2 = 3; +int var1 = -1; /* 4-byte aligned .rodata */ const volatile int rovar1; @@ -21,7 +21,7 @@ static __noinline int subprog(int x) SEC("raw_tp/sys_enter") int handler1(const void *ctx) { - var1 = subprog(rovar1) + static_var1 + static_var11; + var1 = subprog(rovar1) + static_var1 + static_var2; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_static_linked2.c b/tools/testing/selftests/bpf/progs/test_static_linked2.c index 54d8d1ab577c..766ebd502a60 100644 --- a/tools/testing/selftests/bpf/progs/test_static_linked2.c +++ b/tools/testing/selftests/bpf/progs/test_static_linked2.c @@ -4,10 +4,10 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -/* 4-byte aligned .bss */ -static volatile int static_var2; -static volatile int static_var22; -int var2 = 0; +/* 4-byte aligned .data */ +static volatile int static_var1 = 5; +static volatile int static_var2 = 6; +int var2 = -1; /* 8-byte aligned .rodata */ const volatile long rovar2; @@ -21,7 +21,7 @@ static __noinline int subprog(int x) SEC("raw_tp/sys_enter") int handler2(const void *ctx) { - var2 = subprog(rovar2) + static_var2 + static_var22; + var2 = subprog(rovar2) + static_var1 + static_var2; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_subprogs.c b/tools/testing/selftests/bpf/progs/test_subprogs.c index d3c5673c0218..b7c37ca09544 100644 --- a/tools/testing/selftests/bpf/progs/test_subprogs.c +++ b/tools/testing/selftests/bpf/progs/test_subprogs.c @@ -4,8 +4,18 @@ const char LICENSE[] SEC("license") = "GPL"; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array SEC(".maps"); + __noinline int sub1(int x) { + int key = 0; + + bpf_map_lookup_elem(&array, &key); return x + 1; } @@ -23,6 +33,9 @@ static __noinline int sub3(int z) static __noinline int sub4(int w) { + int key = 0; + + bpf_map_lookup_elem(&array, &key); return w + sub3(5) + sub1(6); } diff --git a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c new file mode 100644 index 000000000000..e6cb09259408 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define PT_REGS_SIZE sizeof(struct pt_regs) + +/* + * The kernel struct pt_regs isn't exported in its entirety to userspace. + * Pass it as an array to task_pt_regs.c + */ +char current_regs[PT_REGS_SIZE] = {}; +char ctx_regs[PT_REGS_SIZE] = {}; +int uprobe_res = 0; + +SEC("uprobe/trigger_func") +int handle_uprobe(struct pt_regs *ctx) +{ + struct task_struct *current; + struct pt_regs *regs; + + current = bpf_get_current_task_btf(); + regs = (struct pt_regs *) bpf_task_pt_regs(current); + if (bpf_probe_read_kernel(current_regs, PT_REGS_SIZE, regs)) + return 0; + if (bpf_probe_read_kernel(ctx_regs, PT_REGS_SIZE, ctx)) + return 0; + + /* Prove that uprobe was run */ + uprobe_res = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_bpf.c b/tools/testing/selftests/bpf/progs/test_tc_bpf.c new file mode 100644 index 000000000000..18a3a7ed924a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_bpf.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* Dummy prog to test TC-BPF API */ + +SEC("classifier") +int cls(struct __sk_buff *skb) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 84cd63259554..a0e7762b1e5a 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -528,7 +528,6 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { - char buf[sizeof(struct v6hdr)]; struct gre_hdr greh; struct udphdr udph; int olen = len; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c b/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c new file mode 100644 index 000000000000..d7b88cd05afd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp") +int xdp_context(struct xdp_md *xdp) +{ + void *data = (void *)(long)xdp->data; + __u32 *metadata = (void *)(long)xdp->data_meta; + __u32 ret; + + if (metadata + 1 > data) + return XDP_ABORTED; + ret = *metadata; + if (bpf_xdp_adjust_meta(xdp, 4)) + return XDP_ABORTED; + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c new file mode 100644 index 000000000000..5f5309791649 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int counter; + struct bpf_timer timer; + struct bpf_spin_lock lock; /* unused */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +__u64 bss_data; +__u64 err; +__u64 ok; +__u64 callback_check = 52; +__u64 callback2_check = 52; + +#define ARRAY 1 +#define HTAB 2 +#define HTAB_MALLOC 3 +#define LRU 4 + +/* callback for array and lru timers */ +static int timer_cb1(void *map, int *key, struct bpf_timer *timer) +{ + /* increment bss variable twice. + * Once via array timer callback and once via lru timer callback + */ + bss_data += 5; + + /* *key == 0 - the callback was called for array timer. + * *key == 4 - the callback was called from lru timer. + */ + if (*key == ARRAY) { + struct bpf_timer *lru_timer; + int lru_key = LRU; + + /* rearm array timer to be called again in ~35 seconds */ + if (bpf_timer_start(timer, 1ull << 35, 0) != 0) + err |= 1; + + lru_timer = bpf_map_lookup_elem(&lru, &lru_key); + if (!lru_timer) + return 0; + bpf_timer_set_callback(lru_timer, timer_cb1); + if (bpf_timer_start(lru_timer, 0, 0) != 0) + err |= 2; + } else if (*key == LRU) { + int lru_key, i; + + for (i = LRU + 1; + i <= 100 /* for current LRU eviction algorithm this number + * should be larger than ~ lru->max_entries * 2 + */; + i++) { + struct elem init = {}; + + /* lru_key cannot be used as loop induction variable + * otherwise the loop will be unbounded. + */ + lru_key = i; + + /* add more elements into lru map to push out current + * element and force deletion of this timer + */ + bpf_map_update_elem(map, &lru_key, &init, 0); + /* look it up to bump it into active list */ + bpf_map_lookup_elem(map, &lru_key); + + /* keep adding until *key changes underneath, + * which means that key/timer memory was reused + */ + if (*key != LRU) + break; + } + + /* check that the timer was removed */ + if (bpf_timer_cancel(timer) != -EINVAL) + err |= 4; + ok |= 1; + } + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct bpf_timer *arr_timer, *lru_timer; + struct elem init = {}; + int lru_key = LRU; + int array_key = ARRAY; + + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + + bpf_map_update_elem(&lru, &lru_key, &init, 0); + lru_timer = bpf_map_lookup_elem(&lru, &lru_key); + if (!lru_timer) + return 0; + bpf_timer_init(lru_timer, &lru, CLOCK_MONOTONIC); + + bpf_timer_set_callback(arr_timer, timer_cb1); + bpf_timer_start(arr_timer, 0 /* call timer_cb1 asap */, 0); + + /* init more timers to check that array destruction + * doesn't leak timer memory. + */ + array_key = 0; + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + return 0; +} + +/* callback for prealloc and non-prealloca hashtab timers */ +static int timer_cb2(void *map, int *key, struct hmap_elem *val) +{ + if (*key == HTAB) + callback_check--; + else + callback2_check--; + if (val->counter > 0 && --val->counter) { + /* re-arm the timer again to execute after 1 usec */ + bpf_timer_start(&val->timer, 1000, 0); + } else if (*key == HTAB) { + struct bpf_timer *arr_timer; + int array_key = ARRAY; + + /* cancel arr_timer otherwise bpf_fentry_test1 prog + * will stay alive forever. + */ + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + if (bpf_timer_cancel(arr_timer) != 1) + /* bpf_timer_cancel should return 1 to indicate + * that arr_timer was active at this time + */ + err |= 8; + + /* try to cancel ourself. It shouldn't deadlock. */ + if (bpf_timer_cancel(&val->timer) != -EDEADLK) + err |= 16; + + /* delete this key and this timer anyway. + * It shouldn't deadlock either. + */ + bpf_map_delete_elem(map, key); + + /* in preallocated hashmap both 'key' and 'val' could have been + * reused to store another map element (like in LRU above), + * but in controlled test environment the below test works. + * It's not a use-after-free. The memory is owned by the map. + */ + if (bpf_timer_start(&val->timer, 1000, 0) != -EINVAL) + err |= 32; + ok |= 2; + } else { + if (*key != HTAB_MALLOC) + err |= 64; + + /* try to cancel ourself. It shouldn't deadlock. */ + if (bpf_timer_cancel(&val->timer) != -EDEADLK) + err |= 128; + + /* delete this key and this timer anyway. + * It shouldn't deadlock either. + */ + bpf_map_delete_elem(map, key); + + /* in non-preallocated hashmap both 'key' and 'val' are RCU + * protected and still valid though this element was deleted + * from the map. Arm this timer for ~35 seconds. When callback + * finishes the call_rcu will invoke: + * htab_elem_free_rcu + * check_and_free_timer + * bpf_timer_cancel_and_free + * to cancel this 35 second sleep and delete the timer for real. + */ + if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0) + err |= 256; + ok |= 4; + } + return 0; +} + +int bpf_timer_test(void) +{ + struct hmap_elem *val; + int key = HTAB, key_malloc = HTAB_MALLOC; + + val = bpf_map_lookup_elem(&hmap, &key); + if (val) { + if (bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME) != 0) + err |= 512; + bpf_timer_set_callback(&val->timer, timer_cb2); + bpf_timer_start(&val->timer, 1000, 0); + } + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) { + if (bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME) != 0) + err |= 1024; + bpf_timer_set_callback(&val->timer, timer_cb2); + bpf_timer_start(&val->timer, 1000, 0); + } + return 0; +} + +SEC("fentry/bpf_fentry_test2") +int BPF_PROG(test2, int a, int b) +{ + struct hmap_elem init = {}, *val; + int key = HTAB, key_malloc = HTAB_MALLOC; + + init.counter = 10; /* number of times to trigger timer_cb2 */ + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + /* update the same key to free the timer */ + bpf_map_update_elem(&hmap, &key, &init, 0); + + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + /* update the same key to free the timer */ + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + + /* init more timers to check that htab operations + * don't leak timer memory. + */ + key = 0; + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + bpf_map_delete_elem(&hmap, &key); + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + + /* and with non-prealloc htab */ + key_malloc = 0; + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + bpf_map_delete_elem(&hmap_malloc, &key_malloc); + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + + return bpf_timer_test(); +} diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c new file mode 100644 index 000000000000..2fee7ab105ef --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int pad; /* unused */ + struct bpf_timer timer; +}; + +struct inner_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, struct hmap_elem); +} inner_htab SEC(".maps"); + +#define ARRAY_KEY 1 +#define HASH_KEY 1234 + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 2); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct inner_map); +} outer_arr SEC(".maps") = { + .values = { [ARRAY_KEY] = &inner_htab }, +}; + +__u64 err; +__u64 ok; +__u64 cnt; + +static int timer_cb1(void *map, int *key, struct hmap_elem *val); + +static int timer_cb2(void *map, int *key, struct hmap_elem *val) +{ + cnt++; + bpf_timer_set_callback(&val->timer, timer_cb1); + if (bpf_timer_start(&val->timer, 1000, 0)) + err |= 1; + ok |= 1; + return 0; +} + +/* callback for inner hash map */ +static int timer_cb1(void *map, int *key, struct hmap_elem *val) +{ + cnt++; + bpf_timer_set_callback(&val->timer, timer_cb2); + if (bpf_timer_start(&val->timer, 1000, 0)) + err |= 2; + /* Do a lookup to make sure 'map' and 'key' pointers are correct */ + bpf_map_lookup_elem(map, key); + ok |= 2; + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct hmap_elem init = {}; + struct bpf_map *inner_map; + struct hmap_elem *val; + int array_key = ARRAY_KEY; + int hash_key = HASH_KEY; + + inner_map = bpf_map_lookup_elem(&outer_arr, &array_key); + if (!inner_map) + return 0; + + bpf_map_update_elem(inner_map, &hash_key, &init, 0); + val = bpf_map_lookup_elem(inner_map, &hash_key); + if (!val) + return 0; + + bpf_timer_init(&val->timer, inner_map, CLOCK_MONOTONIC); + if (bpf_timer_set_callback(&val->timer, timer_cb1)) + err |= 4; + if (bpf_timer_start(&val->timer, 0, 0)) + err |= 8; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c new file mode 100644 index 000000000000..5d648e3d8a41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int pad; /* unused */ + struct bpf_timer timer; +}; + +struct inner_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, struct hmap_elem); +} inner_htab SEC(".maps"); + +#define ARRAY_KEY 1 +#define ARRAY_KEY2 2 +#define HASH_KEY 1234 + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 2); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct inner_map); +} outer_arr SEC(".maps") = { + .values = { [ARRAY_KEY] = &inner_htab }, +}; + +__u64 err; +__u64 ok; +__u64 cnt; + +/* callback for inner hash map */ +static int timer_cb(void *map, int *key, struct hmap_elem *val) +{ + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct hmap_elem init = {}; + struct bpf_map *inner_map, *inner_map2; + struct hmap_elem *val; + int array_key = ARRAY_KEY; + int array_key2 = ARRAY_KEY2; + int hash_key = HASH_KEY; + + inner_map = bpf_map_lookup_elem(&outer_arr, &array_key); + if (!inner_map) + return 0; + + inner_map2 = bpf_map_lookup_elem(&outer_arr, &array_key2); + if (!inner_map2) + return 0; + bpf_map_update_elem(inner_map, &hash_key, &init, 0); + val = bpf_map_lookup_elem(inner_map, &hash_key); + if (!val) + return 0; + + bpf_timer_init(&val->timer, inner_map2, CLOCK_MONOTONIC); + if (bpf_timer_set_callback(&val->timer, timer_cb)) + err |= 4; + if (bpf_timer_start(&val->timer, 0, 0)) + err |= 8; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/trace_printk.c b/tools/testing/selftests/bpf/progs/trace_printk.c index 8ca7f399b670..119582aa105a 100644 --- a/tools/testing/selftests/bpf/progs/trace_printk.c +++ b/tools/testing/selftests/bpf/progs/trace_printk.c @@ -10,11 +10,11 @@ char _license[] SEC("license") = "GPL"; int trace_printk_ret = 0; int trace_printk_ran = 0; -SEC("tp/raw_syscalls/sys_enter") +const char fmt[] = "Testing,testing %d\n"; + +SEC("fentry/__x64_sys_nanosleep") int sys_enter(void *ctx) { - static const char fmt[] = "testing,testing %d\n"; - trace_printk_ret = bpf_trace_printk(fmt, sizeof(fmt), ++trace_printk_ran); return 0; diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c new file mode 100644 index 000000000000..880debcbcd65 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KBUILD_MODNAME "foo" +#include <string.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +/* One map use devmap, another one use devmap_hash for testing */ +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1024); +} map_all SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 128); +} map_egress SEC(".maps"); + +/* map to store egress interfaces mac addresses */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, __be64); + __uint(max_entries, 128); +} mac_map SEC(".maps"); + +SEC("xdp_redirect_map_multi") +int xdp_redirect_map_multi_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + int if_index = ctx->ingress_ifindex; + struct ethhdr *eth = data; + __u16 h_proto; + __u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + h_proto = eth->h_proto; + + /* Using IPv4 for (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) testing */ + if (h_proto == bpf_htons(ETH_P_IP)) + return bpf_redirect_map(&map_all, 0, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); + /* Using IPv6 for none flag testing */ + else if (h_proto == bpf_htons(ETH_P_IPV6)) + return bpf_redirect_map(&map_all, if_index, 0); + /* All others for BPF_F_BROADCAST testing */ + else + return bpf_redirect_map(&map_all, 0, BPF_F_BROADCAST); +} + +/* The following 2 progs are for 2nd devmap prog testing */ +SEC("xdp_redirect_map_ingress") +int xdp_redirect_map_all_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&map_egress, 0, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); +} + +SEC("xdp_devmap/map_prog") +int xdp_devmap_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 key = ctx->egress_ifindex; + struct ethhdr *eth = data; + __u64 nh_off; + __be64 *mac; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + mac = bpf_map_lookup_elem(&mac_map, &key); + if (mac) + __builtin_memcpy(eth->h_source, mac, ETH_ALEN); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c index 94e6c2b281cb..5f725c720e00 100644 --- a/tools/testing/selftests/bpf/progs/xdp_tx.c +++ b/tools/testing/selftests/bpf/progs/xdp_tx.c @@ -3,7 +3,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -SEC("tx") +SEC("xdp") int xdp_tx(struct xdp_md *xdp) { return XDP_TX; diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh index 66690778e36d..718f59692ccb 100755 --- a/tools/testing/selftests/bpf/test_bpftool.sh +++ b/tools/testing/selftests/bpf/test_bpftool.sh @@ -2,4 +2,10 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2020 SUSE LLC. +# 'make -C tools/testing/selftests/bpf install' will install to SCRIPT_DIR +SCRIPT_DIR=$(dirname $(realpath $0)) + +# 'make -C tools/testing/selftests/bpf' will install to BPFTOOL_INSTALL_PATH +BPFTOOL_INSTALL_PATH="$SCRIPT_DIR"/tools/sbin +export PATH=$SCRIPT_DIR:$BPFTOOL_INSTALL_PATH:$PATH python3 -m unittest -v test_bpftool.TestBpftool diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index ac349a5cea7e..b03a87571592 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -22,7 +22,7 @@ KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) cd $KDIR_ROOT_DIR if [ ! -e tools/bpf/bpftool/Makefile ]; then echo -e "skip: bpftool files not found!\n" - exit 0 + exit 4 # KSFT_SKIP=4 fi ERROR=0 diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py new file mode 100755 index 000000000000..be54b7335a76 --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -0,0 +1,586 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# +# Copyright (C) 2021 Isovalent, Inc. + +import argparse +import re +import os, sys + +LINUX_ROOT = os.path.abspath(os.path.join(__file__, + os.pardir, os.pardir, os.pardir, os.pardir, os.pardir)) +BPFTOOL_DIR = os.path.join(LINUX_ROOT, 'tools/bpf/bpftool') +retval = 0 + +class BlockParser(object): + """ + A parser for extracting set of values from blocks such as enums. + @reader: a pointer to the open file to parse + """ + def __init__(self, reader): + self.reader = reader + + def search_block(self, start_marker): + """ + Search for a given structure in a file. + @start_marker: regex marking the beginning of a structure to parse + """ + offset = self.reader.tell() + array_start = re.search(start_marker, self.reader.read()) + if array_start is None: + raise Exception('Failed to find start of block') + self.reader.seek(offset + array_start.start()) + + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Values to extract must be + on separate lines in the file. + @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line or re.match(end_marker, line): + break + capture = pattern.search(line) + if capture and pattern.groups >= 1: + entries.add(capture.group(1)) + return entries + +class ArrayParser(BlockParser): + """ + A parser for extracting dicionaries of values from some BPF-related arrays. + @reader: a pointer to the open file to parse + @array_name: name of the array to parse + """ + end_marker = re.compile('^};') + + def __init__(self, reader, array_name): + self.array_name = array_name + self.start_marker = re.compile(f'(static )?const char \* const {self.array_name}\[.*\] = {{\n') + super().__init__(reader) + + def search_block(self): + """ + Search for the given array in a file. + """ + super().search_block(self.start_marker); + + def parse(self): + """ + Parse a block and return data as a dictionary. Items to extract must be + on separate lines in the file. + """ + pattern = re.compile('\[(BPF_\w*)\]\s*= "(.*)",?$') + entries = {} + while True: + line = self.reader.readline() + if line == '' or re.match(self.end_marker, line): + break + capture = pattern.search(line) + if capture: + entries[capture.group(1)] = capture.group(2) + return entries + +class InlineListParser(BlockParser): + """ + A parser for extracting set of values from inline lists. + """ + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Multiple values to extract + can be on a same line in the file. + @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line: + break + entries.update(pattern.findall(line)) + if re.search(end_marker, line): + break + return entries + +class FileExtractor(object): + """ + A generic reader for extracting data from a given file. This class contains + several helper methods that wrap arround parser objects to extract values + from different structures. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def __init__(self): + self.reader = open(self.filename, 'r') + + def close(self): + """ + Close the file used by the parser. + """ + self.reader.close() + + def reset_read(self): + """ + Reset the file position indicator for this parser. This is useful when + parsing several structures in the file without respecting the order in + which those structures appear in the file. + """ + self.reader.seek(0) + + def get_types_from_array(self, array_name): + """ + Search for and parse an array associating names to BPF_* enum members, + for example: + + const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + }; + + Return a dictionary with the enum member names as keys and the + associated names as values, for example: + + {'BPF_PROG_TYPE_UNSPEC': 'unspec', + 'BPF_PROG_TYPE_SOCKET_FILTER': 'socket_filter', + 'BPF_PROG_TYPE_KPROBE': 'kprobe'} + + @array_name: name of the array to parse + """ + array_parser = ArrayParser(self.reader, array_name) + array_parser.search_block() + return array_parser.parse() + + def get_enum(self, enum_name): + """ + Search for and parse an enum containing BPF_* members, for example: + + enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + }; + + Return a set containing all member names, for example: + + {'BPF_PROG_TYPE_UNSPEC', + 'BPF_PROG_TYPE_SOCKET_FILTER', + 'BPF_PROG_TYPE_KPROBE'} + + @enum_name: name of the enum to parse + """ + start_marker = re.compile(f'enum {enum_name} {{\n') + pattern = re.compile('^\s*(BPF_\w+),?$') + end_marker = re.compile('^};') + parser = BlockParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def __get_description_list(self, start_marker, pattern, end_marker): + parser = InlineListParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def get_rst_list(self, block_name): + """ + Search for and parse a list of type names from RST documentation, for + example: + + | *TYPE* := { + | **socket** | **kprobe** | + | **kretprobe** + | } + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'\*{block_name}\* := {{') + pattern = re.compile('\*\*([\w/-]+)\*\*') + end_marker = re.compile('}\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_help_list(self, block_name): + """ + Search for and parse a list of type names from a help message in + bpftool, for example: + + " TYPE := { socket | kprobe |\\n" + " kretprobe }\\n" + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'"\s*{block_name} := {{') + pattern = re.compile('([\w/]+) [|}]') + end_marker = re.compile('}') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_help_list_macro(self, macro): + """ + Search for and parse a list of values from a help message starting with + a macro in bpftool, for example: + + " " HELP_SPEC_OPTIONS " |\\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }\\n" + + Return a set containing all item names, for example: + + {'-f', '--bpffs', '-m', '--mapcompat', '-n', '--nomount'} + + @macro: macro starting the block, 'HELP_SPEC_OPTIONS' in the example + """ + start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') + pattern = re.compile('([\w-]+) ?(?:\||}[ }\]])') + end_marker = re.compile('}\\\\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def default_options(self): + """ + Return the default options contained in HELP_SPEC_OPTIONS + """ + return { '-j', '--json', '-p', '--pretty', '-d', '--debug' } + + def get_bashcomp_list(self, block_name): + """ + Search for and parse a list of type names from a variable in bash + completion file, for example: + + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \\ + kretprobe' + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'local {block_name}=\'') + pattern = re.compile('(?:.*=\')?([\w/]+)') + end_marker = re.compile('\'$') + return self.__get_description_list(start_marker, pattern, end_marker) + +class SourceFileExtractor(FileExtractor): + """ + An abstract extractor for a source file with usage message. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.default_options().union(self.get_help_list_macro('HELP_SPEC_OPTIONS')) + +class ProgFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's prog.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'prog.c') + + def get_prog_types(self): + return self.get_types_from_array('prog_type_name') + + def get_attach_types(self): + return self.get_types_from_array('attach_type_strings') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class MapFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's map.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'map.c') + + def get_map_types(self): + return self.get_types_from_array('map_type_name') + + def get_map_help(self): + return self.get_help_list('TYPE') + +class CgroupFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's cgroup.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'cgroup.c') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class CommonFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's common.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'common.c') + + def __init__(self): + super().__init__() + self.attach_types = {} + + def get_attach_types(self): + if not self.attach_types: + self.attach_types = self.get_types_from_array('attach_type_name') + return self.attach_types + + def get_cgroup_attach_types(self): + if not self.attach_types: + self.get_attach_types() + cgroup_types = {} + for (key, value) in self.attach_types.items(): + if key.find('BPF_CGROUP') != -1: + cgroup_types[key] = value + return cgroup_types + +class GenericSourceExtractor(SourceFileExtractor): + """ + An extractor for generic source code files. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + +class BpfHeaderExtractor(FileExtractor): + """ + An extractor for the UAPI BPF header. + """ + filename = os.path.join(LINUX_ROOT, 'tools/include/uapi/linux/bpf.h') + + def get_prog_types(self): + return self.get_enum('bpf_prog_type') + + def get_map_types(self): + return self.get_enum('bpf_map_type') + + def get_attach_types(self): + return self.get_enum('bpf_attach_type') + +class ManPageExtractor(FileExtractor): + """ + An abstract extractor for an RST documentation page. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.get_rst_list('OPTIONS') + +class ManProgExtractor(ManPageExtractor): + """ + An extractor for bpftool-prog.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-prog.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class ManMapExtractor(ManPageExtractor): + """ + An extractor for bpftool-map.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-map.rst') + + def get_map_types(self): + return self.get_rst_list('TYPE') + +class ManCgroupExtractor(ManPageExtractor): + """ + An extractor for bpftool-cgroup.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-cgroup.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class ManGenericExtractor(ManPageExtractor): + """ + An extractor for generic RST documentation pages. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + +class BashcompExtractor(FileExtractor): + """ + An extractor for bpftool's bash completion file. + """ + filename = os.path.join(BPFTOOL_DIR, 'bash-completion/bpftool') + + def get_prog_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') + + def get_map_types(self): + return self.get_bashcomp_list('BPFTOOL_MAP_CREATE_TYPES') + + def get_cgroup_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_CGROUP_ATTACH_TYPES') + +def verify(first_set, second_set, message): + """ + Print all values that differ between two sets. + @first_set: one set to compare + @second_set: another set to compare + @message: message to print for values belonging to only one of the sets + """ + global retval + diff = first_set.symmetric_difference(second_set) + if diff: + print(message, diff) + retval = 1 + +def main(): + # No arguments supported at this time, but print usage for -h|--help + argParser = argparse.ArgumentParser(description=""" + Verify that bpftool's code, help messages, documentation and bash + completion are all in sync on program types, map types, attach types, and + options. Also check that bpftool is in sync with the UAPI BPF header. + """) + args = argParser.parse_args() + + # Map types (enum) + + bpf_info = BpfHeaderExtractor() + ref = bpf_info.get_map_types() + + map_info = MapFileExtractor() + source_map_items = map_info.get_map_types() + map_types_enum = set(source_map_items.keys()) + + verify(ref, map_types_enum, + f'Comparing BPF header (enum bpf_map_type) and {MapFileExtractor.filename} (map_type_name):') + + # Map types (names) + + source_map_types = set(source_map_items.values()) + source_map_types.discard('unspec') + + help_map_types = map_info.get_map_help() + help_map_options = map_info.get_options() + map_info.close() + + man_map_info = ManMapExtractor() + man_map_options = man_map_info.get_options() + man_map_types = man_map_info.get_map_types() + man_map_info.close() + + bashcomp_info = BashcompExtractor() + bashcomp_map_types = bashcomp_info.get_map_types() + + verify(source_map_types, help_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {MapFileExtractor.filename} (do_help() TYPE):') + verify(source_map_types, man_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {ManMapExtractor.filename} (TYPE):') + verify(help_map_options, man_map_options, + f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') + verify(source_map_types, bashcomp_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') + + # Program types (enum) + + ref = bpf_info.get_prog_types() + + prog_info = ProgFileExtractor() + prog_types = set(prog_info.get_prog_types().keys()) + + verify(ref, prog_types, + f'Comparing BPF header (enum bpf_prog_type) and {ProgFileExtractor.filename} (prog_type_name):') + + # Attach types (enum) + + ref = bpf_info.get_attach_types() + bpf_info.close() + + common_info = CommonFileExtractor() + attach_types = common_info.get_attach_types() + + verify(ref, attach_types, + f'Comparing BPF header (enum bpf_attach_type) and {CommonFileExtractor.filename} (attach_type_name):') + + # Attach types (names) + + source_prog_attach_types = set(prog_info.get_attach_types().values()) + + help_prog_attach_types = prog_info.get_prog_attach_help() + help_prog_options = prog_info.get_options() + prog_info.close() + + man_prog_info = ManProgExtractor() + man_prog_options = man_prog_info.get_options() + man_prog_attach_types = man_prog_info.get_attach_types() + man_prog_info.close() + + bashcomp_info.reset_read() # We stopped at map types, rewind + bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() + + verify(source_prog_attach_types, help_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_prog_attach_types, man_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ManProgExtractor.filename} (ATTACH_TYPE):') + verify(help_prog_options, man_prog_options, + f'Comparing {ProgFileExtractor.filename} (do_help() OPTIONS) and {ManProgExtractor.filename} (OPTIONS):') + verify(source_prog_attach_types, bashcomp_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') + + # Cgroup attach types + + source_cgroup_attach_types = set(common_info.get_cgroup_attach_types().values()) + common_info.close() + + cgroup_info = CgroupFileExtractor() + help_cgroup_attach_types = cgroup_info.get_prog_attach_help() + help_cgroup_options = cgroup_info.get_options() + cgroup_info.close() + + man_cgroup_info = ManCgroupExtractor() + man_cgroup_options = man_cgroup_info.get_options() + man_cgroup_attach_types = man_cgroup_info.get_attach_types() + man_cgroup_info.close() + + bashcomp_cgroup_attach_types = bashcomp_info.get_cgroup_attach_types() + bashcomp_info.close() + + verify(source_cgroup_attach_types, help_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_cgroup_attach_types, man_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') + verify(help_cgroup_options, man_cgroup_options, + f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') + verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') + + # Options for remaining commands + + for cmd in [ 'btf', 'feature', 'gen', 'iter', 'link', 'net', 'perf', 'struct_ops', ]: + source_info = GenericSourceExtractor(cmd + '.c') + help_cmd_options = source_info.get_options() + source_info.close() + + man_cmd_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool-' + cmd + '.rst')) + man_cmd_options = man_cmd_info.get_options() + man_cmd_info.close() + + verify(help_cmd_options, man_cmd_options, + f'Comparing {source_info.filename} (do_help() OPTIONS) and {man_cmd_info.filename} (OPTIONS):') + + source_main_info = GenericSourceExtractor('main.c') + help_main_options = source_main_info.get_options() + source_main_info.close() + + man_main_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool.rst')) + man_main_options = man_main_info.get_options() + man_main_info.close() + + verify(help_main_options, man_main_options, + f'Comparing {source_main_info.filename} (do_help() OPTIONS) and {man_main_info.filename} (OPTIONS):') + + sys.exit(retval) + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh index 7eb940a7b2eb..679cf968c7d1 100755 --- a/tools/testing/selftests/bpf/test_doc_build.sh +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -1,13 +1,20 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +set -e # Assume script is located under tools/testing/selftests/bpf/. We want to start # build attempts from the top of kernel repository. -SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_PATH=$(realpath $0) SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) -KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +KDIR_ROOT_DIR=$(realpath $SCRIPT_REL_DIR/../../../../) +SCRIPT_REL_DIR=$(dirname $(realpath --relative-to=$KDIR_ROOT_DIR $SCRIPT_REL_PATH)) cd $KDIR_ROOT_DIR +if [ ! -e $PWD/$SCRIPT_REL_DIR/Makefile ]; then + echo -e "skip: bpftool files not found!\n" + exit 4 # KSFT_SKIP=4 +fi + for tgt in docs docs-clean; do make -s -C $PWD/$SCRIPT_REL_DIR $tgt; done diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 6a5349f9eb14..7e9049fa3edf 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -231,6 +231,14 @@ static void test_lru_sanity0(int map_type, int map_flags) assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 && errno == ENOENT); + /* lookup elem key=1 and delete it, then check it doesn't exist */ + key = 1; + assert(!bpf_map_lookup_and_delete_elem(lru_map_fd, &key, &value)); + assert(value[0] == 1234); + + /* remove the same element from the expected map */ + assert(!bpf_map_delete_elem(expected_map_fd, &key)); + assert(map_equal(lru_map_fd, expected_map_fd)); close(expected_map_fd); diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 59ea56945e6c..b497bb85b667 100755 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -112,6 +112,14 @@ setup() ip netns add "${NS2}" ip netns add "${NS3}" + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip link add veth1 type veth peer name veth2 ip link add veth3 type veth peer name veth4 ip link add veth5 type veth peer name veth6 @@ -236,11 +244,6 @@ setup() ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) sleep 1 # reduce flakiness diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 51adc42b2b40..c7a36a9378f8 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -53,23 +53,30 @@ static void test_hashmap(unsigned int task, void *data) value = 0; /* BPF_NOEXIST means add new element if it doesn't exist. */ - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && /* key=1 already exists. */ errno == EEXIST); /* -1 is an invalid flag. */ - assert(bpf_map_update_elem(fd, &key, &value, -1) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, -1) < 0 && errno == EINVAL); /* Check that key=1 can be found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 1234); key = 2; + value = 1234; + /* Insert key=2 element. */ + assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); + + /* Check that key=2 matches the value and delete it */ + assert(bpf_map_lookup_and_delete_elem(fd, &key, &value) == 0 && value == 1234); + /* Check that key=2 is not found. */ - assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT); /* BPF_EXIST means update existing element. */ - assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) < 0 && /* key=2 is not there. */ errno == ENOENT); @@ -80,7 +87,7 @@ static void test_hashmap(unsigned int task, void *data) * inserted due to max_entries limit. */ key = 0; - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && errno == E2BIG); /* Update existing element, though the map is full. */ @@ -89,12 +96,12 @@ static void test_hashmap(unsigned int task, void *data) key = 2; assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); key = 3; - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && errno == E2BIG); /* Check that key = 0 doesn't exist. */ key = 0; - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT); /* Iterate over two elements. */ assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 && @@ -104,7 +111,7 @@ static void test_hashmap(unsigned int task, void *data) assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 && (next_key == 1 || next_key == 2) && (next_key != first_key)); - assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 && errno == ENOENT); /* Delete both elements. */ @@ -112,13 +119,13 @@ static void test_hashmap(unsigned int task, void *data) assert(bpf_map_delete_elem(fd, &key) == 0); key = 2; assert(bpf_map_delete_elem(fd, &key) == 0); - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT); key = 0; /* Check that map is empty. */ - assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, NULL, &next_key) < 0 && errno == ENOENT); - assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, &key, &next_key) < 0 && errno == ENOENT); close(fd); @@ -166,15 +173,25 @@ static void test_hashmap_percpu(unsigned int task, void *data) /* Insert key=1 element. */ assert(!(expected_key_mask & key)); assert(bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0); + + /* Lookup and delete elem key=1 and check value. */ + assert(bpf_map_lookup_and_delete_elem(fd, &key, value) == 0 && + bpf_percpu(value,0) == 100); + + for (i = 0; i < nr_cpus; i++) + bpf_percpu(value,i) = i + 100; + + /* Insert key=1 element which should not exist. */ + assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == 0); expected_key_mask |= key; /* BPF_NOEXIST means add new element if it doesn't exist. */ - assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) < 0 && /* key=1 already exists. */ errno == EEXIST); /* -1 is an invalid flag. */ - assert(bpf_map_update_elem(fd, &key, value, -1) == -1 && + assert(bpf_map_update_elem(fd, &key, value, -1) < 0 && errno == EINVAL); /* Check that key=1 can be found. Value could be 0 if the lookup @@ -186,10 +203,10 @@ static void test_hashmap_percpu(unsigned int task, void *data) key = 2; /* Check that key=2 is not found. */ - assert(bpf_map_lookup_elem(fd, &key, value) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, value) < 0 && errno == ENOENT); /* BPF_EXIST means update existing element. */ - assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) < 0 && /* key=2 is not there. */ errno == ENOENT); @@ -202,11 +219,11 @@ static void test_hashmap_percpu(unsigned int task, void *data) * inserted due to max_entries limit. */ key = 0; - assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) < 0 && errno == E2BIG); /* Check that key = 0 doesn't exist. */ - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT); /* Iterate over two elements. */ assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 && @@ -237,13 +254,13 @@ static void test_hashmap_percpu(unsigned int task, void *data) assert(bpf_map_delete_elem(fd, &key) == 0); key = 2; assert(bpf_map_delete_elem(fd, &key) == 0); - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == ENOENT); key = 0; /* Check that map is empty. */ - assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, NULL, &next_key) < 0 && errno == ENOENT); - assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, &key, &next_key) < 0 && errno == ENOENT); close(fd); @@ -360,7 +377,7 @@ static void test_arraymap(unsigned int task, void *data) assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); value = 0; - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && errno == EEXIST); /* Check that key=1 can be found. */ @@ -374,11 +391,11 @@ static void test_arraymap(unsigned int task, void *data) * due to max_entries limit. */ key = 2; - assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) < 0 && errno == E2BIG); /* Check that key = 2 doesn't exist. */ - assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT); /* Iterate over two elements. */ assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 && @@ -387,12 +404,12 @@ static void test_arraymap(unsigned int task, void *data) next_key == 0); assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 && next_key == 1); - assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 && errno == ENOENT); /* Delete shouldn't succeed. */ key = 1; - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == EINVAL); close(fd); } @@ -418,7 +435,7 @@ static void test_arraymap_percpu(unsigned int task, void *data) assert(bpf_map_update_elem(fd, &key, values, BPF_ANY) == 0); bpf_percpu(values, 0) = 0; - assert(bpf_map_update_elem(fd, &key, values, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, values, BPF_NOEXIST) < 0 && errno == EEXIST); /* Check that key=1 can be found. */ @@ -433,11 +450,11 @@ static void test_arraymap_percpu(unsigned int task, void *data) /* Check that key=2 cannot be inserted due to max_entries limit. */ key = 2; - assert(bpf_map_update_elem(fd, &key, values, BPF_EXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, values, BPF_EXIST) < 0 && errno == E2BIG); /* Check that key = 2 doesn't exist. */ - assert(bpf_map_lookup_elem(fd, &key, values) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, values) < 0 && errno == ENOENT); /* Iterate over two elements. */ assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 && @@ -446,12 +463,12 @@ static void test_arraymap_percpu(unsigned int task, void *data) next_key == 0); assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 && next_key == 1); - assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 && + assert(bpf_map_get_next_key(fd, &next_key, &next_key) < 0 && errno == ENOENT); /* Delete shouldn't succeed. */ key = 1; - assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL); + assert(bpf_map_delete_elem(fd, &key) < 0 && errno == EINVAL); close(fd); } @@ -555,7 +572,7 @@ static void test_queuemap(unsigned int task, void *data) assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); /* Check that element cannot be pushed due to max_entries limit */ - assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 && + assert(bpf_map_update_elem(fd, NULL, &val, 0) < 0 && errno == E2BIG); /* Peek element */ @@ -571,12 +588,12 @@ static void test_queuemap(unsigned int task, void *data) val == vals[i]); /* Check that there are not elements left */ - assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 && + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) < 0 && errno == ENOENT); /* Check that non supported functions set errno to EINVAL */ - assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL); - assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL); + assert(bpf_map_delete_elem(fd, NULL) < 0 && errno == EINVAL); + assert(bpf_map_get_next_key(fd, NULL, NULL) < 0 && errno == EINVAL); close(fd); } @@ -613,7 +630,7 @@ static void test_stackmap(unsigned int task, void *data) assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); /* Check that element cannot be pushed due to max_entries limit */ - assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 && + assert(bpf_map_update_elem(fd, NULL, &val, 0) < 0 && errno == E2BIG); /* Peek element */ @@ -629,12 +646,12 @@ static void test_stackmap(unsigned int task, void *data) val == vals[i]); /* Check that there are not elements left */ - assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 && + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) < 0 && errno == ENOENT); /* Check that non supported functions set errno to EINVAL */ - assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL); - assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL); + assert(bpf_map_delete_elem(fd, NULL) < 0 && errno == EINVAL); + assert(bpf_map_get_next_key(fd, NULL, NULL) < 0 && errno == EINVAL); close(fd); } @@ -747,8 +764,8 @@ static void test_sockmap(unsigned int tasks, void *data) udp = socket(AF_INET, SOCK_DGRAM, 0); i = 0; err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY); - if (!err) { - printf("Failed socket SOCK_DGRAM allowed '%i:%i'\n", + if (err) { + printf("Failed socket update SOCK_DGRAM '%i:%i'\n", i, udp); goto out_sockmap; } @@ -835,7 +852,7 @@ static void test_sockmap(unsigned int tasks, void *data) } bpf_map_rx = bpf_object__find_map_by_name(obj, "sock_map_rx"); - if (IS_ERR(bpf_map_rx)) { + if (!bpf_map_rx) { printf("Failed to load map rx from verdict prog\n"); goto out_sockmap; } @@ -847,7 +864,7 @@ static void test_sockmap(unsigned int tasks, void *data) } bpf_map_tx = bpf_object__find_map_by_name(obj, "sock_map_tx"); - if (IS_ERR(bpf_map_tx)) { + if (!bpf_map_tx) { printf("Failed to load map tx from verdict prog\n"); goto out_sockmap; } @@ -859,7 +876,7 @@ static void test_sockmap(unsigned int tasks, void *data) } bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg"); - if (IS_ERR(bpf_map_msg)) { + if (!bpf_map_msg) { printf("Failed to load map msg from msg_verdict prog\n"); goto out_sockmap; } @@ -871,7 +888,7 @@ static void test_sockmap(unsigned int tasks, void *data) } bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break"); - if (IS_ERR(bpf_map_break)) { + if (!bpf_map_break) { printf("Failed to load map tx from verdict prog\n"); goto out_sockmap; } @@ -968,7 +985,7 @@ static void test_sockmap(unsigned int tasks, void *data) FD_ZERO(&w); FD_SET(sfd[3], &w); - to.tv_sec = 1; + to.tv_sec = 30; to.tv_usec = 0; s = select(sfd[3] + 1, &w, NULL, NULL, &to); if (s == -1) { @@ -1136,12 +1153,17 @@ out_sockmap: } #define MAPINMAP_PROG "./test_map_in_map.o" +#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.o" static void test_map_in_map(void) { struct bpf_object *obj; struct bpf_map *map; int mim_fd, fd, err; int pos = 0; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 id = 0; + libbpf_print_fn_t old_print_fn; obj = bpf_object__open(MAPINMAP_PROG); @@ -1153,7 +1175,7 @@ static void test_map_in_map(void) } map = bpf_object__find_map_by_name(obj, "mim_array"); - if (IS_ERR(map)) { + if (!map) { printf("Failed to load array of maps from test prog\n"); goto out_map_in_map; } @@ -1164,7 +1186,7 @@ static void test_map_in_map(void) } map = bpf_object__find_map_by_name(obj, "mim_hash"); - if (IS_ERR(map)) { + if (!map) { printf("Failed to load hash of maps from test prog\n"); goto out_map_in_map; } @@ -1177,7 +1199,7 @@ static void test_map_in_map(void) bpf_object__load(obj); map = bpf_object__find_map_by_name(obj, "mim_array"); - if (IS_ERR(map)) { + if (!map) { printf("Failed to load array of maps from test prog\n"); goto out_map_in_map; } @@ -1194,7 +1216,7 @@ static void test_map_in_map(void) } map = bpf_object__find_map_by_name(obj, "mim_hash"); - if (IS_ERR(map)) { + if (!map) { printf("Failed to load hash of maps from test prog\n"); goto out_map_in_map; } @@ -1211,11 +1233,72 @@ static void test_map_in_map(void) } close(fd); + fd = -1; bpf_object__close(obj); + + /* Test that failing bpf_object__create_map() destroys the inner map */ + obj = bpf_object__open(MAPINMAP_INVALID_PROG); + err = libbpf_get_error(obj); + if (err) { + printf("Failed to load %s program: %d %d", + MAPINMAP_INVALID_PROG, err, errno); + goto out_map_in_map; + } + + map = bpf_object__find_map_by_name(obj, "mim"); + if (!map) { + printf("Failed to load array of maps from test prog\n"); + goto out_map_in_map; + } + + old_print_fn = libbpf_set_print(NULL); + + err = bpf_object__load(obj); + if (!err) { + printf("Loading obj supposed to fail\n"); + goto out_map_in_map; + } + + libbpf_set_print(old_print_fn); + + /* Iterate over all maps to check whether the internal map + * ("mim.internal") has been destroyed. + */ + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + printf("Failed to get next map: %d", errno); + goto out_map_in_map; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + printf("Failed to get map by id %u: %d", id, errno); + goto out_map_in_map; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + printf("Failed to get map info by fd %d: %d", fd, + errno); + goto out_map_in_map; + } + + if (!strcmp(info.name, "mim.inner")) { + printf("Inner map mim.inner was not destroyed\n"); + goto out_map_in_map; + } + } + return; out_map_in_map: - close(fd); + if (fd >= 0) + close(fd); exit(1); } @@ -1246,7 +1329,7 @@ static void test_map_large(void) } key.c = -1; - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && errno == E2BIG); /* Iterate through all elements. */ @@ -1254,12 +1337,12 @@ static void test_map_large(void) key.c = -1; for (i = 0; i < MAP_SIZE; i++) assert(bpf_map_get_next_key(fd, &key, &key) == 0); - assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT); + assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT); key.c = 0; assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 0); key.a = 1; - assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT); close(fd); } @@ -1313,15 +1396,22 @@ static void test_map_stress(void) #define DO_DELETE 0 #define MAP_RETRIES 20 +#define MAX_DELAY_US 50000 +#define MIN_DELAY_RANGE_US 5000 static int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts) { + int delay = rand() % MIN_DELAY_RANGE_US; + while (bpf_map_update_elem(map_fd, key, value, flags)) { if (!attempts || (errno != EAGAIN && errno != EBUSY)) return -errno; - usleep(1); + if (delay <= MAX_DELAY_US / 2) + delay *= 2; + + usleep(delay); attempts--; } @@ -1330,11 +1420,16 @@ static int map_update_retriable(int map_fd, const void *key, const void *value, static int map_delete_retriable(int map_fd, const void *key, int attempts) { + int delay = rand() % MIN_DELAY_RANGE_US; + while (bpf_map_delete_elem(map_fd, key)) { if (!attempts || (errno != EAGAIN && errno != EBUSY)) return -errno; - usleep(1); + if (delay <= MAX_DELAY_US / 2) + delay *= 2; + + usleep(delay); attempts--; } @@ -1391,7 +1486,7 @@ static void test_map_parallel(void) run_parallel(TASKS, test_update_delete, data); /* Check that key=0 is already there. */ - assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) < 0 && errno == EEXIST); /* Check that all elements were inserted. */ @@ -1399,7 +1494,7 @@ static void test_map_parallel(void) key = -1; for (i = 0; i < MAP_SIZE; i++) assert(bpf_map_get_next_key(fd, &key, &key) == 0); - assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT); + assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT); /* Another check for all elements */ for (i = 0; i < MAP_SIZE; i++) { @@ -1415,8 +1510,8 @@ static void test_map_parallel(void) /* Nothing should be left. */ key = -1; - assert(bpf_map_get_next_key(fd, NULL, &key) == -1 && errno == ENOENT); - assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT); + assert(bpf_map_get_next_key(fd, NULL, &key) < 0 && errno == ENOENT); + assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT); } static void test_map_rdonly(void) @@ -1434,12 +1529,12 @@ static void test_map_rdonly(void) key = 1; value = 1234; /* Try to insert key=1 element. */ - assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 && + assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) < 0 && errno == EPERM); /* Check that key=1 is not found. */ - assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); - assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); + assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == ENOENT); + assert(bpf_map_get_next_key(fd, &key, &value) < 0 && errno == ENOENT); close(fd); } @@ -1462,8 +1557,8 @@ static void test_map_wronly_hash(void) assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); /* Check that reading elements and keys from the map is not allowed. */ - assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); - assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); + assert(bpf_map_lookup_elem(fd, &key, &value) < 0 && errno == EPERM); + assert(bpf_map_get_next_key(fd, &key, &value) < 0 && errno == EPERM); close(fd); } @@ -1490,10 +1585,10 @@ static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0); /* Peek element should fail */ - assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM); + assert(bpf_map_lookup_elem(fd, NULL, &value) < 0 && errno == EPERM); /* Pop element should fail */ - assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 && + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) < 0 && errno == EPERM); close(fd); @@ -1547,7 +1642,7 @@ static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, value = &fd32; } err = bpf_map_update_elem(map_fd, &index0, value, BPF_ANY); - CHECK(err != -1 || errno != EINVAL, + CHECK(err >= 0 || errno != EINVAL, "reuseport array update unbound sk", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1576,7 +1671,7 @@ static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, */ err = bpf_map_update_elem(map_fd, &index0, value, BPF_ANY); - CHECK(err != -1 || errno != EINVAL, + CHECK(err >= 0 || errno != EINVAL, "reuseport array update non-listening sk", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1606,31 +1701,31 @@ static void test_reuseport_array(void) map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, sizeof(__u32), sizeof(__u64), array_size, 0); - CHECK(map_fd == -1, "reuseport array create", + CHECK(map_fd < 0, "reuseport array create", "map_fd:%d, errno:%d\n", map_fd, errno); /* Test lookup/update/delete with invalid index */ err = bpf_map_delete_elem(map_fd, &bad_index); - CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries", + CHECK(err >= 0 || errno != E2BIG, "reuseport array del >=max_entries", "err:%d errno:%d\n", err, errno); err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY); - CHECK(err != -1 || errno != E2BIG, + CHECK(err >= 0 || errno != E2BIG, "reuseport array update >=max_entries", "err:%d errno:%d\n", err, errno); err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie); - CHECK(err != -1 || errno != ENOENT, + CHECK(err >= 0 || errno != ENOENT, "reuseport array update >=max_entries", "err:%d errno:%d\n", err, errno); /* Test lookup/delete non existence elem */ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); - CHECK(err != -1 || errno != ENOENT, + CHECK(err >= 0 || errno != ENOENT, "reuseport array lookup not-exist elem", "err:%d errno:%d\n", err, errno); err = bpf_map_delete_elem(map_fd, &index3); - CHECK(err != -1 || errno != ENOENT, + CHECK(err >= 0 || errno != ENOENT, "reuseport array del not-exist elem", "err:%d errno:%d\n", err, errno); @@ -1644,7 +1739,7 @@ static void test_reuseport_array(void) /* BPF_EXIST failure case */ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], BPF_EXIST); - CHECK(err != -1 || errno != ENOENT, + CHECK(err >= 0 || errno != ENOENT, "reuseport array update empty elem BPF_EXIST", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1653,7 +1748,7 @@ static void test_reuseport_array(void) /* BPF_NOEXIST success case */ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], BPF_NOEXIST); - CHECK(err == -1, + CHECK(err < 0, "reuseport array update empty elem BPF_NOEXIST", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1662,7 +1757,7 @@ static void test_reuseport_array(void) /* BPF_EXIST success case. */ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], BPF_EXIST); - CHECK(err == -1, + CHECK(err < 0, "reuseport array update same elem BPF_EXIST", "sock_type:%d err:%d errno:%d\n", type, err, errno); fds_idx = REUSEPORT_FD_IDX(err, fds_idx); @@ -1670,7 +1765,7 @@ static void test_reuseport_array(void) /* BPF_NOEXIST failure case */ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], BPF_NOEXIST); - CHECK(err != -1 || errno != EEXIST, + CHECK(err >= 0 || errno != EEXIST, "reuseport array update non-empty elem BPF_NOEXIST", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1679,7 +1774,7 @@ static void test_reuseport_array(void) /* BPF_ANY case (always succeed) */ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], BPF_ANY); - CHECK(err == -1, + CHECK(err < 0, "reuseport array update same sk with BPF_ANY", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1688,32 +1783,32 @@ static void test_reuseport_array(void) /* The same sk cannot be added to reuseport_array twice */ err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY); - CHECK(err != -1 || errno != EBUSY, + CHECK(err >= 0 || errno != EBUSY, "reuseport array update same sk with same index", "sock_type:%d err:%d errno:%d\n", type, err, errno); err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY); - CHECK(err != -1 || errno != EBUSY, + CHECK(err >= 0 || errno != EBUSY, "reuseport array update same sk with different index", "sock_type:%d err:%d errno:%d\n", type, err, errno); /* Test delete elem */ err = bpf_map_delete_elem(map_fd, &index3); - CHECK(err == -1, "reuseport array delete sk", + CHECK(err < 0, "reuseport array delete sk", "sock_type:%d err:%d errno:%d\n", type, err, errno); /* Add it back with BPF_NOEXIST */ err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); - CHECK(err == -1, + CHECK(err < 0, "reuseport array re-add with BPF_NOEXIST after del", "sock_type:%d err:%d errno:%d\n", type, err, errno); /* Test cookie */ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); - CHECK(err == -1 || sk_cookie != map_cookie, + CHECK(err < 0 || sk_cookie != map_cookie, "reuseport array lookup re-added sk", "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn", type, err, errno, sk_cookie, map_cookie); @@ -1722,7 +1817,7 @@ static void test_reuseport_array(void) for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++) close(grpa_fds64[f]); err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); - CHECK(err != -1 || errno != ENOENT, + CHECK(err >= 0 || errno != ENOENT, "reuseport array lookup after close()", "sock_type:%d err:%d errno:%d\n", type, err, errno); @@ -1733,7 +1828,7 @@ static void test_reuseport_array(void) CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n", err, errno); err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); - CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", + CHECK(err >= 0 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", "err:%d errno:%d\n", err, errno); close(fd64); @@ -1743,16 +1838,16 @@ static void test_reuseport_array(void) /* Test 32 bit fd */ map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, sizeof(__u32), sizeof(__u32), array_size, 0); - CHECK(map_fd == -1, "reuseport array create", + CHECK(map_fd < 0, "reuseport array create", "map_fd:%d, errno:%d\n", map_fd, errno); prepare_reuseport_grp(SOCK_STREAM, map_fd, sizeof(__u32), &fd64, &sk_cookie, 1); fd = fd64; err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST); - CHECK(err == -1, "reuseport array update 32 bit fd", + CHECK(err < 0, "reuseport array update 32 bit fd", "err:%d errno:%d\n", err, errno); err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); - CHECK(err != -1 || errno != ENOSPC, + CHECK(err >= 0 || errno != ENOSPC, "reuseport array lookup 32 bit fd", "err:%d errno:%d\n", err, errno); close(fd); @@ -1798,6 +1893,8 @@ int main(void) { srand(time(NULL)); + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + map_flags = 0; run_all_tests(); diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c deleted file mode 100644 index a7b9a69f4fd5..000000000000 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <assert.h> -#include <sys/sysinfo.h> -#include <sys/time.h> - -#include <linux/bpf.h> -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "cgroup_helpers.h" -#include "bpf_rlimit.h" -#include "netcnt_common.h" - -#define BPF_PROG "./netcnt_prog.o" -#define TEST_CGROUP "/test-network-counters/" - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); - return -1; - } - return bpf_map__fd(map); -} - -int main(int argc, char **argv) -{ - struct percpu_net_cnt *percpu_netcnt; - struct bpf_cgroup_storage_key key; - int map_fd, percpu_map_fd; - int error = EXIT_FAILURE; - struct net_cnt netcnt; - struct bpf_object *obj; - int prog_fd, cgroup_fd; - unsigned long packets; - unsigned long bytes; - int cpu, nproc; - __u32 prog_cnt; - - nproc = get_nprocs_conf(); - percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc); - if (!percpu_netcnt) { - printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); - goto err; - } - - if (bpf_prog_load(BPF_PROG, BPF_PROG_TYPE_CGROUP_SKB, - &obj, &prog_fd)) { - printf("Failed to load bpf program\n"); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (cgroup_fd < 0) - goto err; - - /* Attach bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { - printf("Failed to attach bpf program"); - goto err; - } - - if (system("which ping6 &>/dev/null") == 0) - assert(!system("ping6 ::1 -c 10000 -f -q > /dev/null")); - else - assert(!system("ping -6 ::1 -c 10000 -f -q > /dev/null")); - - if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL, - &prog_cnt)) { - printf("Failed to query attached programs"); - goto err; - } - - map_fd = bpf_find_map(__func__, obj, "netcnt"); - if (map_fd < 0) { - printf("Failed to find bpf map with net counters"); - goto err; - } - - percpu_map_fd = bpf_find_map(__func__, obj, "percpu_netcnt"); - if (percpu_map_fd < 0) { - printf("Failed to find bpf map with percpu net counters"); - goto err; - } - - if (bpf_map_get_next_key(map_fd, NULL, &key)) { - printf("Failed to get key in cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(map_fd, &key, &netcnt)) { - printf("Failed to lookup cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0])) { - printf("Failed to lookup percpu cgroup storage\n"); - goto err; - } - - /* Some packets can be still in per-cpu cache, but not more than - * MAX_PERCPU_PACKETS. - */ - packets = netcnt.packets; - bytes = netcnt.bytes; - for (cpu = 0; cpu < nproc; cpu++) { - if (percpu_netcnt[cpu].packets > MAX_PERCPU_PACKETS) { - printf("Unexpected percpu value: %llu\n", - percpu_netcnt[cpu].packets); - goto err; - } - - packets += percpu_netcnt[cpu].packets; - bytes += percpu_netcnt[cpu].bytes; - } - - /* No packets should be lost */ - if (packets != 10000) { - printf("Unexpected packet count: %lu\n", packets); - goto err; - } - - /* Let's check that bytes counter matches the number of packets - * multiplied by the size of ipv6 ICMP packet. - */ - if (bytes != packets * 104) { - printf("Unexpected bytes count: %lu\n", bytes); - goto err; - } - - error = 0; - printf("test_netcnt:PASS\n"); - -err: - cleanup_cgroup_environment(); - free(percpu_netcnt); - -out: - return error; -} diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 6396932b97e2..cc1cd240445d 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -13,6 +13,28 @@ #include <execinfo.h> /* backtrace */ #include <linux/membarrier.h> +/* Adapted from perf/util/string.c */ +static bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_match(str++, pat)) + return true; + } + return !*str && !*pat; +} + #define EXIT_NO_TEST 2 #define EXIT_ERR_SETUP_INFRA 3 @@ -55,12 +77,12 @@ static bool should_run(struct test_selector *sel, int num, const char *name) int i; for (i = 0; i < sel->blacklist.cnt; i++) { - if (strstr(name, sel->blacklist.strs[i])) + if (glob_match(name, sel->blacklist.strs[i])) return false; } for (i = 0; i < sel->whitelist.cnt; i++) { - if (strstr(name, sel->whitelist.strs[i])) + if (glob_match(name, sel->whitelist.strs[i])) return true; } @@ -148,18 +170,18 @@ void test__end_subtest() struct prog_test_def *test = env.test; int sub_error_cnt = test->error_cnt - test->old_error_cnt; + dump_test_log(test, sub_error_cnt); + + fprintf(env.stdout, "#%d/%d %s/%s:%s\n", + test->test_num, test->subtest_num, test->test_name, test->subtest_name, + sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); + if (sub_error_cnt) env.fail_cnt++; else if (test->skip_cnt == 0) env.sub_succ_cnt++; skip_account(); - dump_test_log(test, sub_error_cnt); - - fprintf(env.stdout, "#%d/%d %s:%s\n", - test->test_num, test->subtest_num, test->subtest_name, - sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); - free(test->subtest_name); test->subtest_name = NULL; } @@ -450,6 +472,8 @@ enum ARG_KEYS { ARG_VERBOSE = 'v', ARG_GET_TEST_CNT = 'c', ARG_LIST_TEST_NAMES = 'l', + ARG_TEST_NAME_GLOB_ALLOWLIST = 'a', + ARG_TEST_NAME_GLOB_DENYLIST = 'd', }; static const struct argp_option opts[] = { @@ -467,6 +491,10 @@ static const struct argp_option opts[] = { "Get number of selected top-level tests " }, { "list", ARG_LIST_TEST_NAMES, NULL, 0, "List test names that would run (without running them) " }, + { "allow", ARG_TEST_NAME_GLOB_ALLOWLIST, "NAMES", 0, + "Run tests with name matching the pattern (supports '*' wildcard)." }, + { "deny", ARG_TEST_NAME_GLOB_DENYLIST, "NAMES", 0, + "Don't run tests with name matching the pattern (supports '*' wildcard)." }, {}, }; @@ -491,36 +519,48 @@ static void free_str_set(const struct str_set *set) free(set->strs); } -static int parse_str_list(const char *s, struct str_set *set) +static int parse_str_list(const char *s, struct str_set *set, bool is_glob_pattern) { char *input, *state = NULL, *next, **tmp, **strs = NULL; - int cnt = 0; + int i, cnt = 0; input = strdup(s); if (!input) return -ENOMEM; - set->cnt = 0; - set->strs = NULL; - while ((next = strtok_r(state ? NULL : input, ",", &state))) { tmp = realloc(strs, sizeof(*strs) * (cnt + 1)); if (!tmp) goto err; strs = tmp; - strs[cnt] = strdup(next); - if (!strs[cnt]) - goto err; + if (is_glob_pattern) { + strs[cnt] = strdup(next); + if (!strs[cnt]) + goto err; + } else { + strs[cnt] = malloc(strlen(next) + 2 + 1); + if (!strs[cnt]) + goto err; + sprintf(strs[cnt], "*%s*", next); + } cnt++; } - set->cnt = cnt; - set->strs = (const char **)strs; + tmp = realloc(set->strs, sizeof(*strs) * (cnt + set->cnt)); + if (!tmp) + goto err; + memcpy(tmp + set->cnt, strs, sizeof(*strs) * cnt); + set->strs = (const char **)tmp; + set->cnt += cnt; + free(input); + free(strs); return 0; err: + for (i = 0; i < cnt; i++) + free(strs[i]); free(strs); free(input); return -ENOMEM; @@ -553,29 +593,35 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) } break; } + case ARG_TEST_NAME_GLOB_ALLOWLIST: case ARG_TEST_NAME: { char *subtest_str = strchr(arg, '/'); if (subtest_str) { *subtest_str = '\0'; if (parse_str_list(subtest_str + 1, - &env->subtest_selector.whitelist)) + &env->subtest_selector.whitelist, + key == ARG_TEST_NAME_GLOB_ALLOWLIST)) return -ENOMEM; } - if (parse_str_list(arg, &env->test_selector.whitelist)) + if (parse_str_list(arg, &env->test_selector.whitelist, + key == ARG_TEST_NAME_GLOB_ALLOWLIST)) return -ENOMEM; break; } + case ARG_TEST_NAME_GLOB_DENYLIST: case ARG_TEST_NAME_BLACKLIST: { char *subtest_str = strchr(arg, '/'); if (subtest_str) { *subtest_str = '\0'; if (parse_str_list(subtest_str + 1, - &env->subtest_selector.blacklist)) + &env->subtest_selector.blacklist, + key == ARG_TEST_NAME_GLOB_DENYLIST)) return -ENOMEM; } - if (parse_str_list(arg, &env->test_selector.blacklist)) + if (parse_str_list(arg, &env->test_selector.blacklist, + key == ARG_TEST_NAME_GLOB_DENYLIST)) return -ENOMEM; break; } @@ -737,6 +783,9 @@ int main(int argc, char **argv) if (err) return err; + /* Use libbpf 1.0 API mode */ + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); srand(time(NULL)); @@ -752,7 +801,7 @@ int main(int argc, char **argv) save_netns(); stdio_hijack(); env.has_testmod = true; - if (load_bpf_testmod()) { + if (!env.list_test_names && load_bpf_testmod()) { fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); env.has_testmod = false; } @@ -783,24 +832,25 @@ int main(int argc, char **argv) test__end_subtest(); test->tested = true; - if (test->error_cnt) - env.fail_cnt++; - else - env.succ_cnt++; - skip_account(); dump_test_log(test, test->error_cnt); fprintf(env.stdout, "#%d %s:%s\n", test->test_num, test->test_name, - test->error_cnt ? "FAIL" : "OK"); + test->error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); + + if (test->error_cnt) + env.fail_cnt++; + else + env.succ_cnt++; + skip_account(); reset_affinity(); restore_netns(); if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } - if (env.has_testmod) + if (!env.list_test_names && env.has_testmod) unload_bpf_testmod(); stdio_restore(); diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index dda52cb649dc..c8c2bf878f67 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -221,6 +221,18 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_STRNEQ(actual, expected, len, name) ({ \ + static int duration = 0; \ + const char *___act = actual; \ + const char *___exp = expected; \ + int ___len = len; \ + bool ___ok = strncmp(___act, ___exp, ___len) == 0; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual '%.*s' != expected '%.*s'\n", \ + (name), ___len, ___act, ___len, ___exp); \ + ___ok; \ +}) + #define ASSERT_OK(res, name) ({ \ static int duration = 0; \ long long ___res = (res); \ @@ -249,16 +261,17 @@ extern int test__join_cgroup(const char *path); #define ASSERT_OK_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = !IS_ERR_OR_NULL(___res); \ - CHECK(!___ok, (name), \ - "unexpected error: %ld\n", PTR_ERR(___res)); \ + int ___err = libbpf_get_error(___res); \ + bool ___ok = ___err == 0; \ + CHECK(!___ok, (name), "unexpected error: %d\n", ___err); \ ___ok; \ }) #define ASSERT_ERR_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = IS_ERR(___res); \ + int ___err = libbpf_get_error(___res); \ + bool ___ok = ___err != 0; \ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \ ___ok; \ }) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c9dde9b9d987..088fcad138c9 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -69,7 +69,7 @@ cleanup() { } server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! sleep 0.2 } diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index 73da7fe8c152..4a39304cc5a6 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -82,6 +82,8 @@ int main(int argc, char **argv) cpu_set_t cpuset; __u32 key = 0; + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); @@ -116,7 +118,7 @@ int main(int argc, char **argv) pb_opts.sample_cb = dummyfn; pb = perf_buffer__new(bpf_map__fd(perf_map), 8, &pb_opts); - if (IS_ERR(pb)) + if (!pb) goto err; pthread_create(&tid, NULL, poller_thread, pb); @@ -163,7 +165,6 @@ err: bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); close(cg_fd); cleanup_cgroup_environment(); - if (!IS_ERR_OR_NULL(pb)) - perf_buffer__free(pb); + perf_buffer__free(pb); return error; } diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh new file mode 100755 index 000000000000..1538373157e3 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test topology: +# - - - - - - - - - - - - - - - - - - - - - - - - - +# | veth1 veth2 veth3 | ... init net +# - -| - - - - - - | - - - - - - | - - +# --------- --------- --------- +# | veth0 | | veth0 | | veth0 | ... +# --------- --------- --------- +# ns1 ns2 ns3 +# +# Test modules: +# XDP modes: generic, native, native + egress_prog +# +# Test cases: +# ARP: Testing BPF_F_BROADCAST, the ingress interface also should receive +# the redirects. +# ns1 -> gw: ns1, ns2, ns3, should receive the arp request +# IPv4: Testing BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, the ingress +# interface should not receive the redirects. +# ns1 -> gw: ns1 should not receive, ns2, ns3 should receive redirects. +# IPv6: Testing none flag, all the pkts should be redirected back +# ping test: ns1 -> ns2 (block), echo requests will be redirect back +# egress_prog: +# all src mac should be egress interface's mac + +# netns numbers +NUM=3 +IFACES="" +DRV_MODE="xdpgeneric xdpdrv xdpegress" +PASS=0 +FAIL=0 + +test_pass() +{ + echo "Pass: $@" + PASS=$((PASS + 1)) +} + +test_fail() +{ + echo "fail: $@" + FAIL=$((FAIL + 1)) +} + +clean_up() +{ + for i in $(seq $NUM); do + ip link del veth$i 2> /dev/null + ip netns del ns$i 2> /dev/null + done +} + +# Kselftest framework requirement - SKIP code is 4. +check_env() +{ + ip link set dev lo xdpgeneric off &>/dev/null + if [ $? -ne 0 ];then + echo "selftests: [SKIP] Could not run test without the ip xdpgeneric support" + exit 4 + fi + + which tcpdump &>/dev/null + if [ $? -ne 0 ];then + echo "selftests: [SKIP] Could not run test without tcpdump" + exit 4 + fi +} + +setup_ns() +{ + local mode=$1 + IFACES="" + + if [ "$mode" = "xdpegress" ]; then + mode="xdpdrv" + fi + + for i in $(seq $NUM); do + ip netns add ns$i + ip link add veth$i type veth peer name veth0 netns ns$i + ip link set veth$i up + ip -n ns$i link set veth0 up + + ip -n ns$i addr add 192.0.2.$i/24 dev veth0 + ip -n ns$i addr add 2001:db8::$i/64 dev veth0 + # Add a neigh entry for IPv4 ping test + ip -n ns$i neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0 + ip -n ns$i link set veth0 $mode obj \ + xdp_dummy.o sec xdp_dummy &> /dev/null || \ + { test_fail "Unable to load dummy xdp" && exit 1; } + IFACES="$IFACES veth$i" + veth_mac[$i]=$(ip link show veth$i | awk '/link\/ether/ {print $2}') + done +} + +do_egress_tests() +{ + local mode=$1 + + # mac test + ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-2_${mode}.log & + ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-3_${mode}.log & + sleep 0.5 + ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null + sleep 0.5 + pkill -9 tcpdump + + # mac check + grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" mac_ns1-2_${mode}.log && \ + test_pass "$mode mac ns1-2" || test_fail "$mode mac ns1-2" + grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" mac_ns1-3_${mode}.log && \ + test_pass "$mode mac ns1-3" || test_fail "$mode mac ns1-3" +} + +do_ping_tests() +{ + local mode=$1 + + # ping6 test: echo request should be redirect back to itself, not others + ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02 + + ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ns1-1_${mode}.log & + ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ns1-2_${mode}.log & + ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ns1-3_${mode}.log & + sleep 0.5 + # ARP test + ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null + # IPv4 test + ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null + # IPv6 test + ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null + sleep 0.5 + pkill -9 tcpdump + + # All netns should receive the redirect arp requests + [ $(grep -c "who-has 192.0.2.254" ns1-1_${mode}.log) -gt 4 ] && \ + test_pass "$mode arp(F_BROADCAST) ns1-1" || \ + test_fail "$mode arp(F_BROADCAST) ns1-1" + [ $(grep -c "who-has 192.0.2.254" ns1-2_${mode}.log) -le 4 ] && \ + test_pass "$mode arp(F_BROADCAST) ns1-2" || \ + test_fail "$mode arp(F_BROADCAST) ns1-2" + [ $(grep -c "who-has 192.0.2.254" ns1-3_${mode}.log) -le 4 ] && \ + test_pass "$mode arp(F_BROADCAST) ns1-3" || \ + test_fail "$mode arp(F_BROADCAST) ns1-3" + + # ns1 should not receive the redirect echo request, others should + [ $(grep -c "ICMP echo request" ns1-1_${mode}.log) -eq 4 ] && \ + test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" || \ + test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" + [ $(grep -c "ICMP echo request" ns1-2_${mode}.log) -eq 4 ] && \ + test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" || \ + test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" + [ $(grep -c "ICMP echo request" ns1-3_${mode}.log) -eq 4 ] && \ + test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" || \ + test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" + + # ns1 should receive the echo request, ns2 should not + [ $(grep -c "ICMP6, echo request" ns1-1_${mode}.log) -eq 4 ] && \ + test_pass "$mode IPv6 (no flags) ns1-1" || \ + test_fail "$mode IPv6 (no flags) ns1-1" + [ $(grep -c "ICMP6, echo request" ns1-2_${mode}.log) -eq 0 ] && \ + test_pass "$mode IPv6 (no flags) ns1-2" || \ + test_fail "$mode IPv6 (no flags) ns1-2" +} + +do_tests() +{ + local mode=$1 + local drv_p + + case ${mode} in + xdpdrv) drv_p="-N";; + xdpegress) drv_p="-X";; + xdpgeneric) drv_p="-S";; + esac + + ./xdp_redirect_multi $drv_p $IFACES &> xdp_redirect_${mode}.log & + xdp_pid=$! + sleep 1 + + if [ "$mode" = "xdpegress" ]; then + do_egress_tests $mode + else + do_ping_tests $mode + fi + + kill $xdp_pid +} + +trap clean_up 0 2 3 6 9 + +check_env +rm -f xdp_redirect_*.log ns*.log mac_ns*.log + +for mode in ${DRV_MODE}; do + setup_ns $mode + do_tests $mode + clean_up +done + +echo "Summary: PASS $PASS, FAIL $FAIL" +[ $FAIL -eq 0 ] && exit 0 || exit 1 diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh index ba8ffcdaac30..995278e684b6 100755 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ b/tools/testing/selftests/bpf/test_xdp_veth.sh @@ -108,7 +108,7 @@ ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1 ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2 ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp_dummy -ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec tx +ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec xdp ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp_dummy trap cleanup EXIT diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 46633a3bfb0b..cd7bf32e6a17 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -63,14 +63,11 @@ # ---------------- # Must run with CAP_NET_ADMIN capability. # -# Run (full color-coded output): -# sudo ./test_xsk.sh -c +# Run: +# sudo ./test_xsk.sh # # If running from kselftests: -# sudo make colorconsole=1 run_tests -# -# Run (full output without color-coding): -# sudo ./test_xsk.sh +# sudo make run_tests # # Run with verbose output: # sudo ./test_xsk.sh -v @@ -83,7 +80,6 @@ while getopts "cvD" flag do case "${flag}" in - c) colorconsole=1;; v) verbose=1;; D) dump_pkts=1;; esac diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 1bbd1d9830c8..e7a19b04d4ea 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -136,3 +136,90 @@ void read_trace_pipe(void) } } } + +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +ssize_t get_uprobe_offset(const void *addr, ssize_t base) +{ + u32 *insn = (u32 *)(uintptr_t)addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence. + * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return (uintptr_t)addr - base; +} + +#else + +ssize_t get_uprobe_offset(const void *addr, ssize_t base) +{ + return (uintptr_t)addr - base; +} + +#endif + +ssize_t get_base_addr(void) +{ + size_t start, offset; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -errno; + + while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", + &start, buf, &offset) == 3) { + if (strcmp(buf, "r-xp") == 0) { + fclose(f); + return start - offset; + } + } + + fclose(f); + return -EINVAL; +} + +ssize_t get_rel_offset(uintptr_t addr) +{ + size_t start, end, offset; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -errno; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { + if (addr >= start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } + } + + fclose(f); + return -EINVAL; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index f62fdef9e589..d907b445524d 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -18,4 +18,8 @@ int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +ssize_t get_uprobe_offset(const void *addr, ssize_t base); +ssize_t get_base_addr(void); +ssize_t get_rel_offset(uintptr_t addr); + #endif diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 2c8935b3e65d..ee454327e5c6 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -159,3 +159,15 @@ .result = ACCEPT, .retval = 2, }, +{ + "dead code: zero extension", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -4), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a3e593ddfafc..2debba4e8a3a 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -1,4 +1,233 @@ { + "map access: known scalar += value_ptr unknown vs const", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 4), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths or scalars", + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr const vs unknown", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 2), + BPF_MOV64_IMM(BPF_REG_1, 3), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths or scalars", + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr const vs const (ne)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 2), + BPF_MOV64_IMM(BPF_REG_1, 3), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths or scalars", + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr const vs const (eq)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 2), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr unknown vs unknown (eq)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 4), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr unknown vs unknown (lt)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 4), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x3), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths or scalars", + .result = ACCEPT, + .retval = 1, +}, +{ + "map access: known scalar += value_ptr unknown vs unknown (gt)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 1, 4), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths or scalars", + .result = ACCEPT, + .retval = 1, +}, +{ "map access: known scalar += value_ptr from different maps", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c new file mode 100644 index 000000000000..3696a8f32c23 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/if.h> +#include <unistd.h> +#include <libgen.h> +#include <sys/resource.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#include "bpf_util.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#define MAX_IFACE_NUM 32 +#define MAX_INDEX_NUM 1024 + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int ifaces[MAX_IFACE_NUM] = {}; + +static void int_exit(int sig) +{ + __u32 prog_id = 0; + int i; + + for (i = 0; ifaces[i] > 0; i++) { + if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id) + bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags); + } + + exit(0); +} + +static int get_mac_addr(unsigned int ifindex, void *mac_addr) +{ + char ifname[IF_NAMESIZE]; + struct ifreq ifr; + int fd, ret = -1; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return ret; + + if (!if_indextoname(ifindex, ifname)) + goto err_out; + + strcpy(ifr.ifr_name, ifname); + + if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) + goto err_out; + + memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); + ret = 0; + +err_out: + close(fd); + return ret; +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n" + " -F force loading prog\n" + " -X load xdp program on egress\n", + prog); +} + +int main(int argc, char **argv) +{ + int prog_fd, group_all, mac_map; + struct bpf_program *ingress_prog, *egress_prog; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_UNSPEC, + }; + int i, ret, opt, egress_prog_fd = 0; + struct bpf_devmap_val devmap_val; + bool attach_egress_prog = false; + unsigned char mac_addr[6]; + char ifname[IF_NAMESIZE]; + struct bpf_object *obj; + unsigned int ifindex; + char filename[256]; + + while ((opt = getopt(argc, argv, "SNFX")) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'X': + attach_egress_prog = true; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) { + xdp_flags |= XDP_FLAGS_DRV_MODE; + } else if (attach_egress_prog) { + printf("Load xdp program on egress with SKB mode not supported yet\n"); + goto err_out; + } + + if (optind == argc) { + printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]); + goto err_out; + } + + printf("Get interfaces"); + for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) { + ifaces[i] = if_nametoindex(argv[optind + i]); + if (!ifaces[i]) + ifaces[i] = strtoul(argv[optind + i], NULL, 0); + if (!if_indextoname(ifaces[i], ifname)) { + perror("Invalid interface name or i"); + goto err_out; + } + if (ifaces[i] > MAX_INDEX_NUM) { + printf("Interface index to large\n"); + goto err_out; + } + printf(" %d", ifaces[i]); + } + printf("\n"); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + goto err_out; + + if (attach_egress_prog) + group_all = bpf_object__find_map_fd_by_name(obj, "map_egress"); + else + group_all = bpf_object__find_map_fd_by_name(obj, "map_all"); + mac_map = bpf_object__find_map_fd_by_name(obj, "mac_map"); + + if (group_all < 0 || mac_map < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); + goto err_out; + } + + if (attach_egress_prog) { + /* Find ingress/egress prog for 2nd xdp prog */ + ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_all_prog"); + egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog"); + if (!ingress_prog || !egress_prog) { + printf("finding ingress/egress_prog in obj file failed\n"); + goto err_out; + } + prog_fd = bpf_program__fd(ingress_prog); + egress_prog_fd = bpf_program__fd(egress_prog); + if (prog_fd < 0 || egress_prog_fd < 0) { + printf("find egress_prog fd failed\n"); + goto err_out; + } + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + /* Init forward multicast groups and exclude group */ + for (i = 0; ifaces[i] > 0; i++) { + ifindex = ifaces[i]; + + if (attach_egress_prog) { + ret = get_mac_addr(ifindex, mac_addr); + if (ret < 0) { + printf("get interface %d mac failed\n", ifindex); + goto err_out; + } + ret = bpf_map_update_elem(mac_map, &ifindex, mac_addr, 0); + if (ret) { + perror("bpf_update_elem mac_map failed\n"); + goto err_out; + } + } + + /* Add all the interfaces to group all */ + devmap_val.ifindex = ifindex; + devmap_val.bpf_prog.fd = egress_prog_fd; + ret = bpf_map_update_elem(group_all, &ifindex, &devmap_val, 0); + if (ret) { + perror("bpf_map_update_elem"); + goto err_out; + } + + /* bind prog_fd to each interface */ + ret = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); + if (ret) { + printf("Set xdp fd failed on %d\n", ifindex); + goto err_out; + } + } + + /* sleep some time for testing */ + sleep(999); + + return 0; + +err_out: + return 1; +} diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 1135fb980814..f53ce2683f8d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -70,7 +70,6 @@ #include <errno.h> #include <getopt.h> #include <asm/barrier.h> -typedef __u16 __sum16; #include <linux/if_link.h> #include <linux/if_ether.h> #include <linux/ip.h> @@ -106,14 +105,9 @@ static const u16 UDP_PORT2 = 2121; static void __exit_with_error(int error, const char *file, const char *func, int line) { - if (configured_mode == TEST_MODE_UNCONFIGURED) { - ksft_exit_fail_msg - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - } else { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); - } + ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, + strerror(error)); + ksft_exit_xfail(); } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) @@ -126,7 +120,7 @@ static void __exit_with_error(int error, const char *file, const char *func, int test_type == TEST_TYPE_STATS ? "Stats" : "",\ test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) -static void *memset32_htonl(void *dest, u32 val, u32 size) +static void memset32_htonl(void *dest, u32 val, u32 size) { u32 *ptr = (u32 *)dest; int i; @@ -135,11 +129,6 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) for (i = 0; i < (size & (~0x3)); i += 4) ptr[i >> 2] = val; - - for (; i < size; i++) - ((char *)dest)[i] = ((char *)&val)[i & 3]; - - return dest; } /* @@ -230,13 +219,13 @@ static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr) ip_hdr->check = 0; } -static void gen_udp_hdr(struct generic_data *data, struct ifobject *ifobject, +static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject, struct udphdr *udp_hdr) { udp_hdr->source = htons(ifobject->src_port); udp_hdr->dest = htons(ifobject->dst_port); udp_hdr->len = htons(UDP_PKT_SIZE); - memset32_htonl(pkt_data + PKT_HDR_SIZE, htonl(data->seqnum), UDP_PKT_DATA_SIZE); + memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE); } static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) @@ -246,12 +235,7 @@ static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); } -static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) -{ - memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); -} - -static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) +static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size, int idx) { struct xsk_umem_config cfg = { .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, @@ -260,7 +244,6 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) .frame_headroom = frame_headroom, .flags = XSK_UMEM__DEFAULT_FLAGS }; - int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; struct xsk_umem_info *umem; int ret; @@ -271,7 +254,7 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, &cfg); if (ret) - exit_with_error(ret); + exit_with_error(-ret); umem->buffer = buffer; @@ -285,7 +268,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) - exit_with_error(ret); + exit_with_error(-ret); for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * XSK_UMEM__DEFAULT_FRAME_SIZE; xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); @@ -333,20 +316,19 @@ static struct option long_options[] = { {"queue", optional_argument, 0, 'q'}, {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, - {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; static void usage(const char *prog) { const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -i, --interface Use interface\n" - " -q, --queue=n Use queue n (default 0)\n" - " -D, --dump-pkts Dump packets L2 - L5\n" - " -v, --verbose Verbose output\n" - " -C, --tx-pkt-count=n Number of packets to send\n"; + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -i, --interface Use interface\n" + " -q, --queue=n Use queue n (default 0)\n" + " -D, --dump-pkts Dump packets L2 - L5\n" + " -v, --verbose Verbose output\n"; + ksft_print_msg(str, prog); } @@ -392,7 +374,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:Dv", long_options, &option_index); if (c == -1) break; @@ -413,13 +395,10 @@ static void parse_command_line(int argc, char **argv) interface_index++; break; case 'D': - debug_pkt_dump = 1; - break; - case 'C': - opt_pkt_count = atoi(optarg); + opt_pkt_dump = true; break; case 'v': - opt_verbose = 1; + opt_verbose = true; break; default: usage(basename(argv[0])); @@ -427,17 +406,143 @@ static void parse_command_line(int argc, char **argv) } } - if (!opt_pkt_count) { - print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT); - opt_pkt_count = DEFAULT_PKT_CNT; - } - if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); } } +static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb) +{ + if (pkt_nb >= pkt_stream->nb_pkts) + return NULL; + + return &pkt_stream->pkts[pkt_nb]; +} + +static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = malloc(sizeof(*pkt_stream)); + if (!pkt_stream) + exit_with_error(ENOMEM); + + pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); + if (!pkt_stream->pkts) + exit_with_error(ENOMEM); + + pkt_stream->nb_pkts = nb_pkts; + for (i = 0; i < nb_pkts; i++) { + pkt_stream->pkts[i].addr = (i % num_frames) * XSK_UMEM__DEFAULT_FRAME_SIZE; + pkt_stream->pkts[i].len = pkt_len; + pkt_stream->pkts[i].payload = i; + } + + return pkt_stream; +} + +static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb) +{ + struct pkt *pkt = pkt_stream_get_pkt(ifobject->pkt_stream, pkt_nb); + struct udphdr *udp_hdr; + struct ethhdr *eth_hdr; + struct iphdr *ip_hdr; + void *data; + + if (!pkt) + return NULL; + + data = xsk_umem__get_data(ifobject->umem->buffer, pkt->addr); + udp_hdr = (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + eth_hdr = (struct ethhdr *)data; + + gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); + + return pkt; +} + +static void pkt_dump(void *pkt, u32 len) +{ + char s[INET_ADDRSTRLEN]; + struct ethhdr *ethhdr; + struct udphdr *udphdr; + struct iphdr *iphdr; + int payload, i; + + ethhdr = pkt; + iphdr = pkt + sizeof(*ethhdr); + udphdr = pkt + sizeof(*ethhdr) + sizeof(*iphdr); + + /*extract L2 frame */ + fprintf(stdout, "DEBUG>> L2: dst mac: "); + for (i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ethhdr->h_dest[i]); + + fprintf(stdout, "\nDEBUG>> L2: src mac: "); + for (i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ethhdr->h_source[i]); + + /*extract L3 frame */ + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); + /*extract L4 frame */ + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); + /*extract L5 frame */ + payload = *((uint32_t *)(pkt + PKT_HDR_SIZE)); + + fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); + fprintf(stdout, "---------------------------------------\n"); +} + +static bool is_pkt_valid(struct pkt *pkt, void *buffer, const struct xdp_desc *desc) +{ + void *data = xsk_umem__get_data(buffer, desc->addr); + struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + + if (!pkt) { + ksft_test_result_fail("ERROR: [%s] too many packets received\n", __func__); + return false; + } + + if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { + u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); + + if (opt_pkt_dump && test_type != TEST_TYPE_STATS) + pkt_dump(data, PKT_SIZE); + + if (pkt->len != desc->len) { + ksft_test_result_fail + ("ERROR: [%s] expected length [%d], got length [%d]\n", + __func__, pkt->len, desc->len); + return false; + } + + if (pkt->payload != seqnum) { + ksft_test_result_fail + ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n", + __func__, pkt->payload, seqnum); + return false; + } + } else { + ksft_print_msg("Invalid frame received: "); + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, + iphdr->tos); + return false; + } + + return true; +} + static void kick_tx(struct xsk_socket_info *xsk) { int ret; @@ -448,7 +553,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +static void complete_pkts(struct xsk_socket_info *xsk, int batch_size) { unsigned int rcvd; u32 idx; @@ -463,133 +568,108 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) if (rcvd) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; } } -static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) +static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *xsk, + struct pollfd *fds) { - unsigned int rcvd, i; - u32 idx_rx = 0, idx_fq = 0; + u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkt_count = 0; + struct pkt *pkt; int ret; - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret < 0) - exit_with_error(ret); + pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++); + while (pkt) { + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + continue; } - return; - } - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { if (ret < 0) - exit_with_error(ret); + exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - } - - for (i = 0; i < rcvd; i++) { - u64 addr, orig; - - addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - orig = xsk_umem__extract_addr(addr); - addr = xsk_umem__add_offset_to_addr(addr); - pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE); - if (!pkt_node_rx) - exit_with_error(errno); + for (i = 0; i < rcvd; i++) { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + u64 addr = desc->addr, orig; - pkt_node_rx->pkt_frame = malloc(PKT_SIZE); - if (!pkt_node_rx->pkt_frame) - exit_with_error(errno); + orig = xsk_umem__extract_addr(addr); + addr = xsk_umem__add_offset_to_addr(addr); + if (!is_pkt_valid(pkt, xsk->umem->buffer, desc)) + return; - memcpy(pkt_node_rx->pkt_frame, xsk_umem__get_data(xsk->umem->buffer, addr), - PKT_SIZE); - - TAILQ_INSERT_HEAD(&head, pkt_node_rx, pkt_nodes); + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++); + } - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); } - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; } -static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) +static u32 __send_pkts(struct ifobject *ifobject, u32 pkt_nb) { - u32 idx = 0; - unsigned int i; - bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; - u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; + struct xsk_socket_info *xsk = ifobject->xsk; + u32 i, idx; - while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) - complete_tx_only(xsk, batch_size); + while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) + complete_pkts(xsk, BATCH_SIZE); - for (i = 0; i < batch_size; i++) { + for (i = 0; i < BATCH_SIZE; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + struct pkt *pkt = pkt_generate(ifobject, pkt_nb); - tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = len; - } + if (!pkt) + break; - xsk_ring_prod__submit(&xsk->tx, batch_size); - if (!tx_invalid_test) { - xsk->outstanding_tx += batch_size; - } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { - kick_tx(xsk); + tx_desc->addr = pkt->addr; + tx_desc->len = pkt->len; + pkt_nb++; } - *frameptr += batch_size; - *frameptr %= num_frames; - complete_tx_only(xsk, batch_size); -} - -static int get_batch_size(int pkt_cnt) -{ - if (!opt_pkt_count) - return BATCH_SIZE; - if (pkt_cnt + BATCH_SIZE <= opt_pkt_count) - return BATCH_SIZE; + xsk_ring_prod__submit(&xsk->tx, i); + if (stat_test_type != STAT_TEST_TX_INVALID) + xsk->outstanding_tx += i; + else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + complete_pkts(xsk, i); - return opt_pkt_count - pkt_cnt; + return i; } -static void complete_tx_only_all(struct ifobject *ifobject) +static void wait_for_tx_completion(struct xsk_socket_info *xsk) { - bool pending; - - do { - pending = false; - if (ifobject->xsk->outstanding_tx) { - complete_tx_only(ifobject->xsk, BATCH_SIZE); - pending = !!ifobject->xsk->outstanding_tx; - } - } while (pending); + while (xsk->outstanding_tx) + complete_pkts(xsk, BATCH_SIZE); } -static void tx_only_all(struct ifobject *ifobject) +static void send_pkts(struct ifobject *ifobject) { struct pollfd fds[MAX_SOCKS] = { }; - u32 frame_nb = 0; - int pkt_cnt = 0; - int ret; + u32 pkt_cnt = 0; fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLOUT; - while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { - int batch_size = get_batch_size(pkt_cnt); + while (pkt_cnt < ifobject->pkt_stream->nb_pkts) { + u32 sent; if (test_type == TEST_TYPE_POLL) { + int ret; + ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -598,78 +678,30 @@ static void tx_only_all(struct ifobject *ifobject) continue; } - tx_only(ifobject->xsk, &frame_nb, batch_size); - pkt_cnt += batch_size; + sent = __send_pkts(ifobject, pkt_cnt); + pkt_cnt += sent; + usleep(10); } - if (opt_pkt_count) - complete_tx_only_all(ifobject); + wait_for_tx_completion(ifobject->xsk); } -static void worker_pkt_dump(void) -{ - struct ethhdr *ethhdr; - struct iphdr *iphdr; - struct udphdr *udphdr; - char s[128]; - int payload; - void *ptr; - - fprintf(stdout, "---------------------------------------\n"); - for (int iter = 0; iter < num_frames - 1; iter++) { - ptr = pkt_buf[iter]->payload; - ethhdr = ptr; - iphdr = ptr + sizeof(*ethhdr); - udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr); - - /*extract L2 frame */ - fprintf(stdout, "DEBUG>> L2: dst mac: "); - for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_dest[i]); - - fprintf(stdout, "\nDEBUG>> L2: src mac: "); - for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_source[i]); - - /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", - inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", - inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); - /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); - /*extract L5 frame */ - payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); - - if (payload == EOT) { - print_verbose("End-of-transmission frame received\n"); - fprintf(stdout, "---------------------------------------\n"); - break; - } - fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); - fprintf(stdout, "---------------------------------------\n"); - } -} - -static void worker_stats_validate(struct ifobject *ifobject) +static bool rx_stats_are_valid(struct ifobject *ifobject) { + u32 xsk_stat = 0, expected_stat = ifobject->pkt_stream->nb_pkts; + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); struct xdp_statistics stats; socklen_t optlen; int err; - struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? - ifdict[!ifobject->ifdict_index]->xsk->xsk : - ifobject->xsk->xsk; - int fd = xsk_socket__fd(xsk); - unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; - - sigvar = 0; optlen = sizeof(stats); err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) - return; + if (err) { + ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return true; + } if (optlen == sizeof(struct xdp_statistics)) { switch (stat_test_type) { @@ -677,8 +709,7 @@ static void worker_stats_validate(struct ifobject *ifobject) xsk_stat = stats.rx_dropped; break; case STAT_TEST_TX_INVALID: - xsk_stat = stats.tx_invalid_descs; - break; + return true; case STAT_TEST_RX_FULL: xsk_stat = stats.rx_ring_full; expected_stat -= RX_FULL_RXQSIZE; @@ -691,99 +722,70 @@ static void worker_stats_validate(struct ifobject *ifobject) } if (xsk_stat == expected_stat) - sigvar = 1; + return true; } + + return false; } -static void worker_pkt_validate(void) +static void tx_stats_validate(struct ifobject *ifobject) { - u32 payloadseqnum = -2; - struct iphdr *iphdr; - - while (1) { - pkt_node_rx_q = TAILQ_LAST(&head, head_s); - if (!pkt_node_rx_q) - break; - - iphdr = (struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)); - - /*do not increment pktcounter if !(tos=0x9 and ipv4) */ - if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { - payloadseqnum = *((uint32_t *)(pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); - if (debug_pkt_dump && payloadseqnum != EOT) { - pkt_obj = malloc(sizeof(*pkt_obj)); - pkt_obj->payload = malloc(PKT_SIZE); - memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE); - pkt_buf[payloadseqnum] = pkt_obj; - } - - if (payloadseqnum == EOT) { - print_verbose("End-of-transmission frame received: PASS\n"); - sigvar = 1; - break; - } + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + struct xdp_statistics stats; + socklen_t optlen; + int err; - if (prev_pkt + 1 != payloadseqnum) { - ksft_test_result_fail - ("ERROR: [%s] prev_pkt [%d], payloadseqnum [%d]\n", - __func__, prev_pkt, payloadseqnum); - ksft_exit_xfail(); - } + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) { + ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return; + } - prev_pkt = payloadseqnum; - pkt_counter++; - } else { - ksft_print_msg("Invalid frame received: "); - ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, - iphdr->tos); - } + if (stats.tx_invalid_descs == ifobject->pkt_stream->nb_pkts) + return; - TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); - free(pkt_node_rx_q->pkt_frame); - free(pkt_node_rx_q); - pkt_node_rx_q = NULL; - } + ksft_test_result_fail("ERROR: [%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", + __func__, stats.tx_invalid_descs, ifobject->pkt_stream->nb_pkts); } static void thread_common_ops(struct ifobject *ifobject, void *bufs) { - int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; + u64 umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + size_t mmap_sz = umem_sz; int ctr = 0; int ret; ifobject->ns_fd = switch_namespace(ifobject->nsname); if (test_type == TEST_TYPE_BPF_RES) - umem_sz *= 2; + mmap_sz *= 2; - bufs = mmap(NULL, umem_sz, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (bufs == MAP_FAILED) exit_with_error(errno); - xsk_configure_umem(ifobject, bufs, 0); - ifobject->umem = ifobject->umem_arr[0]; - ret = xsk_configure_socket(ifobject, 0); - - /* Retry Create Socket if it fails as xsk_socket__create() - * is asynchronous - */ - while (ret && ctr < SOCK_RECONF_CTR) { - xsk_configure_umem(ifobject, bufs, 0); + while (ctr++ < SOCK_RECONF_CTR) { + xsk_configure_umem(ifobject, bufs, umem_sz, 0); ifobject->umem = ifobject->umem_arr[0]; ret = xsk_configure_socket(ifobject, 0); + if (!ret) + break; + + /* Retry Create Socket if it fails as xsk_socket__create() is asynchronous */ usleep(USLEEP_MAX); - ctr++; + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(-ret); } - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(ret); - ifobject->umem = ifobject->umem_arr[0]; ifobject->xsk = ifobject->xsk_arr[0]; if (test_type == TEST_TYPE_BPF_RES) { - xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); + xsk_configure_umem(ifobject, (u8 *)bufs + umem_sz, umem_sz, 1); ifobject->umem = ifobject->umem_arr[1]; ret = xsk_configure_socket(ifobject, 1); } @@ -809,33 +811,18 @@ static void testapp_cleanup_xsk_res(struct ifobject *ifobj) static void *worker_testapp_validate_tx(void *arg) { - struct udphdr *udp_hdr = - (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; struct ifobject *ifobject = (struct ifobject *)arg; - struct generic_data data; void *bufs = NULL; if (!second_step) thread_common_ops(ifobject, bufs); - for (int i = 0; i < num_frames; i++) { - /*send EOT frame */ - if (i == (num_frames - 1)) - data.seqnum = -1; - else - data.seqnum = i; - gen_udp_hdr(&data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } + print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts, + ifobject->ifname); + send_pkts(ifobject); - print_verbose("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ifobject->ifname); - tx_only_all(ifobject); + if (stat_test_type == STAT_TEST_TX_INVALID) + tx_stats_validate(ifobject); testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); @@ -853,31 +840,16 @@ static void *worker_testapp_validate_rx(void *arg) if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) xsk_populate_fill_ring(ifobject->umem); - TAILQ_INIT(&head); - if (debug_pkt_dump) { - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); - if (!pkt_buf) - exit_with_error(errno); - } - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLIN; pthread_barrier_wait(&barr); - while (1) { - if (test_type != TEST_TYPE_STATS) { - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); - } else { - worker_stats_validate(ifobject); - } - if (sigvar) - break; - } - - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + if (test_type == TEST_TYPE_STATS) + while (!rx_stats_are_valid(ifobject)) + continue; + else + receive_pkts(ifobject->pkt_stream, ifobject->xsk, fds); if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); @@ -890,10 +862,18 @@ static void testapp_validate(void) { bool bidi = test_type == TEST_TYPE_BIDI; bool bpf = test_type == TEST_TYPE_BPF_RES; + struct pkt_stream *pkt_stream; if (pthread_barrier_init(&barr, NULL, 2)) exit_with_error(errno); + if (stat_test_type == STAT_TEST_TX_INVALID) + pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, XSK_UMEM__INVALID_FRAME_SIZE); + else + pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, PKT_SIZE); + ifdict_tx->pkt_stream = pkt_stream; + ifdict_rx->pkt_stream = pkt_stream; + /*Spawn RX thread */ pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); @@ -907,15 +887,6 @@ static void testapp_validate(void) pthread_join(t1, NULL); pthread_join(t0, NULL); - if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { - worker_pkt_dump(); - for (int iter = 0; iter < num_frames - 1; iter++) { - free(pkt_buf[iter]->payload); - free(pkt_buf[iter]); - } - free(pkt_buf); - } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } @@ -925,9 +896,6 @@ static void testapp_teardown(void) int i; for (i = 0; i < MAX_TEARDOWN_ITER; i++) { - pkt_counter = 0; - prev_pkt = -1; - sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); } @@ -953,9 +921,6 @@ static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) static void testapp_bidi(void) { for (int i = 0; i < MAX_BIDI_ITER; i++) { - pkt_counter = 0; - prev_pkt = -1; - sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); if (!second_step) { @@ -987,9 +952,6 @@ static void testapp_bpf_res(void) int i; for (i = 0; i < MAX_BPF_ITER; i++) { - pkt_counter = 0; - prev_pkt = -1; - sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); if (!second_step) @@ -1017,6 +979,8 @@ static void testapp_stats(void) case STAT_TEST_RX_FULL: rxqsize = RX_FULL_RXQSIZE; break; + case STAT_TEST_TX_INVALID: + continue; default: break; } @@ -1062,10 +1026,7 @@ static void run_pkt_test(int mode, int type) /* reset defaults after potential previous test */ xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - pkt_counter = 0; second_step = 0; - prev_pkt = -1; - sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; @@ -1102,62 +1063,70 @@ static void run_pkt_test(int mode, int type) } } +static struct ifobject *ifobject_create(void) +{ + struct ifobject *ifobj; + + ifobj = calloc(1, sizeof(struct ifobject)); + if (!ifobj) + return NULL; + + ifobj->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); + if (!ifobj->xsk_arr) + goto out_xsk_arr; + + ifobj->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); + if (!ifobj->umem_arr) + goto out_umem_arr; + + return ifobj; + +out_umem_arr: + free(ifobj->xsk_arr); +out_xsk_arr: + free(ifobj); + return NULL; +} + +static void ifobject_delete(struct ifobject *ifobj) +{ + free(ifobj->umem_arr); + free(ifobj->xsk_arr); + free(ifobj); +} + int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; - bool failure = false; int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) exit_with_error(errno); - for (int i = 0; i < MAX_INTERFACES; i++) { - ifdict[i] = malloc(sizeof(struct ifobject)); + for (i = 0; i < MAX_INTERFACES; i++) { + ifdict[i] = ifobject_create(); if (!ifdict[i]) - exit_with_error(errno); - - ifdict[i]->ifdict_index = i; - ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); - if (!ifdict[i]->xsk_arr) { - failure = true; - goto cleanup; - } - ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); - if (!ifdict[i]->umem_arr) { - failure = true; - goto cleanup; - } + exit_with_error(ENOMEM); } setlocale(LC_ALL, ""); parse_command_line(argc, argv); - num_frames = ++opt_pkt_count; - - init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); - init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); + init_iface(ifdict[tx], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); + init_iface(ifdict[rx], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - for (i = 0; i < TEST_MODE_MAX; i++) { - for (j = 0; j < TEST_TYPE_MAX; j++) + for (i = 0; i < TEST_MODE_MAX; i++) + for (j = 0; j < TEST_TYPE_MAX; j++) { run_pkt_test(i, j); - } - -cleanup: - for (int i = 0; i < MAX_INTERFACES; i++) { - if (ifdict[i]->ns_fd != -1) - close(ifdict[i]->ns_fd); - free(ifdict[i]->xsk_arr); - free(ifdict[i]->umem_arr); - free(ifdict[i]); - } + usleep(USLEEP_MAX); + } - if (failure) - exit_with_error(errno); + for (i = 0; i < MAX_INTERFACES; i++) + ifobject_delete(ifdict[i]); ksft_exit_pass(); - return 0; } diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 6c428b276ab6..7e49b9fbe25e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -34,28 +34,23 @@ #define IP_PKT_TOS 0x9 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define EOT (-1) -#define USLEEP_MAX 200000 +#define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define BATCH_SIZE 8 #define POLL_TMOUT 1000 -#define DEFAULT_PKT_CNT 10000 +#define DEFAULT_PKT_CNT (4 * 1024) #define RX_FULL_RXQSIZE 32 +#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1) #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) -typedef __u32 u32; -typedef __u16 u16; -typedef __u8 u8; - -enum TEST_MODES { - TEST_MODE_UNCONFIGURED = -1, +enum test_mode { TEST_MODE_SKB, TEST_MODE_DRV, TEST_MODE_MAX }; -enum TEST_TYPES { +enum test_type { TEST_TYPE_NOPOLL, TEST_TYPE_POLL, TEST_TYPE_TEARDOWN, @@ -65,7 +60,7 @@ enum TEST_TYPES { TEST_TYPE_MAX }; -enum STAT_TEST_TYPES { +enum stat_test_type { STAT_TEST_RX_DROPPED, STAT_TEST_TX_INVALID, STAT_TEST_RX_FULL, @@ -73,21 +68,16 @@ enum STAT_TEST_TYPES { STAT_TEST_TYPE_MAX }; -static int configured_mode = TEST_MODE_UNCONFIGURED; -static u8 debug_pkt_dump; -static u32 num_frames; +static int configured_mode; +static bool opt_pkt_dump; +static u32 num_frames = DEFAULT_PKT_CNT / 4; static bool second_step; static int test_type; -static int opt_pkt_count; -static u8 opt_verbose; +static bool opt_verbose; static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; -static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; -static u32 pkt_counter; -static long prev_pkt = -1; -static int sigvar; static int stat_test_type; static u32 rxqsize; static u32 frame_headroom; @@ -104,10 +94,6 @@ struct xsk_socket_info { struct xsk_ring_prod tx; struct xsk_umem_info *umem; struct xsk_socket *xsk; - unsigned long rx_npkts; - unsigned long tx_npkts; - unsigned long prev_rx_npkts; - unsigned long prev_tx_npkts; u32 outstanding_tx; }; @@ -118,8 +104,15 @@ struct flow_vector { } vector; }; -struct generic_data { - u32 seqnum; +struct pkt { + u64 addr; + u32 len; + u32 payload; +}; + +struct pkt_stream { + u32 nb_pkts; + struct pkt *pkts; }; struct ifobject { @@ -131,8 +124,8 @@ struct ifobject { struct xsk_umem_info *umem; void *(*func_ptr)(void *arg); struct flow_vector fv; + struct pkt_stream *pkt_stream; int ns_fd; - int ifdict_index; u32 dst_ip; u32 src_ip; u16 src_port; @@ -149,18 +142,4 @@ static struct ifobject *ifdict_tx; pthread_barrier_t barr; pthread_t t0, t1; -TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); -struct head_s *head_p; -struct pkt { - char *pkt_frame; - - TAILQ_ENTRY(pkt) pkt_nodes; -} *pkt_node_rx, *pkt_node_rx_q; - -struct pkt_frame { - char *payload; -} *pkt_obj; - -struct pkt_frame **pkt_buf; - #endif /* XDPXCEIVER_H */ diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index dac1c5f78752..bf29d2549bee 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -8,14 +8,8 @@ ksft_xfail=2 ksft_xpass=3 ksft_skip=4 -GREEN='\033[0;92m' -YELLOW='\033[0;93m' -RED='\033[0;31m' -NC='\033[0m' -STACK_LIM=131072 SPECFILE=veth.spec XSKOBJ=xdpxceiver -NUMPKTS=10000 validate_root_exec() { @@ -50,22 +44,12 @@ validate_veth_spec_file() test_status() { statusval=$1 - if [ -n "${colorconsole+set}" ]; then - if [ $statusval -eq 2 ]; then - echo -e "${YELLOW}$2${NC}: [ ${RED}FAIL${NC} ]" - elif [ $statusval -eq 1 ]; then - echo -e "${YELLOW}$2${NC}: [ ${RED}SKIPPED${NC} ]" - elif [ $statusval -eq 0 ]; then - echo -e "${YELLOW}$2${NC}: [ ${GREEN}PASS${NC} ]" - fi - else - if [ $statusval -eq 2 ]; then - echo -e "$2: [ FAIL ]" - elif [ $statusval -eq 1 ]; then - echo -e "$2: [ SKIPPED ]" - elif [ $statusval -eq 0 ]; then - echo -e "$2: [ PASS ]" - fi + if [ $statusval -eq 2 ]; then + echo -e "$2: [ FAIL ]" + elif [ $statusval -eq 1 ]; then + echo -e "$2: [ SKIPPED ]" + elif [ $statusval -eq 0 ]; then + echo -e "$2: [ PASS ]" fi } @@ -107,5 +91,5 @@ validate_ip_utility() execxdpxceiver() { - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 84cfcabea838..be9643ef6285 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -2,4 +2,5 @@ test_memcontrol test_core test_freezer -test_kmem
\ No newline at end of file +test_kmem +test_kill diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index f027d933595b..59e222460581 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer +TEST_GEN_PROGS += test_kill include ../lib.mk @@ -16,3 +17,4 @@ $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h +$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 027014662fb2..623cec04ad42 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -5,10 +5,12 @@ #include <errno.h> #include <fcntl.h> #include <linux/limits.h> +#include <poll.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/inotify.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/wait.h> @@ -252,6 +254,10 @@ int cg_killall(const char *cgroup) char buf[PAGE_SIZE]; char *ptr = buf; + /* If cgroup.kill exists use it. */ + if (!cg_write(cgroup, "cgroup.kill", "1")) + return 0; + if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) return -1; @@ -576,3 +582,48 @@ int clone_into_cgroup_run_wait(const char *cgroup) (void)clone_reap(pid, WEXITED); return 0; } + +int cg_prepare_for_wait(const char *cgroup) +{ + int fd, ret = -1; + + fd = inotify_init1(0); + if (fd == -1) + return fd; + + ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"), + IN_MODIFY); + if (ret == -1) { + close(fd); + fd = -1; + } + + return fd; +} + +int cg_wait_for(int fd) +{ + int ret = -1; + struct pollfd fds = { + .fd = fd, + .events = POLLIN, + }; + + while (true) { + ret = poll(&fds, 1, 10000); + + if (ret == -1) { + if (errno == EINTR) + continue; + + break; + } + + if (ret > 0 && fds.revents & POLLIN) { + ret = 0; + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 5a1305dd1f0b..82e59cdf16e7 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -54,3 +54,5 @@ extern pid_t clone_into_cgroup(int cgroup_fd); extern int clone_reap(pid_t pid, int options); extern int clone_into_cgroup_run_wait(const char *cgroup); extern int dirfd_open_opath(const char *dir); +extern int cg_prepare_for_wait(const char *cgroup); +extern int cg_wait_for(int fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 23d8fa4a3e4e..ff519029f6f4 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -7,9 +7,7 @@ #include <unistd.h> #include <stdio.h> #include <errno.h> -#include <poll.h> #include <stdlib.h> -#include <sys/inotify.h> #include <string.h> #include <sys/wait.h> @@ -55,61 +53,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze) } /* - * Prepare for waiting on cgroup.events file. - */ -static int cg_prepare_for_wait(const char *cgroup) -{ - int fd, ret = -1; - - fd = inotify_init1(0); - if (fd == -1) { - debug("Error: inotify_init1() failed\n"); - return fd; - } - - ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"), - IN_MODIFY); - if (ret == -1) { - debug("Error: inotify_add_watch() failed\n"); - close(fd); - fd = -1; - } - - return fd; -} - -/* - * Wait for an event. If there are no events for 10 seconds, - * treat this an error. - */ -static int cg_wait_for(int fd) -{ - int ret = -1; - struct pollfd fds = { - .fd = fd, - .events = POLLIN, - }; - - while (true) { - ret = poll(&fds, 1, 10000); - - if (ret == -1) { - if (errno == EINTR) - continue; - debug("Error: poll() failed\n"); - break; - } - - if (ret > 0 && fds.revents & POLLIN) { - ret = 0; - break; - } - } - - return ret; -} - -/* * Attach a task to the given cgroup and wait for a cgroup frozen event. * All transient events (e.g. populated) are ignored. */ diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c new file mode 100644 index 000000000000..6153690319c9 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <errno.h> +#include <linux/limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include "../kselftest.h" +#include "../pidfd/pidfd.h" +#include "cgroup_util.h" + +/* + * Kill the given cgroup and wait for the inotify signal. + * If there are no events in 10 seconds, treat this as an error. + * Then check that the cgroup is in the desired state. + */ +static int cg_kill_wait(const char *cgroup) +{ + int fd, ret = -1; + + fd = cg_prepare_for_wait(cgroup); + if (fd < 0) + return fd; + + ret = cg_write(cgroup, "cgroup.kill", "1"); + if (ret) + goto out; + + ret = cg_wait_for(fd); + if (ret) + goto out; + +out: + close(fd); + return ret; +} + +/* + * A simple process running in a sleep loop until being + * re-parented. + */ +static int child_fn(const char *cgroup, void *arg) +{ + int ppid = getppid(); + + while (getppid() == ppid) + usleep(1000); + + return getppid() == ppid; +} + +static int test_cgkill_simple(const char *root) +{ + pid_t pids[100]; + int ret = KSFT_FAIL; + char *cgroup = NULL; + int i; + + cgroup = cg_name(root, "cg_test_simple"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + for (i = 0; i < 100; i++) + pids[i] = cg_run_nowait(cgroup, child_fn, NULL); + + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + + if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n")) + goto cleanup; + + if (cg_kill_wait(cgroup)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (i = 0; i < 100; i++) + wait_for_pid(pids[i]); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * / / \ \ + * B E I K + * /\ | + * C D F + * | + * G + * | + * H + * + * with a process in C, H and 3 processes in K. + * Then it tries to kill the whole tree. + */ +static int test_cgkill_tree(const char *root) +{ + pid_t pids[5]; + char *cgroup[10] = {0}; + int ret = KSFT_FAIL; + int i; + + cgroup[0] = cg_name(root, "cg_test_tree_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + cgroup[3] = cg_name(cgroup[1], "D"); + if (!cgroup[3]) + goto cleanup; + + cgroup[4] = cg_name(cgroup[0], "E"); + if (!cgroup[4]) + goto cleanup; + + cgroup[5] = cg_name(cgroup[4], "F"); + if (!cgroup[5]) + goto cleanup; + + cgroup[6] = cg_name(cgroup[5], "G"); + if (!cgroup[6]) + goto cleanup; + + cgroup[7] = cg_name(cgroup[6], "H"); + if (!cgroup[7]) + goto cleanup; + + cgroup[8] = cg_name(cgroup[0], "I"); + if (!cgroup[8]) + goto cleanup; + + cgroup[9] = cg_name(cgroup[0], "K"); + if (!cgroup[9]) + goto cleanup; + + for (i = 0; i < 10; i++) + if (cg_create(cgroup[i])) + goto cleanup; + + pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL); + pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL); + pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL); + pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL); + pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL); + + /* + * Wait until all child processes will enter + * corresponding cgroups. + */ + + if (cg_wait_for_proc_count(cgroup[2], 1) || + cg_wait_for_proc_count(cgroup[7], 1) || + cg_wait_for_proc_count(cgroup[9], 3)) + goto cleanup; + + /* + * Kill A and check that we get an empty notification. + */ + if (cg_kill_wait(cgroup[0])) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (i = 0; i < 5; i++) + wait_for_pid(pids[i]); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + for (i = 9; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + +static int forkbomb_fn(const char *cgroup, void *arg) +{ + int ppid; + + fork(); + fork(); + + ppid = getppid(); + + while (getppid() == ppid) + usleep(1000); + + return getppid() == ppid; +} + +/* + * The test runs a fork bomb in a cgroup and tries to kill it. + */ +static int test_cgkill_forkbomb(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + pid_t pid = -ESRCH; + + cgroup = cg_name(root, "cg_forkbomb_test"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + pid = cg_run_nowait(cgroup, forkbomb_fn, NULL); + if (pid < 0) + goto cleanup; + + usleep(100000); + + if (cg_kill_wait(cgroup)) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup, 0)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (pid > 0) + wait_for_pid(pid); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +#define T(x) { x, #x } +struct cgkill_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_cgkill_simple), + T(test_cgkill_tree), + T(test_cgkill_forkbomb), +}; +#undef T + +int main(int argc, char *argv[]) +{ + char root[PATH_MAX]; + int i, ret = EXIT_SUCCESS; + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/cpufreq/config b/tools/testing/selftests/cpufreq/config index 27ff72ebd0f5..75e900793e8a 100644 --- a/tools/testing/selftests/cpufreq/config +++ b/tools/testing/selftests/cpufreq/config @@ -6,7 +6,7 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y CONFIG_DEBUG_RT_MUTEXES=y -CONFIG_DEBUG_PI_LIST=y +CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile new file mode 100644 index 000000000000..8a3f2cd9fec0 --- /dev/null +++ b/tools/testing/selftests/damon/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for damon selftests + +TEST_FILES = _chk_dependency.sh +TEST_PROGS = debugfs_attrs.sh + +include ../lib.mk diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh new file mode 100644 index 000000000000..0189db81550b --- /dev/null +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +DBGFS=/sys/kernel/debug/damon + +if [ $EUID -ne 0 ]; +then + echo "Run as root" + exit $ksft_skip +fi + +if [ ! -d "$DBGFS" ] +then + echo "$DBGFS not found" + exit $ksft_skip +fi + +for f in attrs target_ids monitor_on +do + if [ ! -f "$DBGFS/$f" ] + then + echo "$f not found" + exit 1 + fi +done diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh new file mode 100644 index 000000000000..bfabb19dc0d3 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $? -ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +# Test attrs file +# =============== + +file="$DBGFS/attrs" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" +test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" +test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ + "min_nr_regions > max_nr_regions" +test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" +echo "$orig_content" > "$file" + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content "$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" + +echo "PASS" diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index 4de902ea14d8..de1c4e6de0b2 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <errno.h> -#include <linux/fcntl.h> +#include <fcntl.h> #include <malloc.h> #include <sys/ioctl.h> diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index 4029833f7e27..160891dcb4bc 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -109,6 +109,9 @@ router_destroy() __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 tc qdisc del dev $rp2 clsact + + ip link set dev $rp2 down + ip link set dev $rp1 down } setup_prepare() diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 42d44e27802c..190c1b6b5365 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -111,6 +111,9 @@ router_destroy() __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 tc qdisc del dev $rp2 clsact + + ip link set dev $rp2 down + ip link set dev $rp1 down } setup_prepare() diff --git a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh index 65f43a7ce9c9..1e9a4aff76a2 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh @@ -7,6 +7,8 @@ PORT_NUM_NETIFS=0 +declare -a unsplit + port_setup_prepare() { : @@ -20,12 +22,12 @@ port_cleanup() devlink port unsplit $port check_err $? "Did not unsplit $netdev" done + unsplit=() } split_all_ports() { local should_fail=$1; shift - local -a unsplit # Loop over the splittable netdevs and create tuples of netdev along # with its width. For example: diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh index 5cbff8038f84..28a570006d4d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh @@ -93,7 +93,9 @@ switch_destroy() lldptool -T -i $swp1 -V APP -d $(dscp_map 10) >/dev/null lldpad_app_wait_del + ip link set dev $swp2 down ip link set dev $swp2 nomaster + ip link set dev $swp1 down ip link set dev $swp1 nomaster ip link del dev br1 } diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh index 27de3d9ed08e..f4493ef9cca1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh @@ -29,37 +29,38 @@ cleanup() get_prio_pg() { - __mlnx_qos -i $swp | sed -n '/^PFC/,/^[^[:space:]]/p' | - grep buffer | sed 's/ \+/ /g' | cut -d' ' -f 2- + # Produces a string of numbers "<B0> <B1> ... <B7> ", where BX is number + # of buffer that priority X is mapped to. + dcb -j buffer show dev $swp | + jq -r '[.prio_buffer | .[] | tostring + " "] | add' } get_prio_pfc() { - __mlnx_qos -i $swp | sed -n '/^PFC/,/^[^[:space:]]/p' | - grep enabled | sed 's/ \+/ /g' | cut -d' ' -f 2- + # Produces a string of numbers "<P0> <P1> ... <P7> ", where PX denotes + # whether priority X has PFC enabled (the value is 1) or disabled (0). + dcb -j pfc show dev $swp | + jq -r '[.prio_pfc | .[] | if . then "1 " else "0 " end] | add' } get_prio_tc() { - __mlnx_qos -i $swp | sed -n '/^tc/,$p' | - awk '/^tc/ { TC = $2 } - /priority:/ { PRIO[$2]=TC } - END { - for (i in PRIO) - printf("%d ", PRIO[i]) - }' + # Produces a string of numbers "<T0> <T1> ... <T7> ", where TC is number + # of TC that priority X is mapped to. + dcb -j ets show dev $swp | + jq -r '[.prio_tc | .[] | tostring + " "] | add' } get_buf_size() { local idx=$1; shift - __mlnx_qos -i $swp | grep Receive | sed 's/.*: //' | cut -d, -f $((idx + 1)) + dcb -j buffer show dev $swp | jq ".buffer_size[$idx]" } get_tot_size() { - __mlnx_qos -i $swp | grep Receive | sed 's/.*total_size=//' + dcb -j buffer show dev $swp | jq '.total_size' } check_prio_pg() @@ -121,18 +122,18 @@ test_dcb_ets() { RET=0 - __mlnx_qos -i $swp --prio_tc=0,2,4,6,1,3,5,7 > /dev/null + dcb ets set dev $swp prio-tc 0:0 1:2 2:4 3:6 4:1 5:3 6:5 7:7 check_prio_pg "0 2 4 6 1 3 5 7 " check_prio_tc "0 2 4 6 1 3 5 7 " check_prio_pfc "0 0 0 0 0 0 0 0 " - __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null + dcb ets set dev $swp prio-tc all:0 check_prio_pg "0 0 0 0 0 0 0 0 " check_prio_tc "0 0 0 0 0 0 0 0 " - __mlnx_qos -i $swp --prio2buffer=1,3,5,7,0,2,4,6 &> /dev/null + dcb buffer set dev $swp prio-buffer 0:1 1:3 2:5 3:7 4:0 5:2 6:4 7:6 2>/dev/null check_fail $? "prio2buffer accepted in DCB mode" log_test "Configuring headroom through ETS" @@ -174,7 +175,7 @@ test_pfc() { RET=0 - __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,1,2,3 > /dev/null + dcb ets set dev $swp prio-tc all:0 5:1 6:2 7:3 local buf0size=$(get_buf_size 0) local buf1size=$(get_buf_size 1) @@ -193,7 +194,7 @@ test_pfc() RET=0 - __mlnx_qos -i $swp --pfc=0,0,0,0,0,1,1,1 --cable_len=0 > /dev/null + dcb pfc set dev $swp prio-pfc all:off 5:on 6:on 7:on delay 0 check_prio_pg "0 0 0 0 0 1 2 3 " check_prio_pfc "0 0 0 0 0 1 1 1 " @@ -210,7 +211,7 @@ test_pfc() RET=0 - __mlnx_qos -i $swp --pfc=0,0,0,0,0,1,1,1 --cable_len=1000 > /dev/null + dcb pfc set dev $swp delay 1000 check_buf_size 0 "== $buf0size" check_buf_size 1 "> $buf1size" @@ -221,8 +222,8 @@ test_pfc() RET=0 - __mlnx_qos -i $swp --pfc=0,0,0,0,0,0,0,0 --cable_len=0 > /dev/null - __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null + dcb pfc set dev $swp prio-pfc all:off delay 0 + dcb ets set dev $swp prio-tc all:0 check_prio_pg "0 0 0 0 0 0 0 0 " check_prio_tc "0 0 0 0 0 0 0 0 " @@ -242,13 +243,13 @@ test_tc_priomap() { RET=0 - __mlnx_qos -i $swp --prio_tc=0,1,2,3,4,5,6,7 > /dev/null + dcb ets set dev $swp prio-tc 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 check_prio_pg "0 1 2 3 4 5 6 7 " tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M check_prio_pg "0 0 0 0 0 0 0 0 " - __mlnx_qos -i $swp --prio2buffer=1,3,5,7,0,2,4,6 > /dev/null + dcb buffer set dev $swp prio-buffer 0:1 1:3 2:5 3:7 4:0 5:2 6:4 7:6 check_prio_pg "1 3 5 7 0 2 4 6 " tc qdisc delete dev $swp root @@ -256,9 +257,9 @@ test_tc_priomap() # Clean up. tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M - __mlnx_qos -i $swp --prio2buffer=0,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp prio-buffer all:0 tc qdisc delete dev $swp root - __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null + dcb ets set dev $swp prio-tc all:0 log_test "TC: priomap" } @@ -270,12 +271,12 @@ test_tc_sizes() RET=0 - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 &> /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size 2>/dev/null check_fail $? "buffer_size should fail before qdisc is added" tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size check_err $? "buffer_size should pass after qdisc is added" check_buf_size 0 "== $size" "set size: " @@ -283,26 +284,26 @@ test_tc_sizes() check_buf_size 0 "== $size" "set MTU: " mtu_restore $swp - __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 # After replacing the qdisc for the same kind, buffer_size still has to # work. tc qdisc replace dev $swp root handle 1: bfifo limit 1M - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size check_buf_size 0 "== $size" "post replace, set size: " - __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 # Likewise after replacing for a different kind. tc qdisc replace dev $swp root handle 2: prio bands 8 - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size check_buf_size 0 "== $size" "post replace different kind, set size: " tc qdisc delete dev $swp root - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 &> /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size 2>/dev/null check_fail $? "buffer_size should fail after qdisc is deleted" log_test "TC: buffer size" @@ -363,10 +364,10 @@ test_tc_int_buf() tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M test_int_buf "TC: " - __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 0:$size test_int_buf "TC+buffsize: " - __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null + dcb buffer set dev $swp buffer-size all:0 tc qdisc delete dev $swp root } diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh index 0bf76f13c030..faa51012cdac 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh @@ -82,17 +82,3 @@ bail_on_lldpad() fi fi } - -__mlnx_qos() -{ - local err - - mlnx_qos "$@" 2>/dev/null - err=$? - - if ((err)); then - echo "Error ($err) in mlnx_qos $@" >/dev/stderr - fi - - return $err -} diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 5c7700212f75..5d5622fc2758 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -171,7 +171,7 @@ switch_create() # assignment. tc qdisc replace dev $swp1 root handle 1: \ ets bands 8 strict 8 priomap 7 6 - __mlnx_qos -i $swp1 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp1 prio-buffer all:0 1:1 # $swp2 # ----- @@ -209,8 +209,8 @@ switch_create() # the lossless prio into a buffer of its own. Don't bother with buffer # sizes though, there is not going to be any pressure in the "backward" # direction. - __mlnx_qos -i $swp3 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null - __mlnx_qos -i $swp3 --pfc=0,1,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp3 prio-buffer all:0 1:1 + dcb pfc set dev $swp3 prio-pfc all:off 1:on # $swp4 # ----- @@ -226,11 +226,11 @@ switch_create() # Configure qdisc so that we can hand-tune headroom. tc qdisc replace dev $swp4 root handle 1: \ ets bands 8 strict 8 priomap 7 6 - __mlnx_qos -i $swp4 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null - __mlnx_qos -i $swp4 --pfc=0,1,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp4 prio-buffer all:0 1:1 + dcb pfc set dev $swp4 prio-pfc all:off 1:on # PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which # is (-2*MTU) about 80K of delay provision. - __mlnx_qos -i $swp4 --buffer_size=0,$_100KB,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp4 buffer-size all:0 1:$_100KB # bridges # ------- @@ -273,9 +273,9 @@ switch_destroy() # $swp4 # ----- - __mlnx_qos -i $swp4 --buffer_size=0,0,0,0,0,0,0,0 >/dev/null - __mlnx_qos -i $swp4 --pfc=0,0,0,0,0,0,0,0 >/dev/null - __mlnx_qos -i $swp4 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp4 buffer-size all:0 + dcb pfc set dev $swp4 prio-pfc all:off + dcb buffer set dev $swp4 prio-buffer all:0 tc qdisc del dev $swp4 root devlink_tc_bind_pool_th_restore $swp4 1 ingress @@ -288,8 +288,8 @@ switch_destroy() # $swp3 # ----- - __mlnx_qos -i $swp3 --pfc=0,0,0,0,0,0,0,0 >/dev/null - __mlnx_qos -i $swp3 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null + dcb pfc set dev $swp3 prio-pfc all:off + dcb buffer set dev $swp3 prio-buffer all:0 tc qdisc del dev $swp3 root devlink_tc_bind_pool_th_restore $swp3 1 egress @@ -315,7 +315,7 @@ switch_destroy() # $swp1 # ----- - __mlnx_qos -i $swp1 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null + dcb buffer set dev $swp1 prio-buffer all:0 tc qdisc del dev $swp1 root devlink_tc_bind_pool_th_restore $swp1 1 ingress diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh index e93878d42596..683759d29199 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh @@ -68,7 +68,7 @@ wait_for_routes() local t0=$1; shift local route_count=$1; shift - local t1=$(ip route | grep -o 'offload' | wc -l) + local t1=$(ip route | grep 'offload' | grep -v 'offload_failed' | wc -l) local delta=$((t1 - t0)) echo $delta [[ $delta -ge $route_count ]] diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh index 093bed088ad0..373d5f2a846e 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -234,15 +234,15 @@ __tc_sample_rate_test() psample_capture_start - ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + ip vrf exec v$h1 $MZ $h1 -c 320000 -d 100usec -p 64 -A 192.0.2.1 \ -B $dip -t udp dp=52768,sp=42768 -q psample_capture_stop pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) - pct=$((100 * (pkts - 100) / 100)) + pct=$((100 * (pkts - 10000) / 10000)) (( -25 <= pct && pct <= 25)) - check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + check_err $? "Expected 10000 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" log_test "tc sample rate ($desc)" @@ -587,15 +587,15 @@ __tc_sample_acl_rate_test() psample_capture_start - ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + ip vrf exec v$h1 $MZ $h1 -c 320000 -d 100usec -p 64 -A 192.0.2.1 \ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q psample_capture_stop pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) - pct=$((100 * (pkts - 100) / 100)) + pct=$((100 * (pkts - 10000) / 10000)) (( -25 <= pct && pct <= 25)) - check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + check_err $? "Expected 10000 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" # Setup a filter that should not match any packet and make sure packets # are not sampled. diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 40909c254365..9de1d123f4f5 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -5,12 +5,13 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS="fw_flash_test params_test regions_test reload_test \ netns_reload_test resource_test dev_info_test \ - empty_reporter_test dummy_reporter_test" + empty_reporter_test dummy_reporter_test rate_test" NUM_NETIFS=0 source $lib_dir/lib.sh BUS_ADDR=10 PORT_COUNT=4 +VF_COUNT=4 DEV_NAME=netdevsim$BUS_ADDR SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV_NAME/net/ DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV_NAME/ @@ -507,6 +508,170 @@ dummy_reporter_test() log_test "dummy reporter test" } +rate_leafs_get() +{ + local handle=$1 + + cmd_jq "devlink port function rate show -j" \ + '.[] | to_entries | .[] | select(.value.type == "leaf") | .key | select(contains("'$handle'"))' +} + +rate_nodes_get() +{ + local handle=$1 + + cmd_jq "devlink port function rate show -j" \ + '.[] | to_entries | .[] | select(.value.type == "node") | .key | select(contains("'$handle'"))' +} + +rate_attr_set() +{ + local handle=$1 + local name=$2 + local value=$3 + local units=$4 + + devlink port function rate set $handle $name $value$units +} + +rate_attr_get() +{ + local handle=$1 + local name=$2 + + cmd_jq "devlink port function rate show $handle -j" '.[][].'$name +} + +rate_attr_tx_rate_check() +{ + local handle=$1 + local name=$2 + local rate=$3 + local debug_file=$4 + + rate_attr_set $handle $name $rate mbit + check_err $? "Failed to set $name value" + + local debug_value=$(cat $debug_file) + check_err $? "Failed to read $name value from debugfs" + [ "$debug_value" == "$rate" ] + check_err $? "Unexpected $name debug value $debug_value != $rate" + + local api_value=$(( $(rate_attr_get $handle $name) * 8 / 1000000 )) + check_err $? "Failed to get $name attr value" + [ "$api_value" == "$rate" ] + check_err $? "Unexpected $name attr value $api_value != $rate" +} + +rate_attr_parent_check() +{ + local handle=$1 + local parent=$2 + local debug_file=$3 + + rate_attr_set $handle parent $parent + check_err $? "Failed to set parent" + + debug_value=$(cat $debug_file) + check_err $? "Failed to get parent debugfs value" + [ "$debug_value" == "$parent" ] + check_err $? "Unexpected parent debug value $debug_value != $parent" + + api_value=$(rate_attr_get $r_obj parent) + check_err $? "Failed to get parent attr value" + [ "$api_value" == "$parent" ] + check_err $? "Unexpected parent attr value $api_value != $parent" +} + +rate_node_add() +{ + local handle=$1 + + devlink port function rate add $handle +} + +rate_node_del() +{ + local handle=$1 + + devlink port function rate del $handle +} + +rate_test() +{ + RET=0 + + echo $VF_COUNT > /sys/bus/netdevsim/devices/$DEV_NAME/sriov_numvfs + devlink dev eswitch set $DL_HANDLE mode switchdev + local leafs=`rate_leafs_get $DL_HANDLE` + local num_leafs=`echo $leafs | wc -w` + [ "$num_leafs" == "$VF_COUNT" ] + check_err $? "Expected $VF_COUNT rate leafs but got $num_leafs" + + rate=10 + for r_obj in $leafs + do + rate_attr_tx_rate_check $r_obj tx_share $rate \ + $DEBUGFS_DIR/ports/${r_obj##*/}/tx_share + rate=$(($rate+10)) + done + + rate=100 + for r_obj in $leafs + do + rate_attr_tx_rate_check $r_obj tx_max $rate \ + $DEBUGFS_DIR/ports/${r_obj##*/}/tx_max + rate=$(($rate+100)) + done + + local node1_name='group1' + local node1="$DL_HANDLE/$node1_name" + rate_node_add "$node1" + check_err $? "Failed to add node $node1" + + local num_nodes=`rate_nodes_get $DL_HANDLE | wc -w` + [ $num_nodes == 1 ] + check_err $? "Expected 1 rate node in output but got $num_nodes" + + local node_tx_share=10 + rate_attr_tx_rate_check $node1 tx_share $node_tx_share \ + $DEBUGFS_DIR/rate_nodes/${node1##*/}/tx_share + + local node_tx_max=100 + rate_attr_tx_rate_check $node1 tx_max $node_tx_max \ + $DEBUGFS_DIR/rate_nodes/${node1##*/}/tx_max + + rate_node_del "$node1" + check_err $? "Failed to delete node $node1" + local num_nodes=`rate_nodes_get $DL_HANDLE | wc -w` + [ $num_nodes == 0 ] + check_err $? "Expected 0 rate node but got $num_nodes" + + local node1_name='group1' + local node1="$DL_HANDLE/$node1_name" + rate_node_add "$node1" + check_err $? "Failed to add node $node1" + + rate_attr_parent_check $r_obj $node1_name \ + $DEBUGFS_DIR/ports/${r_obj##*/}/rate_parent + + local node2_name='group2' + local node2="$DL_HANDLE/$node2_name" + rate_node_add "$node2" + check_err $? "Failed to add node $node2" + + rate_attr_parent_check $node2 $node1_name \ + $DEBUGFS_DIR/rate_nodes/$node2_name/rate_parent + rate_node_del "$node2" + check_err $? "Failed to delete node $node2" + rate_attr_set "$r_obj" noparent + check_err $? "Failed to unset $r_obj parent node" + rate_node_del "$node1" + check_err $? "Failed to delete node $node1" + + log_test "rate test" +} + setup_prepare() { modprobe netdevsim diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index da49ad2761b5..109900c817be 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -24,13 +24,15 @@ ALL_TESTS=" NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} -DEVLINK_DEV=netdevsim/${DEV} DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV/ SLEEP_TIME=1 NETDEV="" NUM_NETIFS=0 source $lib_dir/lib.sh + +DEVLINK_DEV= source $lib_dir/devlink_lib.sh +DEVLINK_DEV=netdevsim/${DEV} require_command udevadm @@ -163,6 +165,16 @@ trap_stats_test() devlink_trap_action_set $trap_name "drop" devlink_trap_stats_idle_test $trap_name check_err $? "Stats of trap $trap_name not idle when action is drop" + + echo "y"> $DEBUGFS_DIR/fail_trap_drop_counter_get + devlink -s trap show $DEVLINK_DEV trap $trap_name &> /dev/null + check_fail $? "Managed to read trap (hard dropped) statistics when should not" + echo "n"> $DEBUGFS_DIR/fail_trap_drop_counter_get + devlink -s trap show $DEVLINK_DEV trap $trap_name &> /dev/null + check_err $? "Did not manage to read trap (hard dropped) statistics when should" + + devlink_trap_drop_stats_idle_test $trap_name + check_fail $? "Drop stats of trap $trap_name idle when should not" else devlink_trap_stats_idle_test $trap_name check_fail $? "Stats of non-drop trap $trap_name idle when should not" diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib.sh b/tools/testing/selftests/drivers/net/netdevsim/fib.sh index 251f228ce63e..fc794cd30389 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/fib.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/fib.sh @@ -33,13 +33,15 @@ ALL_TESTS=" NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} -DEVLINK_DEV=netdevsim/${DEV} SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ NUM_NETIFS=0 source $lib_dir/lib.sh -source $lib_dir/devlink_lib.sh source $lib_dir/fib_offload_lib.sh +DEVLINK_DEV= +source $lib_dir/devlink_lib.sh +DEVLINK_DEV=netdevsim/${DEV} + ipv4_identical_routes() { fib_ipv4_identical_routes_test "testns1" diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh index ba75c81cda91..e8e0dc088d6a 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -44,12 +44,14 @@ ALL_TESTS=" NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} -DEVLINK_DEV=netdevsim/${DEV} SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ DEBUGFS_NET_DIR=/sys/kernel/debug/netdevsim/$DEV/ NUM_NETIFS=0 source $lib_dir/lib.sh + +DEVLINK_DEV= source $lib_dir/devlink_lib.sh +DEVLINK_DEV=netdevsim/${DEV} nexthop_check() { diff --git a/tools/testing/selftests/drivers/net/netdevsim/psample.sh b/tools/testing/selftests/drivers/net/netdevsim/psample.sh index ee10b1a8933c..e689ff7a0b12 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/psample.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/psample.sh @@ -14,13 +14,15 @@ ALL_TESTS=" NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} -DEVLINK_DEV=netdevsim/${DEV} SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ PSAMPLE_DIR=/sys/kernel/debug/netdevsim/$DEV/psample/ CAPTURE_FILE=$(mktemp) NUM_NETIFS=0 source $lib_dir/lib.sh + +DEVLINK_DEV= source $lib_dir/devlink_lib.sh +DEVLINK_DEV=netdevsim/${DEV} # Available at https://github.com/Mellanox/libpsample require_command psample diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh index beee0d5646a6..f7d84549cc3e 100755 --- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh +++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh @@ -1,6 +1,6 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Copyright 2020 NXP Semiconductors +# Copyright 2020 NXP WAIT_TIME=1 NUM_NETIFS=4 diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 477cbb042f5b..0315955ff0f4 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -62,6 +62,9 @@ static int __do_binderfs_test(struct __test_metadata *_metadata) struct binder_version version = { 0 }; char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX", device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME]; + static const char * const binder_features[] = { + "oneway_spam_detection", + }; change_mountns(_metadata); @@ -150,6 +153,20 @@ static int __do_binderfs_test(struct __test_metadata *_metadata) } /* success: binder-control device removal failed as expected */ + + for (int i = 0; i < ARRAY_SIZE(binder_features); i++) { + snprintf(device_path, sizeof(device_path), "%s/features/%s", + binderfs_mntpt, binder_features[i]); + fd = open(device_path, O_CLOEXEC | O_RDONLY); + EXPECT_GE(fd, 0) { + TH_LOG("%s - Failed to open binder feature: %s", + strerror(errno), binder_features[i]); + goto umount; + } + close(fd); + } + + /* success: binder feature files found */ result = 0; umount: diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c index 0e393cb5f42d..4c6f0cd83c5b 100644 --- a/tools/testing/selftests/firmware/fw_namespace.c +++ b/tools/testing/selftests/firmware/fw_namespace.c @@ -129,7 +129,8 @@ int main(int argc, char **argv) die("mounting tmpfs to /lib/firmware failed\n"); sys_path = argv[1]; - asprintf(&fw_path, "/lib/firmware/%s", fw_name); + if (asprintf(&fw_path, "/lib/firmware/%s", fw_name) < 0) + die("error: failed to build full fw_path\n"); setup_fw(fw_path); diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc new file mode 100644 index 000000000000..60c02b482be8 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc @@ -0,0 +1,90 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - add/remove eprobe events +# requires: dynamic_events events/syscalls/sys_enter_openat "e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]":README + +echo 0 > events/enable + +clear_dynamic_events + +SYSTEM="syscalls" +EVENT="sys_enter_openat" +FIELD="filename" +EPROBE="eprobe_open" +OPTIONS="file=+0(\$filename):ustring" +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events + +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE + +echo 1 > events/eprobes/$EPROBE/enable +ls +echo 0 > events/eprobes/$EPROBE/enable + +content=`grep '^ *ls-' trace | grep 'file='` +nocontent=`grep '^ *ls-' trace | grep 'file=' | grep -v -e '"/' -e '"."' -e '(fault)' ` || true + +if [ -z "$content" ]; then + exit_fail +fi + +if [ ! -z "$nocontent" ]; then + exit_fail +fi + +echo "-:$EPROBE" >> dynamic_events + +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# test various ways to remove the probe (already tested with just event name) + +# With group name +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With just event name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With just event name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# Finally make sure what is in the dynamic_events file clears it too +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +LINE=`sed -e '/$EPROBE/s/^e/-/' < dynamic_events` +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc new file mode 100644 index 000000000000..db522577ff78 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc @@ -0,0 +1,38 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - check if duplicate events are caught +# requires: dynamic_events "e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]":README + +echo 0 > events/enable + +HAVE_KPROBES=0 + +if [ -f kprobe_events ]; then + HAVE_KPROBES=1 +fi + +clear_dynamic_events + +# first create dynamic events for eprobes and kprobes. + +echo 'e:egroup/eevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +# Test eprobe for same eprobe, existing kprobe and existing event +! echo 'e:egroup/eevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events +! echo 'e:syscalls/sys_enter_open syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +if [ $HAVE_KPROBES -eq 1 ]; then + echo 'p:kgroup/kevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'e:kgroup/kevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +# Test kprobe for same kprobe, existing eprobe and existing event + ! echo 'p:kgroup/kevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'p:egroup/eevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'p:syscalls/sys_enter_open vfs_open file=+0($arg2)' >> dynamic_events + + echo '-:kgroup/kevent' >> dynamic_events +fi + +echo '-:egroup/eevent' >> dynamic_events + +clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc index e6eb78f0b954..9933ed24f901 100644 --- a/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc @@ -57,6 +57,10 @@ enable_events() { echo 1 > tracing_on } +other_task() { + sleep .001 || usleep 1 || sleep 1 +} + echo 0 > options/event-fork do_reset @@ -94,6 +98,9 @@ child=$! echo "child = $child" wait $child +# Be sure some other events will happen for small systems (e.g. 1 core) +other_task + echo 0 > tracing_on cnt=`count_pid $mypid` diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index a6fac927ee82..000fd05e84b1 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -83,6 +83,27 @@ clear_synthetic_events() { # reset all current synthetic events done } +clear_dynamic_events() { # reset all current dynamic events + again=1 + stop=1 + # loop mulitple times as some events require other to be removed first + while [ $again -eq 1 ]; do + stop=$((stop+1)) + # Prevent infinite loops + if [ $stop -gt 10 ]; then + break; + fi + again=2 + grep -v '^#' dynamic_events| + while read line; do + del=`echo $line | sed -e 's/^.\([^ ]*\).*/-\1/'` + if ! echo "$del" >> dynamic_events; then + again=1 + fi + done + done +} + initialize_ftrace() { # Reset ftrace to initial-state # As the initial state, ftrace will be set to nop tracer, # no events, no triggers, no filters, no function filters, @@ -93,6 +114,7 @@ initialize_ftrace() { # Reset ftrace to initial-state reset_events_filter reset_ftrace_filter disable_events + clear_dynamic_events [ -f set_event_pid ] && echo > set_event_pid [ -f set_ftrace_pid ] && echo > set_ftrace_pid [ -f set_ftrace_notrace ] && echo > set_ftrace_notrace @@ -115,7 +137,7 @@ check_requires() { # Check required files and tracers echo "Required tracer $t is not configured." exit_unsupported fi - elif [ $r != $i ]; then + elif [ "$r" != "$i" ]; then if ! grep -Fq "$r" README ; then echo "Required feature pattern \"$r\" is not in README." exit_unsupported diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc new file mode 100644 index 000000000000..914fe2e5d030 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc @@ -0,0 +1,53 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test inter-event histogram trigger eprobe on synthetic event +# requires: dynamic_events synthetic_events events/syscalls/sys_enter_openat/hist "e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]":README + +echo 0 > events/enable + +clear_dynamic_events + +SYSTEM="syscalls" +START="sys_enter_openat" +END="sys_exit_openat" +FIELD="filename" +SYNTH="synth_open" +EPROBE="eprobe_open" + +echo "$SYNTH u64 filename; s64 ret;" > synthetic_events +echo "hist:keys=common_pid:__arg__1=$FIELD" > events/$SYSTEM/$START/trigger +echo "hist:keys=common_pid:filename=\$__arg__1,ret=ret:onmatch($SYSTEM.$START).trace($SYNTH,\$filename,\$ret)" > events/$SYSTEM/$END/trigger + +echo "e:$EPROBE synthetic/$SYNTH file=+0(\$filename):ustring ret=\$ret:s64" >> dynamic_events + +grep -q "$SYNTH" dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/synthetic/$SYNTH +test -d events/eprobes/$EPROBE + +echo 1 > events/eprobes/$EPROBE/enable +ls +echo 0 > events/eprobes/$EPROBE/enable + +content=`grep '^ *ls-' trace | grep 'file='` +nocontent=`grep '^ *ls-' trace | grep 'file=' | grep -v -e '"/' -e '"."'` || true + +if [ -z "$content" ]; then + exit_fail +fi + +if [ ! -z "$nocontent" ]; then + exit_fail +fi + +echo "-:$EPROBE" >> dynamic_events +echo '!'"hist:keys=common_pid:filename=\$__arg__1,ret=ret:onmatch($SYSTEM.$START).trace($SYNTH,\$filename,\$ret)" > events/$SYSTEM/$END/trigger +echo '!'"hist:keys=common_pid:__arg__1=$FIELD" > events/$SYSTEM/$START/trigger +echo '!'"$SYNTH u64 filename; s64 ret;" >> synthetic_events + +! grep -q "$SYNTH" dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/synthetic/$SYNTH +! test -d events/eprobes/$EPROBE + +clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc index 2950bfbc6fce..adae72665500 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc @@ -39,6 +39,24 @@ grep "parent_comm: $COMM" events/sched/sched_process_fork/hist > /dev/null || \ reset_trigger +echo "Test histogram with sym modifier" + +echo 'hist:keys=call_site.sym' > events/kmem/kmalloc/trigger +for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done +grep '{ call_site: \[[0-9a-f][0-9a-f]*\] [_a-zA-Z][_a-zA-Z]* *}' events/kmem/kmalloc/hist > /dev/null || \ + fail "sym modifier on kmalloc call_site did not work" + +reset_trigger + +echo "Test histogram with sym-offset modifier" + +echo 'hist:keys=call_site.sym-offset' > events/kmem/kmalloc/trigger +for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done +grep '{ call_site: \[[0-9a-f][0-9a-f]*\] [_a-zA-Z][_a-zA-Z]*+0x[0-9a-f][0-9a-f]*' events/kmem/kmalloc/hist > /dev/null || \ + fail "sym-offset modifier on kmalloc call_site did not work" + +reset_trigger + echo "Test histogram with sort key" echo 'hist:keys=parent_pid,child_pid:sort=child_pid.ascending' > events/sched/sched_process_fork/trigger diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0efcd494daab..0e78b49d0f2f 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -6,3 +6,5 @@ futex_wait_private_mapped_file futex_wait_timeout futex_wait_uninitialized_heap futex_wait_wouldblock +futex_wait +futex_requeue diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 23207829ec75..bd1fec59e010 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -INCLUDES := -I../include -I../../ +INCLUDES := -I../include -I../../ -I../../../../../usr/include/ \ + -I$(KBUILD_OUTPUT)/kselftest/usr/include CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) LDLIBS := -lpthread -lrt @@ -14,7 +15,9 @@ TEST_GEN_FILES := \ futex_requeue_pi_signal_restart \ futex_requeue_pi_mismatched_ops \ futex_wait_uninitialized_heap \ - futex_wait_private_mapped_file + futex_wait_private_mapped_file \ + futex_wait \ + futex_requeue TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c new file mode 100644 index 000000000000..51485be6eb2f --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright Collabora Ltd., 2021 + * + * futex cmp requeue test by André Almeida <andrealmeid@collabora.com> + */ + +#include <pthread.h> +#include <limits.h> +#include "logging.h" +#include "futextest.h" + +#define TEST_NAME "futex-requeue" +#define timeout_ns 30000000 +#define WAKE_WAIT_US 10000 + +volatile futex_t *f1; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void *waiterfn(void *arg) +{ + struct timespec to; + + to.tv_sec = 0; + to.tv_nsec = timeout_ns; + + if (futex_wait(f1, *f1, &to, 0)) + printf("waiter failed errno %d\n", errno); + + return NULL; +} + +int main(int argc, char *argv[]) +{ + pthread_t waiter[10]; + int res, ret = RET_PASS; + int c, i; + volatile futex_t _f1 = 0; + volatile futex_t f2 = 0; + + f1 = &_f1; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + ksft_print_header(); + ksft_set_plan(2); + ksft_print_msg("%s: Test futex_requeue\n", + basename(argv[0])); + + /* + * Requeue a waiter from f1 to f2, and wake f2. + */ + if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + + info("Requeuing 1 futex from f1 to f2\n"); + res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); + if (res != 1) { + ksft_test_result_fail("futex_requeue simple returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } + + + info("Waking 1 futex at f2\n"); + res = futex_wake(&f2, 1, 0); + if (res != 1) { + ksft_test_result_fail("futex_requeue simple returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_requeue simple succeeds\n"); + } + + + /* + * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. + * At futex_wake, wake INT_MAX (should be exactly 7). + */ + for (i = 0; i < 10; i++) { + if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + } + + usleep(WAKE_WAIT_US); + + info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); + res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); + if (res != 10) { + ksft_test_result_fail("futex_requeue many returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } + + info("Waking INT_MAX futexes at f2\n"); + res = futex_wake(&f2, INT_MAX, 0); + if (res != 7) { + ksft_test_result_fail("futex_requeue many returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_requeue many succeeds\n"); + } + + ksft_print_cnts(); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c new file mode 100644 index 000000000000..685140d9b93d --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright Collabora Ltd., 2021 + * + * futex cmp requeue test by André Almeida <andrealmeid@collabora.com> + */ + +#include <pthread.h> +#include <sys/shm.h> +#include <sys/mman.h> +#include <fcntl.h> +#include "logging.h" +#include "futextest.h" + +#define TEST_NAME "futex-wait" +#define timeout_ns 30000000 +#define WAKE_WAIT_US 10000 +#define SHM_PATH "futex_shm_file" + +void *futex; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +static void *waiterfn(void *arg) +{ + struct timespec to; + unsigned int flags = 0; + + if (arg) + flags = *((unsigned int *) arg); + + to.tv_sec = 0; + to.tv_nsec = timeout_ns; + + if (futex_wait(futex, 0, &to, flags)) + printf("waiter failed errno %d\n", errno); + + return NULL; +} + +int main(int argc, char *argv[]) +{ + int res, ret = RET_PASS, fd, c, shm_id; + u_int32_t f_private = 0, *shared_data; + unsigned int flags = FUTEX_PRIVATE_FLAG; + pthread_t waiter; + void *shm; + + futex = &f_private; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + ksft_print_header(); + ksft_set_plan(3); + ksft_print_msg("%s: Test futex_wait\n", basename(argv[0])); + + /* Testing a private futex */ + info("Calling private futex_wait on futex: %p\n", futex); + if (pthread_create(&waiter, NULL, waiterfn, (void *) &flags)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + + info("Calling private futex_wake on futex: %p\n", futex); + res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG); + if (res != 1) { + ksft_test_result_fail("futex_wake private returned: %d %s\n", + errno, strerror(errno)); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wake private succeeds\n"); + } + + /* Testing an anon page shared memory */ + shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); + if (shm_id < 0) { + perror("shmget"); + exit(1); + } + + shared_data = shmat(shm_id, NULL, 0); + + *shared_data = 0; + futex = shared_data; + + info("Calling shared (page anon) futex_wait on futex: %p\n", futex); + if (pthread_create(&waiter, NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + + info("Calling shared (page anon) futex_wake on futex: %p\n", futex); + res = futex_wake(futex, 1, 0); + if (res != 1) { + ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n", + errno, strerror(errno)); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wake shared (page anon) succeeds\n"); + } + + + /* Testing a file backed shared memory */ + fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (fd < 0) { + perror("open"); + exit(1); + } + + if (ftruncate(fd, sizeof(f_private))) { + perror("ftruncate"); + exit(1); + } + + shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (shm == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + memcpy(shm, &f_private, sizeof(f_private)); + + futex = shm; + + info("Calling shared (file backed) futex_wait on futex: %p\n", futex); + if (pthread_create(&waiter, NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + + info("Calling shared (file backed) futex_wake on futex: %p\n", futex); + res = futex_wake(shm, 1, 0); + if (res != 1) { + ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n", + errno, strerror(errno)); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wake shared (file backed) succeeds\n"); + } + + /* Freeing resources */ + shmdt(shared_data); + munmap(shm, sizeof(f_private)); + remove(SHM_PATH); + close(fd); + + ksft_print_cnts(); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index ee55e6d389a3..1f8f6daaf1e7 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -11,21 +11,18 @@ * * HISTORY * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> + * 2021-Apr-26: More test cases by André Almeida <andrealmeid@collabora.com> * *****************************************************************************/ -#include <errno.h> -#include <getopt.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> +#include <pthread.h> #include "futextest.h" #include "logging.h" #define TEST_NAME "futex-wait-timeout" static long timeout_ns = 100000; /* 100us default timeout */ +static futex_t futex_pi; void usage(char *prog) { @@ -37,11 +34,67 @@ void usage(char *prog) VQUIET, VCRITICAL, VINFO); } +/* + * Get a PI lock and hold it forever, so the main thread lock_pi will block + * and we can test the timeout + */ +void *get_pi_lock(void *arg) +{ + int ret; + volatile futex_t lock = 0; + + ret = futex_lock_pi(&futex_pi, NULL, 0, 0); + if (ret != 0) + error("futex_lock_pi failed\n", ret); + + /* Blocks forever */ + ret = futex_wait(&lock, 0, NULL, 0); + error("futex_wait failed\n", ret); + + return NULL; +} + +/* + * Check if the function returned the expected error + */ +static void test_timeout(int res, int *ret, char *test_name, int err) +{ + if (!res || errno != err) { + ksft_test_result_fail("%s returned %d\n", test_name, + res < 0 ? errno : res); + *ret = RET_FAIL; + } else { + ksft_test_result_pass("%s succeeds\n", test_name); + } +} + +/* + * Calculate absolute timeout and correct overflow + */ +static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, + long timeout_ns) +{ + if (clock_gettime(clockid, to)) { + error("clock_gettime failed\n", errno); + return errno; + } + + to->tv_nsec += timeout_ns; + + if (to->tv_nsec >= 1000000000) { + to->tv_sec++; + to->tv_nsec -= 1000000000; + } + + return 0; +} + int main(int argc, char *argv[]) { futex_t f1 = FUTEX_INITIALIZER; - struct timespec to; int res, ret = RET_PASS; + struct timespec to; + pthread_t thread; int c; while ((c = getopt(argc, argv, "cht:v:")) != -1) { @@ -65,22 +118,63 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(7); ksft_print_msg("%s: Block on a futex and wait for timeout\n", basename(argv[0])); ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - /* initialize timeout */ + pthread_create(&thread, NULL, get_pi_lock, NULL); + + /* initialize relative timeout */ to.tv_sec = 0; to.tv_nsec = timeout_ns; - info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); - res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); - if (!res || errno != ETIMEDOUT) { - fail("futex_wait returned %d\n", ret < 0 ? errno : ret); - ret = RET_FAIL; - } + res = futex_wait(&f1, f1, &to, 0); + test_timeout(res, &ret, "futex_wait relative", ETIMEDOUT); + + /* FUTEX_WAIT_BITSET with CLOCK_REALTIME */ + if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) + return RET_FAIL; + res = futex_wait_bitset(&f1, f1, &to, 1, FUTEX_CLOCK_REALTIME); + test_timeout(res, &ret, "futex_wait_bitset realtime", ETIMEDOUT); + + /* FUTEX_WAIT_BITSET with CLOCK_MONOTONIC */ + if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) + return RET_FAIL; + res = futex_wait_bitset(&f1, f1, &to, 1, 0); + test_timeout(res, &ret, "futex_wait_bitset monotonic", ETIMEDOUT); + + /* FUTEX_WAIT_REQUEUE_PI with CLOCK_REALTIME */ + if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) + return RET_FAIL; + res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, FUTEX_CLOCK_REALTIME); + test_timeout(res, &ret, "futex_wait_requeue_pi realtime", ETIMEDOUT); + + /* FUTEX_WAIT_REQUEUE_PI with CLOCK_MONOTONIC */ + if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) + return RET_FAIL; + res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0); + test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + + /* + * FUTEX_LOCK_PI with CLOCK_REALTIME + * Due to historical reasons, FUTEX_LOCK_PI supports only realtime + * clock, but requires the caller to not set CLOCK_REALTIME flag. + * + * If you call FUTEX_LOCK_PI with a monotonic clock, it'll be + * interpreted as a realtime clock, and (unless you mess your machine's + * time or your time machine) the monotonic clock value is always + * smaller than realtime and the syscall will timeout immediately. + */ + if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) + return RET_FAIL; + res = futex_lock_pi(&futex_pi, &to, 0, 0); + test_timeout(res, &ret, "futex_lock_pi realtime", ETIMEDOUT); + + /* Test operations that don't support FUTEX_CLOCK_REALTIME */ + res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); + test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); - print_result(TEST_NAME, ret); + ksft_print_cnts(); return ret; } diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 1acb6ace1680..11a9d62290f5 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -73,3 +73,9 @@ echo echo ./futex_wait_uninitialized_heap $COLOR ./futex_wait_private_mapped_file $COLOR + +echo +./futex_wait $COLOR + +echo +./futex_requeue $COLOR diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 524c857a049c..b8dbabe24ac2 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/debug-exceptions /aarch64/get-reg-list -/aarch64/get-reg-list-sve +/aarch64/psci_cpu_on_test /aarch64/vgic_init /s390x/memop /s390x/resets @@ -8,18 +9,22 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/emulator_error_test /x86_64/get_cpuid_test /x86_64/get_msr_index_features /x86_64/kvm_pv_test /x86_64/hyperv_clock /x86_64/hyperv_cpuid +/x86_64/hyperv_features /x86_64/mmio_warning_test +/x86_64/mmu_role_test /x86_64/platform_info_test /x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test /x86_64/svm_vmcall_test +/x86_64/svm_int_ctl_test /x86_64/sync_regs_test /x86_64/tsc_msrs_test /x86_64/userspace_msr_exit_test @@ -29,11 +34,13 @@ /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test +/x86_64/vmx_nested_tsc_scaling_test /x86_64/xapic_ipi_test /x86_64/xen_shinfo_test /x86_64/xen_vmcall_test /x86_64/xss_msr_test /x86_64/vmx_pmu_msrs_test +/access_tracking_perf_test /demand_paging_test /dirty_log_test /dirty_log_perf_test @@ -42,5 +49,7 @@ /kvm_page_table_test /memslot_modification_stress_test /memslot_perf_test +/rseq_test /set_memory_region_test /steal_time +/kvm_binary_stats_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index daaee1888b12..d1774f461393 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -34,18 +34,21 @@ ifeq ($(ARCH),s390) endif LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S -LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c +LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S +LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test +TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test @@ -53,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test @@ -60,6 +64,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs @@ -67,6 +72,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test @@ -75,19 +81,24 @@ TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += memslot_perf_test +TEST_GEN_PROGS_x86_64 += rseq_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test +TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list -TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve +TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += kvm_page_table_test +TEST_GEN_PROGS_aarch64 += rseq_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets @@ -96,7 +107,9 @@ TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus TEST_GEN_PROGS_s390x += kvm_page_table_test +TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test +TEST_GEN_PROGS_s390x += kvm_binary_stats_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c new file mode 100644 index 000000000000..e5e6c92b60da --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +#define MDSCR_KDE (1 << 13) +#define MDSCR_MDE (1 << 15) +#define MDSCR_SS (1 << 0) + +#define DBGBCR_LEN8 (0xff << 5) +#define DBGBCR_EXEC (0x0 << 3) +#define DBGBCR_EL1 (0x1 << 1) +#define DBGBCR_E (0x1 << 0) + +#define DBGWCR_LEN8 (0xff << 5) +#define DBGWCR_RD (0x1 << 3) +#define DBGWCR_WR (0x2 << 3) +#define DBGWCR_EL1 (0x1 << 1) +#define DBGWCR_E (0x1 << 0) + +#define SPSR_D (1 << 9) +#define SPSR_SS (1 << 21) + +extern unsigned char sw_bp, hw_bp, bp_svc, bp_brk, hw_wp, ss_start; +static volatile uint64_t sw_bp_addr, hw_bp_addr; +static volatile uint64_t wp_addr, wp_data_addr; +static volatile uint64_t svc_addr; +static volatile uint64_t ss_addr[4], ss_idx; +#define PC(v) ((uint64_t)&(v)) + +static void reset_debug_state(void) +{ + asm volatile("msr daifset, #8"); + + write_sysreg(osdlr_el1, 0); + write_sysreg(oslar_el1, 0); + isb(); + + write_sysreg(mdscr_el1, 0); + /* This test only uses the first bp and wp slot. */ + write_sysreg(dbgbvr0_el1, 0); + write_sysreg(dbgbcr0_el1, 0); + write_sysreg(dbgwcr0_el1, 0); + write_sysreg(dbgwvr0_el1, 0); + isb(); +} + +static void install_wp(uint64_t addr) +{ + uint32_t wcr; + uint32_t mdscr; + + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; + write_sysreg(dbgwcr0_el1, wcr); + write_sysreg(dbgwvr0_el1, addr); + isb(); + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static void install_hw_bp(uint64_t addr) +{ + uint32_t bcr; + uint32_t mdscr; + + bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; + write_sysreg(dbgbcr0_el1, bcr); + write_sysreg(dbgbvr0_el1, addr); + isb(); + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static void install_ss(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_SS; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static volatile char write_data; + +static void guest_code(void) +{ + GUEST_SYNC(0); + + /* Software-breakpoint */ + asm volatile("sw_bp: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); + + GUEST_SYNC(1); + + /* Hardware-breakpoint */ + reset_debug_state(); + install_hw_bp(PC(hw_bp)); + asm volatile("hw_bp: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); + + GUEST_SYNC(2); + + /* Hardware-breakpoint + svc */ + reset_debug_state(); + install_hw_bp(PC(bp_svc)); + asm volatile("bp_svc: svc #0"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); + GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); + + GUEST_SYNC(3); + + /* Hardware-breakpoint + software-breakpoint */ + reset_debug_state(); + install_hw_bp(PC(bp_brk)); + asm volatile("bp_brk: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); + + GUEST_SYNC(4); + + /* Watchpoint */ + reset_debug_state(); + install_wp(PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + GUEST_SYNC(5); + + /* Single-step */ + reset_debug_state(); + install_ss(); + ss_idx = 0; + asm volatile("ss_start:\n" + "mrs x0, esr_el1\n" + "add x0, x0, #1\n" + "msr daifset, #8\n" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], PC(ss_start)); + GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); + GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); + + GUEST_DONE(); +} + +static void guest_sw_bp_handler(struct ex_regs *regs) +{ + sw_bp_addr = regs->pc; + regs->pc += 4; +} + +static void guest_hw_bp_handler(struct ex_regs *regs) +{ + hw_bp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_wp_handler(struct ex_regs *regs) +{ + wp_data_addr = read_sysreg(far_el1); + wp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_ss_handler(struct ex_regs *regs) +{ + GUEST_ASSERT_1(ss_idx < 4, ss_idx); + ss_addr[ss_idx++] = regs->pc; + regs->pstate |= SPSR_SS; +} + +static void guest_svc_handler(struct ex_regs *regs) +{ + svc_addr = regs->pc; +} + +static int debug_version(struct kvm_vm *vm) +{ + uint64_t id_aa64dfr0; + + get_reg(vm, VCPU_ID, ARM64_SYS_REG(ID_AA64DFR0_EL1), &id_aa64dfr0); + return id_aa64dfr0 & 0xf; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct ucall uc; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + ucall_init(vm, NULL); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + if (debug_version(vm) < 6) { + print_skip("Armv8 debug architecture not supported."); + kvm_vm_free(vm); + exit(KSFT_SKIP); + } + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_BRK_INS, guest_sw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_HW_BP_CURRENT, guest_hw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_WP_CURRENT, guest_wp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SSTEP_CURRENT, guest_ss_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SVC64, guest_svc_handler); + + for (stage = 0; stage < 7; stage++) { + vcpu_run(vm, VCPU_ID); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Stage %d: Unexpected sync ucall, got %lx", + stage, (ulong)uc.args[1]); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c deleted file mode 100644 index efba76682b4b..000000000000 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c +++ /dev/null @@ -1,3 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define REG_LIST_SVE -#include "get-reg-list.c" diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 486932164cf2..cc898181faab 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -27,17 +27,37 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> #include "kvm_util.h" #include "test_util.h" #include "processor.h" -#ifdef REG_LIST_SVE -#define reg_list_sve() (true) -#else -#define reg_list_sve() (false) -#endif +static struct kvm_reg_list *reg_list; +static __u64 *blessed_reg, blessed_n; -#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) +struct reg_sublist { + const char *name; + long capability; + int feature; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; +}; + +struct vcpu_config { + char *name; + struct reg_sublist sublists[]; +}; + +static struct vcpu_config *vcpu_configs[]; +static int vcpu_configs_n; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) #define for_each_reg(i) \ for ((i) = 0; (i) < reg_list->n; ++(i)) @@ -54,12 +74,41 @@ for_each_reg_filtered(i) \ if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) +static const char *config_name(struct vcpu_config *c) +{ + struct reg_sublist *s; + int len = 0; -static struct kvm_reg_list *reg_list; + if (c->name) + return c->name; -static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; -static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; -static __u64 *blessed_reg, blessed_n; + for_each_sublist(c, s) + len += strlen(s->name) + 1; + + c->name = malloc(len); + + len = 0; + for_each_sublist(c, s) { + if (!strcmp(s->name, "base")) + continue; + strcat(c->name + len, s->name); + len += strlen(s->name) + 1; + c->name[len - 1] = '+'; + } + c->name[len - 1] = '\0'; + + return c->name; +} + +static bool has_cap(struct vcpu_config *c, long capability) +{ + struct reg_sublist *s; + + for_each_sublist(c, s) + if (s->capability == capability) + return true; + return false; +} static bool filter_reg(__u64 reg) { @@ -96,11 +145,13 @@ static const char *str_with_index(const char *template, __u64 index) return (const char *)str; } +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + #define CORE_REGS_XX_NR_WORDS 2 #define CORE_SPSR_XX_NR_WORDS 2 #define CORE_FPREGS_XX_NR_WORDS 4 -static const char *core_id_to_str(__u64 id) +static const char *core_id_to_str(struct vcpu_config *c, __u64 id) { __u64 core_off = id & ~REG_MASK, idx; @@ -111,7 +162,7 @@ static const char *core_id_to_str(__u64 id) case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; - TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx); + TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx); case KVM_REG_ARM_CORE_REG(regs.sp): return "KVM_REG_ARM_CORE_REG(regs.sp)"; @@ -126,12 +177,12 @@ static const char *core_id_to_str(__u64 id) case KVM_REG_ARM_CORE_REG(spsr[0]) ... KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; - TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx); + TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx); case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; - TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx); + TEST_ASSERT(idx < 32, "%s: Unexpected fp_regs.vregs index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx); case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; @@ -139,11 +190,11 @@ static const char *core_id_to_str(__u64 id) return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; } - TEST_FAIL("Unknown core reg id: 0x%llx", id); + TEST_FAIL("%s: Unknown core reg id: 0x%llx", config_name(c), id); return NULL; } -static const char *sve_id_to_str(__u64 id) +static const char *sve_id_to_str(struct vcpu_config *c, __u64 id) { __u64 sve_off, n, i; @@ -153,37 +204,37 @@ static const char *sve_id_to_str(__u64 id) sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); - TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id); + TEST_ASSERT(i == 0, "%s: Currently we don't expect slice > 0, reg id 0x%llx", config_name(c), id); switch (sve_off) { case KVM_REG_ARM64_SVE_ZREG_BASE ... KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), - "Unexpected bits set in SVE ZREG id: 0x%llx", id); + "%s: Unexpected bits set in SVE ZREG id: 0x%llx", config_name(c), id); return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n); case KVM_REG_ARM64_SVE_PREG_BASE ... KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), - "Unexpected bits set in SVE PREG id: 0x%llx", id); + "%s: Unexpected bits set in SVE PREG id: 0x%llx", config_name(c), id); return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n); case KVM_REG_ARM64_SVE_FFR_BASE: TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), - "Unexpected bits set in SVE FFR id: 0x%llx", id); + "%s: Unexpected bits set in SVE FFR id: 0x%llx", config_name(c), id); return "KVM_REG_ARM64_SVE_FFR(0)"; } return NULL; } -static void print_reg(__u64 id) +static void print_reg(struct vcpu_config *c, __u64 id) { unsigned op0, op1, crn, crm, op2; const char *reg_size = NULL; TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, - "KVM_REG_ARM64 missing in reg id: 0x%llx", id); + "%s: KVM_REG_ARM64 missing in reg id: 0x%llx", config_name(c), id); switch (id & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U8: @@ -214,17 +265,17 @@ static void print_reg(__u64 id) reg_size = "KVM_REG_SIZE_U2048"; break; default: - TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx", - (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + TEST_FAIL("%s: Unexpected reg size: 0x%llx in reg id: 0x%llx", + config_name(c), (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); } switch (id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: - printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id)); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(c, id)); break; case KVM_REG_ARM_DEMUX: TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), - "Unexpected bits set in DEMUX reg id: 0x%llx", id); + "%s: Unexpected bits set in DEMUX reg id: 0x%llx", config_name(c), id); printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); break; @@ -235,23 +286,23 @@ static void print_reg(__u64 id) crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), - "Unexpected bits set in SYSREG reg id: 0x%llx", id); + "%s: Unexpected bits set in SYSREG reg id: 0x%llx", config_name(c), id); printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); break; case KVM_REG_ARM_FW: TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), - "Unexpected bits set in FW reg id: 0x%llx", id); + "%s: Unexpected bits set in FW reg id: 0x%llx", config_name(c), id); printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); break; case KVM_REG_ARM64_SVE: - if (reg_list_sve()) - printf("\t%s,\n", sve_id_to_str(id)); + if (has_cap(c, KVM_CAP_ARM_SVE)) + printf("\t%s,\n", sve_id_to_str(c, id)); else - TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + TEST_FAIL("%s: KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", config_name(c), id); break; default: - TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", - (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + TEST_FAIL("%s: Unexpected coproc type: 0x%llx in reg id: 0x%llx", + config_name(c), (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); } } @@ -312,56 +363,58 @@ static void core_reg_fixup(void) reg_list = tmp; } -static void prepare_vcpu_init(struct kvm_vcpu_init *init) +static void prepare_vcpu_init(struct vcpu_config *c, struct kvm_vcpu_init *init) { - if (reg_list_sve()) - init->features[0] |= 1 << KVM_ARM_VCPU_SVE; + struct reg_sublist *s; + + for_each_sublist(c, s) + if (s->capability) + init->features[s->feature / 32] |= 1 << (s->feature % 32); } -static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid, struct vcpu_config *c) { + struct reg_sublist *s; int feature; - if (reg_list_sve()) { - feature = KVM_ARM_VCPU_SVE; - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + for_each_sublist(c, s) { + if (s->finalize) { + feature = s->feature; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + } } } -static void check_supported(void) +static void check_supported(struct vcpu_config *c) { - if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) { - fprintf(stderr, "SVE not available, skipping tests\n"); - exit(KSFT_SKIP); + struct reg_sublist *s; + + for_each_sublist(c, s) { + if (s->capability && !kvm_check_cap(s->capability)) { + fprintf(stderr, "%s: %s not available, skipping tests\n", config_name(c), s->name); + exit(KSFT_SKIP); + } } } -int main(int ac, char **av) +static bool print_list; +static bool print_filtered; +static bool fixup_core_regs; + +static void run_test(struct vcpu_config *c) { struct kvm_vcpu_init init = { .target = -1, }; - int new_regs = 0, missing_regs = 0, i; + int new_regs = 0, missing_regs = 0, i, n; int failed_get = 0, failed_set = 0, failed_reject = 0; - bool print_list = false, print_filtered = false, fixup_core_regs = false; struct kvm_vm *vm; - __u64 *vec_regs; + struct reg_sublist *s; - check_supported(); - - for (i = 1; i < ac; ++i) { - if (strcmp(av[i], "--core-reg-fixup") == 0) - fixup_core_regs = true; - else if (strcmp(av[i], "--list") == 0) - print_list = true; - else if (strcmp(av[i], "--list-filtered") == 0) - print_filtered = true; - else - TEST_FAIL("Unknown option: %s\n", av[i]); - } + check_supported(c); vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); - prepare_vcpu_init(&init); + prepare_vcpu_init(c, &init); aarch64_vcpu_add_default(vm, 0, &init, NULL); - finalize_vcpu(vm, 0); + finalize_vcpu(vm, 0, c); reg_list = vcpu_get_reg_list(vm, 0); @@ -374,10 +427,10 @@ int main(int ac, char **av) __u64 id = reg_list->reg[i]; if ((print_list && !filter_reg(id)) || (print_filtered && filter_reg(id))) - print_reg(id); + print_reg(c, id); } putchar('\n'); - return 0; + return; } /* @@ -396,50 +449,52 @@ int main(int ac, char **av) .id = reg_list->reg[i], .addr = (__u64)&addr, }; + bool reject_reg = false; int ret; ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); if (ret) { - puts("Failed to get "); - print_reg(reg.id); + printf("%s: Failed to get ", config_name(c)); + print_reg(c, reg.id); putchar('\n'); ++failed_get; } /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */ - if (find_reg(rejects_set, rejects_set_n, reg.id)) { - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); - if (ret != -1 || errno != EPERM) { - printf("Failed to reject (ret=%d, errno=%d) ", ret, errno); - print_reg(reg.id); - putchar('\n'); - ++failed_reject; + for_each_sublist(c, s) { + if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { + reject_reg = true; + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret != -1 || errno != EPERM) { + printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); + print_reg(c, reg.id); + putchar('\n'); + ++failed_reject; + } + break; } - continue; } - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); - if (ret) { - puts("Failed to set "); - print_reg(reg.id); - putchar('\n'); - ++failed_set; + if (!reject_reg) { + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret) { + printf("%s: Failed to set ", config_name(c)); + print_reg(c, reg.id); + putchar('\n'); + ++failed_set; + } } } - if (reg_list_sve()) { - blessed_n = base_regs_n + sve_regs_n; - vec_regs = sve_regs; - } else { - blessed_n = base_regs_n + vregs_n; - vec_regs = vregs; - } - + for_each_sublist(c, s) + blessed_n += s->regs_n; blessed_reg = calloc(blessed_n, sizeof(__u64)); - for (i = 0; i < base_regs_n; ++i) - blessed_reg[i] = base_regs[i]; - for (i = 0; i < blessed_n - base_regs_n; ++i) - blessed_reg[base_regs_n + i] = vec_regs[i]; + + n = 0; + for_each_sublist(c, s) { + for (i = 0; i < s->regs_n; ++i) + blessed_reg[n++] = s->regs[i]; + } for_each_new_reg(i) ++new_regs; @@ -448,40 +503,141 @@ int main(int ac, char **av) ++missing_regs; if (new_regs || missing_regs) { - printf("Number blessed registers: %5lld\n", blessed_n); - printf("Number registers: %5lld\n", reg_list->n); + printf("%s: Number blessed registers: %5lld\n", config_name(c), blessed_n); + printf("%s: Number registers: %5lld\n", config_name(c), reg_list->n); } if (new_regs) { - printf("\nThere are %d new registers.\n" + printf("\n%s: There are %d new registers.\n" "Consider adding them to the blessed reg " - "list with the following lines:\n\n", new_regs); + "list with the following lines:\n\n", config_name(c), new_regs); for_each_new_reg(i) - print_reg(reg_list->reg[i]); + print_reg(c, reg_list->reg[i]); putchar('\n'); } if (missing_regs) { - printf("\nThere are %d missing registers.\n" - "The following lines are missing registers:\n\n", missing_regs); + printf("\n%s: There are %d missing registers.\n" + "The following lines are missing registers:\n\n", config_name(c), missing_regs); for_each_missing_reg(i) - print_reg(blessed_reg[i]); + print_reg(c, blessed_reg[i]); putchar('\n'); } TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, - "There are %d missing registers; " + "%s: There are %d missing registers; " "%d registers failed get; %d registers failed set; %d registers failed reject", - missing_regs, failed_get, failed_set, failed_reject); + config_name(c), missing_regs, failed_get, failed_set, failed_reject); - return 0; + pr_info("%s: PASS\n", config_name(c)); + blessed_n = 0; + free(blessed_reg); + free(reg_list); + kvm_vm_free(vm); +} + +static void help(void) +{ + struct vcpu_config *c; + int i; + + printf( + "\n" + "usage: get-reg-list [--config=<selection>] [--list] [--list-filtered] [--core-reg-fixup]\n\n" + " --config=<selection> Used to select a specific vcpu configuration for the test/listing\n" + " '<selection>' may be\n"); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + printf( + " '%s'\n", config_name(c)); + } + + printf( + "\n" + " --list Print the register list rather than test it (requires --config)\n" + " --list-filtered Print registers that would normally be filtered out (requires --config)\n" + " --core-reg-fixup Needed when running on old kernels with broken core reg listings\n" + "\n" + ); +} + +static struct vcpu_config *parse_config(const char *config) +{ + struct vcpu_config *c; + int i; + + if (config[8] != '=') + help(), exit(1); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (strcmp(config_name(c), &config[9]) == 0) + break; + } + + if (i == vcpu_configs_n) + help(), exit(1); + + return c; +} + +int main(int ac, char **av) +{ + struct vcpu_config *c, *sel = NULL; + int i, ret = 0; + pid_t pid; + + for (i = 1; i < ac; ++i) { + if (strcmp(av[i], "--core-reg-fixup") == 0) + fixup_core_regs = true; + else if (strncmp(av[i], "--config", 8) == 0) + sel = parse_config(av[i]); + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else if (strcmp(av[i], "--list-filtered") == 0) + print_filtered = true; + else if (strcmp(av[i], "--help") == 0 || strcmp(av[1], "-h") == 0) + help(), exit(0); + else + help(), exit(1); + } + + if (print_list || print_filtered) { + /* + * We only want to print the register list of a single config. + */ + if (!sel) + help(), exit(1); + } + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (sel && c != sel) + continue; + + pid = fork(); + + if (!pid) { + run_test(c); + exit(0); + } else { + int wstatus; + pid_t wpid = wait(&wstatus); + TEST_ASSERT(wpid == pid && WIFEXITED(wstatus), "wait: Unexpected return"); + if (WEXITSTATUS(wstatus) && WEXITSTATUS(wstatus) != KSFT_SKIP) + ret = KSFT_FAIL; + } + } + + return ret; } /* * The current blessed list was primed with the output of kernel version * v4.15 with --core-reg-fixup and then later updated with new registers. * - * The blessed list is up to date with kernel version v5.10-rc5 + * The blessed list is up to date with kernel version v5.13-rc3 */ static __u64 base_regs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), @@ -673,8 +829,6 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ - ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ - ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ @@ -683,6 +837,16 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ +}; + +static __u64 pmu_regs[] = { + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ @@ -692,8 +856,6 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 8, 0), ARM64_SYS_REG(3, 3, 14, 8, 1), ARM64_SYS_REG(3, 3, 14, 8, 2), @@ -757,11 +919,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 3, 14, 15, 5), ARM64_SYS_REG(3, 3, 14, 15, 6), ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ - ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ }; -static __u64 base_regs_n = ARRAY_SIZE(base_regs); static __u64 vregs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), @@ -797,7 +955,6 @@ static __u64 vregs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), }; -static __u64 vregs_n = ARRAY_SIZE(vregs); static __u64 sve_regs[] = { KVM_REG_ARM64_SVE_VLS, @@ -852,11 +1009,58 @@ static __u64 sve_regs[] = { KVM_REG_ARM64_SVE_FFR(0), ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ }; -static __u64 sve_regs_n = ARRAY_SIZE(sve_regs); -static __u64 rejects_set[] = { -#ifdef REG_LIST_SVE +static __u64 sve_rejects_set[] = { KVM_REG_ARM64_SVE_VLS, -#endif }; -static __u64 rejects_set_n = ARRAY_SIZE(rejects_set); + +#define BASE_SUBLIST \ + { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } +#define VREGS_SUBLIST \ + { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), } +#define PMU_SUBLIST \ + { "pmu", .capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \ + .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), } +#define SVE_SUBLIST \ + { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \ + .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \ + .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), } + +static struct vcpu_config vregs_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + {0}, + }, +}; +static struct vcpu_config vregs_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static struct vcpu_config sve_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + {0}, + }, +}; +static struct vcpu_config sve_pmu_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +static struct vcpu_config *vcpu_configs[] = { + &vregs_config, + &vregs_pmu_config, + &sve_config, + &sve_pmu_config, +}; +static int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c new file mode 100644 index 000000000000..018c269990e1 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the + * CPU_ON PSCI call matches what the caller requested. + * + * Copyright (c) 2021 Google LLC. + * + * This is a regression test for a race between KVM servicing the PSCI call and + * userspace reading the vCPUs registers. + */ + +#define _GNU_SOURCE + +#include <linux/psci.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define VCPU_ID_SOURCE 0 +#define VCPU_ID_TARGET 1 + +#define CPU_ON_ENTRY_ADDR 0xfeedf00dul +#define CPU_ON_CONTEXT_ID 0xdeadc0deul + +static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, + uint64_t context_id) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_CPU_ON; + register uint64_t x1 asm("x1") = target_cpu; + register uint64_t x2 asm("x2") = entry_addr; + register uint64_t x3 asm("x3") = context_id; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2), "r"(x3) + : "memory"); + + return x0; +} + +static uint64_t psci_affinity_info(uint64_t target_affinity, + uint64_t lowest_affinity_level) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_AFFINITY_INFO; + register uint64_t x1 asm("x1") = target_affinity; + register uint64_t x2 asm("x2") = lowest_affinity_level; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2) + : "memory"); + + return x0; +} + +static void guest_main(uint64_t target_cpu) +{ + GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); + uint64_t target_state; + + do { + target_state = psci_affinity_info(target_cpu, 0); + + GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || + (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); + } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + GUEST_DONE(); +} + +int main(void) +{ + uint64_t target_mpidr, obs_pc, obs_x0; + struct kvm_vcpu_init init; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name); + ucall_init(vm, NULL); + + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_main); + + /* + * make sure the target is already off when executing the test. + */ + init.features[0] |= (1 << KVM_ARM_VCPU_POWER_OFF); + aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_main); + + get_reg(vm, VCPU_ID_TARGET, ARM64_SYS_REG(MPIDR_EL1), &target_mpidr); + vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK); + vcpu_run(vm, VCPU_ID_SOURCE); + + switch (get_ucall(vm, VCPU_ID_SOURCE, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, + uc.args[1]); + break; + default: + TEST_FAIL("Unhandled ucall: %lu", uc.cmd); + } + + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.pc), &obs_pc); + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + + TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, + "unexpected target cpu pc: %lx (expected: %lx)", + obs_pc, CPU_ON_ENTRY_ADDR); + TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, + "unexpected target context id: %lx (expected: %lx)", + obs_x0, CPU_ON_CONTEXT_ID); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c new file mode 100644 index 000000000000..5d95113c7b7c --- /dev/null +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * access_tracking_perf_test + * + * Copyright (C) 2021, Google, Inc. + * + * This test measures the performance effects of KVM's access tracking. + * Access tracking is driven by the MMU notifiers test_young, clear_young, and + * clear_flush_young. These notifiers do not have a direct userspace API, + * however the clear_young notifier can be triggered by marking a pages as idle + * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to + * enable access tracking on guest memory. + * + * To measure performance this test runs a VM with a configurable number of + * vCPUs that each touch every page in disjoint regions of memory. Performance + * is measured in the time it takes all vCPUs to finish touching their + * predefined region. + * + * Note that a deterministic correctness test of access tracking is not possible + * by using page_idle as it exists today. This is for a few reasons: + * + * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This + * means subsequent guest accesses are not guaranteed to see page table + * updates made by KVM until some time in the future. + * + * 2. page_idle only operates on LRU pages. Newly allocated pages are not + * immediately allocated to LRU lists. Instead they are held in a "pagevec", + * which is drained to LRU lists some time in the future. There is no + * userspace API to force this drain to occur. + * + * These limitations are worked around in this test by using a large enough + * region of memory for each vCPU such that the number of translations cached in + * the TLB and the number of pages held in pagevecs are a small fraction of the + * overall workload. And if either of those conditions are not true this test + * will fail rather than silently passing. + */ +#include <inttypes.h> +#include <limits.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "perf_test_util.h" +#include "guest_modes.h" + +/* Global variable used to synchronize all of the vCPU threads. */ +static int iteration = -1; + +/* Defines what vCPU threads should do during a given iteration. */ +static enum { + /* Run the vCPU to access all its memory. */ + ITERATION_ACCESS_MEMORY, + /* Mark the vCPU's memory idle in page_idle. */ + ITERATION_MARK_IDLE, +} iteration_work; + +/* Set to true when vCPU threads should exit. */ +static bool done; + +/* The iteration that was last completed by each vCPU. */ +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +/* Whether to overlap the regions of memory vCPUs access. */ +static bool overlap_memory_access; + +struct test_params { + /* The backing source for the region of memory. */ + enum vm_mem_backing_src_type backing_src; + + /* The amount of memory to allocate for each vCPU. */ + uint64_t vcpu_memory_bytes; + + /* The number of vCPUs to create in the VM. */ + int vcpus; +}; + +static uint64_t pread_uint64(int fd, const char *filename, uint64_t index) +{ + uint64_t value; + off_t offset = index * sizeof(value); + + TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value), + "pread from %s offset 0x%" PRIx64 " failed!", + filename, offset); + + return value; + +} + +#define PAGEMAP_PRESENT (1ULL << 63) +#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1) + +static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva) +{ + uint64_t hva = (uint64_t) addr_gva2hva(vm, gva); + uint64_t entry; + uint64_t pfn; + + entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize()); + if (!(entry & PAGEMAP_PRESENT)) + return 0; + + pfn = entry & PAGEMAP_PFN_MASK; + if (!pfn) { + print_skip("Looking up PFNs requires CAP_SYS_ADMIN"); + exit(KSFT_SKIP); + } + + return pfn; +} + +static bool is_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64); + + return !!((bits >> (pfn % 64)) & 1); +} + +static void mark_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = 1ULL << (pfn % 64); + + TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8, + "Set page_idle bits for PFN 0x%" PRIx64, pfn); +} + +static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) +{ + uint64_t base_gva = perf_test_args.vcpu_args[vcpu_id].gva; + uint64_t pages = perf_test_args.vcpu_args[vcpu_id].pages; + uint64_t page; + uint64_t still_idle = 0; + uint64_t no_pfn = 0; + int page_idle_fd; + int pagemap_fd; + + /* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */ + if (overlap_memory_access && vcpu_id) + return; + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle."); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap."); + + for (page = 0; page < pages; page++) { + uint64_t gva = base_gva + page * perf_test_args.guest_page_size; + uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva); + + if (!pfn) { + no_pfn++; + continue; + } + + if (is_page_idle(page_idle_fd, pfn)) { + still_idle++; + continue; + } + + mark_page_idle(page_idle_fd, pfn); + } + + /* + * Assumption: Less than 1% of pages are going to be swapped out from + * under us during this test. + */ + TEST_ASSERT(no_pfn < pages / 100, + "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.", + vcpu_id, no_pfn, pages); + + /* + * Test that at least 90% of memory has been marked idle (the rest might + * not be marked idle because the pages have not yet made it to an LRU + * list or the translations are still cached in the TLB). 90% is + * arbitrary; high enough that we ensure most memory access went through + * access tracking but low enough as to not make the test too brittle + * over time and across architectures. + */ + TEST_ASSERT(still_idle < pages / 10, + "vCPU%d: Too many pages still idle (%"PRIu64 " out of %" + PRIu64 ").\n", + vcpu_id, still_idle, pages); + + close(page_idle_fd); + close(pagemap_fd); +} + +static void assert_ucall(struct kvm_vm *vm, uint32_t vcpu_id, + uint64_t expected_ucall) +{ + struct ucall uc; + uint64_t actual_ucall = get_ucall(vm, vcpu_id, &uc); + + TEST_ASSERT(expected_ucall == actual_ucall, + "Guest exited unexpectedly (expected ucall %" PRIu64 + ", got %" PRIu64 ")", + expected_ucall, actual_ucall); +} + +static bool spin_wait_for_next_iteration(int *current_iteration) +{ + int last_iteration = *current_iteration; + + do { + if (READ_ONCE(done)) + return false; + + *current_iteration = READ_ONCE(iteration); + } while (last_iteration == *current_iteration); + + return true; +} + +static void *vcpu_thread_main(void *arg) +{ + struct perf_test_vcpu_args *vcpu_args = arg; + struct kvm_vm *vm = perf_test_args.vm; + int vcpu_id = vcpu_args->vcpu_id; + int current_iteration = -1; + + while (spin_wait_for_next_iteration(¤t_iteration)) { + switch (READ_ONCE(iteration_work)) { + case ITERATION_ACCESS_MEMORY: + vcpu_run(vm, vcpu_id); + assert_ucall(vm, vcpu_id, UCALL_SYNC); + break; + case ITERATION_MARK_IDLE: + mark_vcpu_memory_idle(vm, vcpu_id); + break; + }; + + vcpu_last_completed_iteration[vcpu_id] = current_iteration; + } + + return NULL; +} + +static void spin_wait_for_vcpu(int vcpu_id, int target_iteration) +{ + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != + target_iteration) { + continue; + } +} + +/* The type of memory accesses to perform in the VM. */ +enum access_type { + ACCESS_READ, + ACCESS_WRITE, +}; + +static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description) +{ + struct timespec ts_start; + struct timespec ts_elapsed; + int next_iteration; + int vcpu_id; + + /* Kick off the vCPUs by incrementing iteration. */ + next_iteration = ++iteration; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + + /* Wait for all vCPUs to finish the iteration. */ + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) + spin_wait_for_vcpu(vcpu_id, next_iteration); + + ts_elapsed = timespec_elapsed(ts_start); + pr_info("%-30s: %ld.%09lds\n", + description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec); +} + +static void access_memory(struct kvm_vm *vm, int vcpus, enum access_type access, + const char *description) +{ + perf_test_args.wr_fract = (access == ACCESS_READ) ? INT_MAX : 1; + sync_global_to_guest(vm, perf_test_args); + iteration_work = ITERATION_ACCESS_MEMORY; + run_iteration(vm, vcpus, description); +} + +static void mark_memory_idle(struct kvm_vm *vm, int vcpus) +{ + /* + * Even though this parallelizes the work across vCPUs, this is still a + * very slow operation because page_idle forces the test to mark one pfn + * at a time and the clear_young notifier serializes on the KVM MMU + * lock. + */ + pr_debug("Marking VM memory idle (slow)...\n"); + iteration_work = ITERATION_MARK_IDLE; + run_iteration(vm, vcpus, "Mark memory idle"); +} + +static pthread_t *create_vcpu_threads(int vcpus) +{ + pthread_t *vcpu_threads; + int i; + + vcpu_threads = malloc(vcpus * sizeof(vcpu_threads[0])); + TEST_ASSERT(vcpu_threads, "Failed to allocate vcpu_threads."); + + for (i = 0; i < vcpus; i++) { + vcpu_last_completed_iteration[i] = iteration; + pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main, + &perf_test_args.vcpu_args[i]); + } + + return vcpu_threads; +} + +static void terminate_vcpu_threads(pthread_t *vcpu_threads, int vcpus) +{ + int i; + + /* Set done to signal the vCPU threads to exit */ + done = true; + + for (i = 0; i < vcpus; i++) + pthread_join(vcpu_threads[i], NULL); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *params = arg; + struct kvm_vm *vm; + pthread_t *vcpu_threads; + int vcpus = params->vcpus; + + vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, + params->backing_src); + + perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, + !overlap_memory_access); + + vcpu_threads = create_vcpu_threads(vcpus); + + pr_info("\n"); + access_memory(vm, vcpus, ACCESS_WRITE, "Populating memory"); + + /* As a control, read and write to the populated memory first. */ + access_memory(vm, vcpus, ACCESS_WRITE, "Writing to populated memory"); + access_memory(vm, vcpus, ACCESS_READ, "Reading from populated memory"); + + /* Repeat on memory that has been marked as idle. */ + mark_memory_idle(vm, vcpus); + access_memory(vm, vcpus, ACCESS_WRITE, "Writing to idle memory"); + mark_memory_idle(vm, vcpus); + access_memory(vm, vcpus, ACCESS_READ, "Reading from idle memory"); + + terminate_vcpu_threads(vcpu_threads, vcpus); + free(vcpu_threads); + perf_test_destroy_vm(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n", + name); + puts(""); + printf(" -h: Display this help message."); + guest_modes_help(); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + backing_src_help("-s"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct test_params params = { + .backing_src = DEFAULT_VM_MEM_SRC, + .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, + .vcpus = 1, + }; + int page_idle_fd; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 'b': + params.vcpu_memory_bytes = parse_size(optarg); + break; + case 'v': + params.vcpus = atoi(optarg); + break; + case 'o': + overlap_memory_access = true; + break; + case 's': + params.backing_src = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + if (page_idle_fd < 0) { + print_skip("CONFIG_IDLE_PAGE_TRACKING is not enabled"); + exit(KSFT_SKIP); + } + close(page_idle_fd); + + for_each_guest_mode(run_test, ¶ms); + + return 0; +} diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index b74704305835..1510b21e6306 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -52,7 +52,6 @@ static void *vcpu_worker(void *data) struct timespec start; struct timespec ts_diff; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); clock_gettime(CLOCK_MONOTONIC, &start); @@ -180,7 +179,7 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } - if (!pollfd[0].revents & POLLIN) + if (!(pollfd[0].revents & POLLIN)) continue; r = read(uffd, &msg, sizeof(msg)); @@ -293,7 +292,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type); perf_test_args.wr_fract = 1; @@ -417,7 +416,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-t type] [-v vcpus] [-o]\n", name); + " [-b memory] [-s type] [-v vcpus] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); @@ -427,8 +426,7 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); - printf(" -t: The type of backing memory to use. Default: anonymous\n"); - backing_src_help(); + backing_src_help("-s"); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -440,14 +438,14 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -466,7 +464,7 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; - case 't': + case 's': p.src_type = parse_backing_src_type(optarg); break; case 'v': @@ -486,7 +484,7 @@ int main(int argc, char *argv[]) if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && !backing_src_is_shared(p.src_type)) { - TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); } for_each_guest_mode(run_test, &p); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 04a2641261be..7ffab5bd5ce5 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -44,7 +44,6 @@ static void *vcpu_worker(void *data) struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); while (!READ_ONCE(host_quit)) { @@ -94,16 +93,89 @@ struct test_params { int wr_fract; bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; + int slots; }; +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +static inline void enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); + } +} + +static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); + } +} + +static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot) +{ + unsigned long **bitmaps; + int i; + + bitmaps = malloc(slots * sizeof(bitmaps[0])); + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); + + for (i = 0; i < slots; i++) { + bitmaps[i] = bitmap_zalloc(pages_per_slot); + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); + } + + return bitmaps; +} + +static void free_bitmaps(unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) + free(bitmaps[i]); + + free(bitmaps); +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; pthread_t *vcpu_threads; struct kvm_vm *vm; - unsigned long *bmap; + unsigned long **bitmaps; uint64_t guest_num_pages; uint64_t host_num_pages; + uint64_t pages_per_slot; int vcpu_id; struct timespec start; struct timespec ts_diff; @@ -114,14 +186,16 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - p->backing_src); + p->slots, p->backing_src); perf_test_args.wr_fract = p->wr_fract; guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - bmap = bitmap_alloc(host_num_pages); + pages_per_slot = host_num_pages / p->slots; + + bitmaps = alloc_bitmaps(p->slots, pages_per_slot); if (dirty_log_manual_caps) { cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; @@ -163,8 +237,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, - KVM_MEM_LOG_DIRTY_PAGES); + enable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -190,8 +263,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); - + get_dirty_log(vm, bitmaps, p->slots); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -200,9 +272,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); - + clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -213,7 +283,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); + disable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -235,7 +305,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); } - free(bmap); + free_bitmaps(bitmaps, p->slots); free(vcpu_threads); perf_test_destroy_vm(vm); } @@ -244,7 +314,8 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " - "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name); + "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -261,9 +332,9 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n\n"); - backing_src_help(); + backing_src_help("-s"); + printf(" -x: Split the memory region into this number of memslots.\n" + " (default: 1)\n"); puts(""); exit(0); } @@ -275,7 +346,8 @@ int main(int argc, char *argv[]) .iterations = TEST_HOST_LOOP_N, .wr_fract = 1, .partition_vcpu_memory_access = true, - .backing_src = VM_MEM_SRC_ANONYMOUS, + .backing_src = DEFAULT_VM_MEM_SRC, + .slots = 1, }; int opt; @@ -286,7 +358,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) { + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { case 'i': p.iterations = atoi(optarg); @@ -312,9 +384,13 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + break; case 's': p.backing_src = parse_backing_src_type(optarg); break; + case 'x': + p.slots = atoi(optarg); + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 81edbd23d371..792c60e1b17d 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -16,7 +16,6 @@ #include <errno.h> #include <linux/bitmap.h> #include <linux/bitops.h> -#include <asm/barrier.h> #include <linux/atomic.h> #include "kvm_util.h" @@ -681,7 +680,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); #endif @@ -750,8 +749,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - bmap = bitmap_alloc(host_num_pages); - host_bmap_track = bitmap_alloc(host_num_pages); + bmap = bitmap_zalloc(host_num_pages); + host_bmap_track = bitmap_zalloc(host_num_pages); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -761,7 +760,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) KVM_MEM_LOG_DIRTY_PAGES); /* Do mapping for the dirty track memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 4b8db3bce610..b21c69a56daa 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -105,7 +105,7 @@ static void run_test(uint32_t run) CPU_SET(i, &cpu_set); vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); pr_debug("%s: [%d] start vcpus\n", __func__, run); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index b7fa0c8551db..c0273aefa63d 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,16 +8,21 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include <linux/stringify.h> #define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) -#define CPACR_EL1 3, 0, 1, 0, 2 -#define TCR_EL1 3, 0, 2, 0, 2 -#define MAIR_EL1 3, 0, 10, 2, 0 -#define TTBR0_EL1 3, 0, 2, 0, 0 -#define SCTLR_EL1 3, 0, 1, 0, 0 +#define CPACR_EL1 3, 0, 1, 0, 2 +#define TCR_EL1 3, 0, 2, 0, 2 +#define MAIR_EL1 3, 0, 10, 2, 0 +#define MPIDR_EL1 3, 0, 0, 0, 5 +#define TTBR0_EL1 3, 0, 2, 0, 0 +#define SCTLR_EL1 3, 0, 1, 0, 0 +#define VBAR_EL1 3, 0, 12, 0, 0 + +#define ID_AA64DFR0_EL1 3, 0, 0, 5, 0 /* * Default MAIR @@ -36,6 +41,8 @@ (0xfful << (4 * 8)) | \ (0xbbul << (5 * 8))) +#define MPIDR_HWID_BITMASK (0xff00fffffful) + static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) { struct kvm_one_reg reg; @@ -56,4 +63,73 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init, void *guest_code); +struct ex_regs { + u64 regs[31]; + u64 sp; + u64 pc; + u64 pstate; +}; + +#define VECTOR_NUM 16 + +enum { + VECTOR_SYNC_CURRENT_SP0, + VECTOR_IRQ_CURRENT_SP0, + VECTOR_FIQ_CURRENT_SP0, + VECTOR_ERROR_CURRENT_SP0, + + VECTOR_SYNC_CURRENT, + VECTOR_IRQ_CURRENT, + VECTOR_FIQ_CURRENT, + VECTOR_ERROR_CURRENT, + + VECTOR_SYNC_LOWER_64, + VECTOR_IRQ_LOWER_64, + VECTOR_FIQ_LOWER_64, + VECTOR_ERROR_LOWER_64, + + VECTOR_SYNC_LOWER_32, + VECTOR_IRQ_LOWER_32, + VECTOR_FIQ_LOWER_32, + VECTOR_ERROR_LOWER_32, +}; + +#define VECTOR_IS_SYNC(v) ((v) == VECTOR_SYNC_CURRENT_SP0 || \ + (v) == VECTOR_SYNC_CURRENT || \ + (v) == VECTOR_SYNC_LOWER_64 || \ + (v) == VECTOR_SYNC_LOWER_32) + +#define ESR_EC_NUM 64 +#define ESR_EC_SHIFT 26 +#define ESR_EC_MASK (ESR_EC_NUM - 1) + +#define ESR_EC_SVC64 0x15 +#define ESR_EC_HW_BP_CURRENT 0x31 +#define ESR_EC_SSTEP_CURRENT 0x33 +#define ESR_EC_WP_CURRENT 0x35 +#define ESR_EC_BRK_INS 0x3c + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); + +typedef void(*handler_fn)(struct ex_regs *); +void vm_install_exception_handler(struct kvm_vm *vm, + int vector, handler_fn handler); +void vm_install_sync_handler(struct kvm_vm *vm, + int vector, int ec, handler_fn handler); + +#define write_sysreg(reg, val) \ +({ \ + u64 __val = (u64)(val); \ + asm volatile("msr " __stringify(reg) ", %x0" : : "rZ" (__val)); \ +}) + +#define read_sysreg(reg) \ +({ u64 val; \ + asm volatile("mrs %0, "__stringify(reg) : "=r"(val) : : "memory");\ + val; \ +}) + +#define isb() asm volatile("isb" : : : "memory") + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 35739567189e..010b59b13917 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -30,6 +30,7 @@ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_GUEST_PHY_PAGES 512 #define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 @@ -44,6 +45,7 @@ enum vm_guest_mode { VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, NUM_VM_MODES, }; @@ -61,7 +63,7 @@ enum vm_guest_mode { #elif defined(__s390x__) -#define VM_MODE_DEFAULT VM_MODE_P47V64_4K +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 16) @@ -98,8 +100,7 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm); int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); @@ -141,10 +142,12 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot); + unsigned int npages); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); @@ -237,7 +240,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, const char *exit_reason_str(unsigned int exit_reason); -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); +void virt_pgd_alloc(struct kvm_vm *vm); /* * VM Virtual Page Map @@ -255,13 +258,13 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); * Within @vm, creates a virtual translation for the page starting * at @vaddr to the page starting at @paddr. */ -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t memslot); +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, vm_paddr_t paddr_min, uint32_t memslot); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); /* * Create a VM with reasonable defaults @@ -351,6 +354,7 @@ enum { UCALL_SYNC, UCALL_ABORT, UCALL_DONE, + UCALL_UNHANDLED, }; #define UCALL_MAX_ARGS 6 @@ -369,26 +373,31 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) -#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2 + _nargs, \ - "Failed guest assert: " \ - #_condition, __LINE__, _args); \ +#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + _condstr, __LINE__, _args); \ } while (0) #define GUEST_ASSERT(_condition) \ - __GUEST_ASSERT((_condition), 0, 0) + __GUEST_ASSERT(_condition, #_condition, 0, 0) #define GUEST_ASSERT_1(_condition, arg1) \ - __GUEST_ASSERT((_condition), 1, (arg1)) + __GUEST_ASSERT(_condition, #_condition, 1, (arg1)) #define GUEST_ASSERT_2(_condition, arg1, arg2) \ - __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + __GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2)) #define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ - __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + __GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3)) #define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ - __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + __GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4)) + +#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b) + +int vm_get_stats_fd(struct kvm_vm *vm); +int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 005f2143adeb..df9f1a3a3ffb 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -44,7 +44,7 @@ extern struct perf_test_args perf_test_args; extern uint64_t guest_test_phys_mem; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src); void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d79be15dd3d2..f8fddc84c0d3 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -90,18 +90,23 @@ enum vm_mem_backing_src_type { NUM_SRC_TYPES, }; +#define DEFAULT_VM_MEM_SRC VM_MEM_SRC_ANONYMOUS + struct vm_mem_backing_src_alias { const char *name; uint32_t flag; }; +#define MIN_RUN_DELAY_NS 200000UL + bool thp_configured(void); size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); size_t get_backing_src_pagesz(uint32_t i); -void backing_src_help(void); +void backing_src_help(const char *flag); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +long get_run_delay(void); /* * Whether or not the given source type is shared memory (as opposed to diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h new file mode 100644 index 000000000000..0be4757f1f20 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/apic.h + * + * Copyright (C) 2021, Google LLC. + */ + +#ifndef SELFTEST_KVM_APIC_H +#define SELFTEST_KVM_APIC_H + +#include <stdint.h> + +#include "processor.h" + +#define APIC_DEFAULT_GPA 0xfee00000ULL + +/* APIC base address MSR and fields */ +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_EXTD (1<<10) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define GET_APIC_BASE(x) (((x) >> 12) << 12) + +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) +#define APIC_ID 0x20 +#define APIC_LVR 0x30 +#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) +#define APIC_TASKPRI 0x80 +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define SET_APIC_DEST_FIELD(x) ((x) << 24) + +void apic_disable(void); +void xapic_enable(void); +void x2apic_enable(void); + +static inline uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static inline uint32_t xapic_read_reg(unsigned int reg) +{ + return ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2]; +} + +static inline void xapic_write_reg(unsigned int reg, uint32_t val) +{ + ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2] = val; +} + +static inline uint64_t x2apic_read_reg(unsigned int reg) +{ + return rdmsr(APIC_BASE_MSR + (reg >> 4)); +} + +static inline void x2apic_write_reg(unsigned int reg, uint64_t value) +{ + wrmsr(APIC_BASE_MSR + (reg >> 4), value); +} + +#endif /* SELFTEST_KVM_APIC_H */ diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index a034438b6266..c9af97abd622 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * tools/testing/selftests/kvm/include/vmx.h + * tools/testing/selftests/kvm/include/x86_64/evmcs.h * * Copyright (C) 2018, Red Hat, Inc. * diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h new file mode 100644 index 000000000000..b66910702c0a --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/hyperv.h + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ + +#ifndef SELFTEST_KVM_HYPERV_H +#define SELFTEST_KVM_HYPERV_H + +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_HYPERCALL 0x40000001 +#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_X64_MSR_RESET 0x40000003 +#define HV_X64_MSR_VP_RUNTIME 0x40000010 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +#define HV_X64_MSR_GUEST_IDLE 0x400000F0 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* HYPERV_CPUID_FEATURES.EAX */ +#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) +#define HV_MSR_SYNIC_AVAILABLE BIT(2) +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) +#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) +#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) +#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) +#define HV_MSR_RESET_AVAILABLE BIT(7) +#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) +#define HV_ACCESS_FREQUENCY_MSRS BIT(11) +#define HV_ACCESS_REENLIGHTENMENT BIT(13) +#define HV_ACCESS_TSC_INVARIANT BIT(15) + +/* HYPERV_CPUID_FEATURES.EBX */ +#define HV_CREATE_PARTITIONS BIT(0) +#define HV_ACCESS_PARTITION_ID BIT(1) +#define HV_ACCESS_MEMORY_POOL BIT(2) +#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_POST_MESSAGES BIT(4) +#define HV_SIGNAL_EVENTS BIT(5) +#define HV_CREATE_PORT BIT(6) +#define HV_CONNECT_PORT BIT(7) +#define HV_ACCESS_STATS BIT(8) +#define HV_DEBUGGING BIT(11) +#define HV_CPU_MANAGEMENT BIT(12) +#define HV_ISOLATION BIT(22) + +/* HYPERV_CPUID_FEATURES.EDX */ +#define HV_X64_MWAIT_AVAILABLE BIT(0) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) +#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) + +/* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ +#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) +#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) +#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) +#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) +#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) +#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) + +/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1) + +/* Hypercalls */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_CREATE_VP 0x004e +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_ACCESS_DENIED 6 +#define HV_STATUS_OPERATION_DENIED 8 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* hypercall options */ +#define HV_HYPERCALL_FAST_BIT BIT(16) + +#endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0b30b4e15c38..05e65ca1c30c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -13,6 +13,8 @@ #include <asm/msr-index.h> +#include "../kvm_util.h" + #define X86_EFLAGS_FIXED (1u << 1) #define X86_CR4_VME (1ul << 0) @@ -53,7 +55,8 @@ #define CPUID_PKU (1ul << 3) #define CPUID_LA57 (1ul << 16) -#define UNEXPECTED_VECTOR_PORT 0xfff0u +/* CPUID.0x8000_0001.EDX */ +#define CPUID_GBPAGES (1ul << 26) /* General Registers in 64-Bit Mode */ struct gpr64_regs { @@ -309,37 +312,37 @@ static inline void set_xmm(int n, unsigned long val) } } -typedef unsigned long v1di __attribute__ ((vector_size (8))); +#define GET_XMM(__xmm) \ +({ \ + unsigned long __val; \ + asm volatile("movq %%"#__xmm", %0" : "=r"(__val)); \ + __val; \ +}) + static inline unsigned long get_xmm(int n) { assert(n >= 0 && n <= 7); - register v1di xmm0 __asm__("%xmm0"); - register v1di xmm1 __asm__("%xmm1"); - register v1di xmm2 __asm__("%xmm2"); - register v1di xmm3 __asm__("%xmm3"); - register v1di xmm4 __asm__("%xmm4"); - register v1di xmm5 __asm__("%xmm5"); - register v1di xmm6 __asm__("%xmm6"); - register v1di xmm7 __asm__("%xmm7"); switch (n) { case 0: - return (unsigned long)xmm0; + return GET_XMM(xmm0); case 1: - return (unsigned long)xmm1; + return GET_XMM(xmm1); case 2: - return (unsigned long)xmm2; + return GET_XMM(xmm2); case 3: - return (unsigned long)xmm3; + return GET_XMM(xmm3); case 4: - return (unsigned long)xmm4; + return GET_XMM(xmm4); case 5: - return (unsigned long)xmm5; + return GET_XMM(xmm5); case 6: - return (unsigned long)xmm6; + return GET_XMM(xmm6); case 7: - return (unsigned long)xmm7; + return GET_XMM(xmm7); } + + /* never reached */ return 0; } @@ -391,9 +394,13 @@ struct ex_regs { void vm_init_descriptor_tables(struct kvm_vm *vm); void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); -void vm_handle_exception(struct kvm_vm *vm, int vector, +void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr); +void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, + uint64_t pte); + /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. returns true @@ -410,6 +417,14 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +enum x86_page_size { + X86_PAGE_SIZE_4K = 0, + X86_PAGE_SIZE_2M, + X86_PAGE_SIZE_1G, +}; +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + enum x86_page_size page_size); + /* * Basic CPU control in CR0 */ @@ -425,53 +440,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui #define X86_CR0_CD (1UL<<30) /* Cache Disable */ #define X86_CR0_PG (1UL<<31) /* Paging */ -#define APIC_DEFAULT_GPA 0xfee00000ULL - -/* APIC base address MSR and fields */ -#define MSR_IA32_APICBASE 0x0000001b -#define MSR_IA32_APICBASE_BSP (1<<8) -#define MSR_IA32_APICBASE_EXTD (1<<10) -#define MSR_IA32_APICBASE_ENABLE (1<<11) -#define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define GET_APIC_BASE(x) (((x) >> 12) << 12) - -#define APIC_BASE_MSR 0x800 -#define X2APIC_ENABLE (1UL << 10) -#define APIC_ID 0x20 -#define APIC_LVR 0x30 -#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) -#define APIC_TASKPRI 0x80 -#define APIC_PROCPRI 0xA0 -#define APIC_EOI 0xB0 -#define APIC_SPIV 0xF0 -#define APIC_SPIV_FOCUS_DISABLED (1 << 9) -#define APIC_SPIV_APIC_ENABLED (1 << 8) -#define APIC_ICR 0x300 -#define APIC_DEST_SELF 0x40000 -#define APIC_DEST_ALLINC 0x80000 -#define APIC_DEST_ALLBUT 0xC0000 -#define APIC_ICR_RR_MASK 0x30000 -#define APIC_ICR_RR_INVALID 0x00000 -#define APIC_ICR_RR_INPROG 0x10000 -#define APIC_ICR_RR_VALID 0x20000 -#define APIC_INT_LEVELTRIG 0x08000 -#define APIC_INT_ASSERT 0x04000 -#define APIC_ICR_BUSY 0x01000 -#define APIC_DEST_LOGICAL 0x00800 -#define APIC_DEST_PHYSICAL 0x00000 -#define APIC_DM_FIXED 0x00000 -#define APIC_DM_FIXED_MASK 0x00700 -#define APIC_DM_LOWEST 0x00100 -#define APIC_DM_SMI 0x00200 -#define APIC_DM_REMRD 0x00300 -#define APIC_DM_NMI 0x00400 -#define APIC_DM_INIT 0x00500 -#define APIC_DM_STARTUP 0x00600 -#define APIC_DM_EXTINT 0x00700 -#define APIC_VECTOR_MASK 0x000FF -#define APIC_ICR2 0x310 -#define SET_APIC_DEST_FIELD(x) ((x) << 24) - /* VMX_EPT_VPID_CAP bits */ #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 65eb1079a161..583ceb0d1457 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -10,6 +10,7 @@ #include <stdint.h> #include "processor.h" +#include "apic.h" /* * Definitions of Primary Processor-Based VM-Execution Controls. @@ -607,15 +608,13 @@ bool nested_vmx_supported(void); void nested_vmx_check_supported(void); void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr, uint64_t size); void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot); + uint32_t memslot); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c new file mode 100644 index 000000000000..17f65d514915 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_binary_stats_test + * + * Copyright (C) 2021, Google LLC. + * + * Test the fd-based interface for KVM statistics. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" + +static void stats_test(int stats_fd) +{ + ssize_t ret; + int i; + size_t size_desc; + size_t size_data = 0; + struct kvm_stats_header *header; + char *id; + struct kvm_stats_desc *stats_desc; + u64 *stats_data; + struct kvm_stats_desc *pdesc; + + /* Read kvm stats header */ + header = malloc(sizeof(*header)); + TEST_ASSERT(header, "Allocate memory for stats header"); + + ret = read(stats_fd, header, sizeof(*header)); + TEST_ASSERT(ret == sizeof(*header), "Read stats header"); + size_desc = sizeof(*stats_desc) + header->name_size; + + /* Read kvm stats id string */ + id = malloc(header->name_size); + TEST_ASSERT(id, "Allocate memory for id string"); + ret = read(stats_fd, id, header->name_size); + TEST_ASSERT(ret == header->name_size, "Read id string"); + + /* Check id string, that should start with "kvm" */ + TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header->name_size, + "Invalid KVM stats type, id: %s", id); + + /* Sanity check for other fields in header */ + if (header->num_desc == 0) { + printf("No KVM stats defined!"); + return; + } + /* Check overlap */ + TEST_ASSERT(header->desc_offset > 0 && header->data_offset > 0 + && header->desc_offset >= sizeof(*header) + && header->data_offset >= sizeof(*header), + "Invalid offset fields in header"); + TEST_ASSERT(header->desc_offset > header->data_offset || + (header->desc_offset + size_desc * header->num_desc <= + header->data_offset), + "Descriptor block is overlapped with data block"); + + /* Allocate memory for stats descriptors */ + stats_desc = calloc(header->num_desc, size_desc); + TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); + /* Read kvm stats descriptors */ + ret = pread(stats_fd, stats_desc, + size_desc * header->num_desc, header->desc_offset); + TEST_ASSERT(ret == size_desc * header->num_desc, + "Read KVM stats descriptors"); + + /* Sanity check for fields in descriptors */ + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + /* Check type,unit,base boundaries */ + TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) + <= KVM_STATS_TYPE_MAX, "Unknown KVM stats type"); + TEST_ASSERT((pdesc->flags & KVM_STATS_UNIT_MASK) + <= KVM_STATS_UNIT_MAX, "Unknown KVM stats unit"); + TEST_ASSERT((pdesc->flags & KVM_STATS_BASE_MASK) + <= KVM_STATS_BASE_MAX, "Unknown KVM stats base"); + /* Check exponent for stats unit + * Exponent for counter should be greater than or equal to 0 + * Exponent for unit bytes should be greater than or equal to 0 + * Exponent for unit seconds should be less than or equal to 0 + * Exponent for unit clock cycles should be greater than or + * equal to 0 + */ + switch (pdesc->flags & KVM_STATS_UNIT_MASK) { + case KVM_STATS_UNIT_NONE: + case KVM_STATS_UNIT_BYTES: + case KVM_STATS_UNIT_CYCLES: + TEST_ASSERT(pdesc->exponent >= 0, + "Unsupported KVM stats unit"); + break; + case KVM_STATS_UNIT_SECONDS: + TEST_ASSERT(pdesc->exponent <= 0, + "Unsupported KVM stats unit"); + break; + } + /* Check name string */ + TEST_ASSERT(strlen(pdesc->name) < header->name_size, + "KVM stats name(%s) too long", pdesc->name); + /* Check size field, which should not be zero */ + TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", + pdesc->name); + /* Check bucket_size field */ + switch (pdesc->flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_LINEAR_HIST: + TEST_ASSERT(pdesc->bucket_size, + "Bucket size of Linear Histogram stats (%s) is zero", + pdesc->name); + break; + default: + TEST_ASSERT(!pdesc->bucket_size, + "Bucket size of stats (%s) is not zero", + pdesc->name); + } + size_data += pdesc->size * sizeof(*stats_data); + } + /* Check overlap */ + TEST_ASSERT(header->data_offset >= header->desc_offset + || header->data_offset + size_data <= header->desc_offset, + "Data block is overlapped with Descriptor block"); + /* Check validity of all stats data size */ + TEST_ASSERT(size_data >= header->num_desc * sizeof(*stats_data), + "Data size is not correct"); + /* Check stats offset */ + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + TEST_ASSERT(pdesc->offset < size_data, + "Invalid offset (%u) for stats: %s", + pdesc->offset, pdesc->name); + } + + /* Allocate memory for stats data */ + stats_data = malloc(size_data); + TEST_ASSERT(stats_data, "Allocate memory for stats data"); + /* Read kvm stats data as a bulk */ + ret = pread(stats_fd, stats_data, size_data, header->data_offset); + TEST_ASSERT(ret == size_data, "Read KVM stats data"); + /* Read kvm stats data one by one */ + size_data = 0; + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + ret = pread(stats_fd, stats_data, + pdesc->size * sizeof(*stats_data), + header->data_offset + size_data); + TEST_ASSERT(ret == pdesc->size * sizeof(*stats_data), + "Read data of KVM stats: %s", pdesc->name); + size_data += pdesc->size * sizeof(*stats_data); + } + + free(stats_data); + free(stats_desc); + free(id); + free(header); +} + + +static void vm_stats_test(struct kvm_vm *vm) +{ + int stats_fd; + + /* Get fd for VM stats */ + stats_fd = vm_get_stats_fd(vm); + TEST_ASSERT(stats_fd >= 0, "Get VM stats fd"); + + stats_test(stats_fd); + close(stats_fd); + TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); +} + +static void vcpu_stats_test(struct kvm_vm *vm, int vcpu_id) +{ + int stats_fd; + + /* Get fd for VCPU stats */ + stats_fd = vcpu_get_stats_fd(vm, vcpu_id); + TEST_ASSERT(stats_fd >= 0, "Get VCPU stats fd"); + + stats_test(stats_fd); + close(stats_fd); + TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); +} + +#define DEFAULT_NUM_VM 4 +#define DEFAULT_NUM_VCPU 4 + +/* + * Usage: kvm_bin_form_stats [#vm] [#vcpu] + * The first parameter #vm set the number of VMs being created. + * The second parameter #vcpu set the number of VCPUs being created. + * By default, DEFAULT_NUM_VM VM and DEFAULT_NUM_VCPU VCPU for the VM would be + * created for testing. + */ + +int main(int argc, char *argv[]) +{ + int i, j; + struct kvm_vm **vms; + int max_vm = DEFAULT_NUM_VM; + int max_vcpu = DEFAULT_NUM_VCPU; + + /* Get the number of VMs and VCPUs that would be created for testing. */ + if (argc > 1) { + max_vm = strtol(argv[1], NULL, 0); + if (max_vm <= 0) + max_vm = DEFAULT_NUM_VM; + } + if (argc > 2) { + max_vcpu = strtol(argv[2], NULL, 0); + if (max_vcpu <= 0) + max_vcpu = DEFAULT_NUM_VCPU; + } + + /* Check the extension for binary stats */ + if (kvm_check_cap(KVM_CAP_BINARY_STATS_FD) <= 0) { + print_skip("Binary form statistics interface is not supported"); + exit(KSFT_SKIP); + } + + /* Create VMs and VCPUs */ + vms = malloc(sizeof(vms[0]) * max_vm); + TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); + for (i = 0; i < max_vm; ++i) { + vms[i] = vm_create(VM_MODE_DEFAULT, + DEFAULT_GUEST_PHY_PAGES, O_RDWR); + for (j = 0; j < max_vcpu; ++j) + vm_vcpu_add(vms[i], j); + } + + /* Check stats read for every VM and VCPU */ + for (i = 0; i < max_vm; ++i) { + vm_stats_test(vms[i]); + for (j = 0; j < max_vcpu; ++j) + vcpu_stats_test(vms[i], j); + } + + for (i = 0; i < max_vm; ++i) + kvm_vm_free(vms[i]); + free(vms); + return 0; +} diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 82171f17c1d7..36407cb0ec85 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -303,7 +303,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) TEST_MEM_SLOT_INDEX, guest_num_pages, 0); /* Do mapping(GVA->GPA) for the testing memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); @@ -456,10 +456,7 @@ static void help(char *name) " (default: 1G)\n"); printf(" -v: specify the number of vCPUs to run\n" " (default: 1)\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n" - " (default: anonymous)\n\n"); - backing_src_help(); + backing_src_help("-s"); puts(""); } @@ -468,7 +465,7 @@ int main(int argc, char *argv[]) int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { .test_mem_size = DEFAULT_TEST_MEM_SIZE, - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, }; int opt; diff --git a/tools/testing/selftests/kvm/lib/aarch64/handlers.S b/tools/testing/selftests/kvm/lib/aarch64/handlers.S new file mode 100644 index 000000000000..0e443eadfac6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/handlers.S @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +.macro save_registers + add sp, sp, #-16 * 17 + + stp x0, x1, [sp, #16 * 0] + stp x2, x3, [sp, #16 * 1] + stp x4, x5, [sp, #16 * 2] + stp x6, x7, [sp, #16 * 3] + stp x8, x9, [sp, #16 * 4] + stp x10, x11, [sp, #16 * 5] + stp x12, x13, [sp, #16 * 6] + stp x14, x15, [sp, #16 * 7] + stp x16, x17, [sp, #16 * 8] + stp x18, x19, [sp, #16 * 9] + stp x20, x21, [sp, #16 * 10] + stp x22, x23, [sp, #16 * 11] + stp x24, x25, [sp, #16 * 12] + stp x26, x27, [sp, #16 * 13] + stp x28, x29, [sp, #16 * 14] + + /* + * This stores sp_el1 into ex_regs.sp so exception handlers can "look" + * at it. It will _not_ be used to restore the sp on return from the + * exception so handlers can not update it. + */ + add x1, sp, #16 * 17 + stp x30, x1, [sp, #16 * 15] /* x30, SP */ + + mrs x1, elr_el1 + mrs x2, spsr_el1 + stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ +.endm + +.macro restore_registers + ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ + msr elr_el1, x1 + msr spsr_el1, x2 + + /* sp is not restored */ + ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ + + ldp x28, x29, [sp, #16 * 14] + ldp x26, x27, [sp, #16 * 13] + ldp x24, x25, [sp, #16 * 12] + ldp x22, x23, [sp, #16 * 11] + ldp x20, x21, [sp, #16 * 10] + ldp x18, x19, [sp, #16 * 9] + ldp x16, x17, [sp, #16 * 8] + ldp x14, x15, [sp, #16 * 7] + ldp x12, x13, [sp, #16 * 6] + ldp x10, x11, [sp, #16 * 5] + ldp x8, x9, [sp, #16 * 4] + ldp x6, x7, [sp, #16 * 3] + ldp x4, x5, [sp, #16 * 2] + ldp x2, x3, [sp, #16 * 1] + ldp x0, x1, [sp, #16 * 0] + + add sp, sp, #16 * 17 + + eret +.endm + +.pushsection ".entry.text", "ax" +.balign 0x800 +.global vectors +vectors: +.popsection + +.set vector, 0 + +/* + * Build an exception handler for vector and append a jump to it into + * vectors (while making sure that it's 0x80 aligned). + */ +.macro HANDLER, label +handler_\label: + save_registers + mov x0, sp + mov x1, #vector + bl route_exception + restore_registers + +.pushsection ".entry.text", "ax" +.balign 0x80 + b handler_\label +.popsection + +.set vector, vector + 1 +.endm + +.macro HANDLER_INVALID +.pushsection ".entry.text", "ax" +.balign 0x80 +/* This will abort so no need to save and restore registers. */ + mov x0, #vector + mov x1, #0 /* ec */ + mov x2, #0 /* valid_ec */ + b kvm_exit_unexpected_exception +.popsection + +.set vector, vector + 1 +.endm + +/* + * Caution: be sure to not add anything between the declaration of vectors + * above and these macro calls that will build the vectors table below it. + */ + HANDLER_INVALID // Synchronous EL1t + HANDLER_INVALID // IRQ EL1t + HANDLER_INVALID // FIQ EL1t + HANDLER_INVALID // Error EL1t + + HANDLER el1h_sync // Synchronous EL1h + HANDLER el1h_irq // IRQ EL1h + HANDLER el1h_fiq // FIQ EL1h + HANDLER el1h_error // Error EL1h + + HANDLER el0_sync_64 // Synchronous 64-bit EL0 + HANDLER el0_irq_64 // IRQ 64-bit EL0 + HANDLER el0_fiq_64 // FIQ 64-bit EL0 + HANDLER el0_error_64 // Error 64-bit EL0 + + HANDLER el0_sync_32 // Synchronous 32-bit EL0 + HANDLER el0_irq_32 // IRQ 32-bit EL0 + HANDLER el0_fiq_32 // FIQ 32-bit EL0 + HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index cee92d477dc0..632b74d6b3ca 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -6,14 +6,16 @@ */ #include <linux/compiler.h> +#include <assert.h> #include "kvm_util.h" #include "../kvm_util_internal.h" #include "processor.h" -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 +static vm_vaddr_t exception_handlers; + static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { return (v + vm->page_size) & ~(vm->page_size - 1); @@ -72,19 +74,19 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) return 1 << (vm->page_shift - 3); } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { if (!vm->pgd_created) { vm_paddr_t paddr = vm_phy_pages_alloc(vm, page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); vm->pgd = paddr; vm->pgd_created = true; } } -void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot, uint64_t flags) +static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t flags) { uint8_t attr_idx = flags & 7; uint64_t *ptep; @@ -104,25 +106,19 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, paddr, vm->max_gfn, vm->page_size); ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; @@ -135,12 +131,11 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ - _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); + _virt_pg_map(vm, vaddr, paddr, attr_idx); } vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) @@ -302,7 +297,7 @@ void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, DEFAULT_STACK_PGS * vm->page_size : vm->page_size; uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); vm_vcpu_add(vm, vcpuid); aarch64_vcpu_setup(vm, vcpuid, init); @@ -334,6 +329,100 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } +void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) +{ + ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); + while (1) + ; +} + void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) { + struct ucall uc; + + if (get_ucall(vm, vcpuid, &uc) != UCALL_UNHANDLED) + return; + + if (uc.args[2]) /* valid_ec */ { + assert(VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } else { + assert(!VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx)", + uc.args[0]); + } +} + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; +}; + +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +{ + extern char vectors; + + set_reg(vm, vcpuid, ARM64_SYS_REG(VBAR_EL1), (uint64_t)&vectors); +} + +void route_exception(struct ex_regs *regs, int vector) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + bool valid_ec; + int ec = 0; + + switch (vector) { + case VECTOR_SYNC_CURRENT: + case VECTOR_SYNC_LOWER_64: + ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK; + valid_ec = true; + break; + case VECTOR_IRQ_CURRENT: + case VECTOR_IRQ_LOWER_64: + case VECTOR_FIQ_CURRENT: + case VECTOR_FIQ_LOWER_64: + case VECTOR_ERROR_CURRENT: + case VECTOR_ERROR_LOWER_64: + ec = 0; + valid_ec = false; + break; + default: + valid_ec = false; + goto unexpected_exception; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + kvm_exit_unexpected_exception(vector, ec, valid_ec); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + assert(ec < ESR_EC_NUM); + handlers->exception_handlers[vector][ec] = handler; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(!VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector][0] = handler; } diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 2f37b90ee1a9..e0b0164e9af8 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -14,7 +14,7 @@ static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) return false; - virt_pg_map(vm, gpa, gpa, 0); + virt_pg_map(vm, gpa, gpa); ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; sync_global_to_guest(vm, ucall_exit_mmio_addr); diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index bc75a91e00a6..eac44f5d0db0 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -111,8 +111,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) * by the image and it needs to have sufficient available physical pages, to * back the virtual pages used to load the image. */ -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot) +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) { off_t offset, offset_rv; Elf64_Ehdr hdr; @@ -164,8 +163,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; - vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, - data_memslot, pgd_memslot); + vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart); TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " "virtual memory for segment at requested min addr,\n" " segment idx: %u\n" diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 25bff307c71f..c330f414ef96 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -22,6 +22,22 @@ void guest_modes_append_default(void) } } #endif +#ifdef __s390x__ + { + int kvm_fd, vm_fd; + struct kvm_s390_vm_cpu_processor info; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0); + kvm_device_access(vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info, false); + close(vm_fd); + close(kvm_fd); + /* Starting with z13 we have 47bits of physical address */ + if (info.ibc >= 0x30) + guest_mode_append(VM_MODE_P47V64_4K, true, true); + } +#endif } void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a2b732cf96ea..10a8ed691c66 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -176,6 +176,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", + [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -194,6 +195,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { { 40, 48, 0x10000, 16 }, { 0, 0, 0x1000, 12 }, { 47, 64, 0x1000, 12 }, + { 44, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -282,6 +284,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) case VM_MODE_P47V64_4K: vm->pgtable_levels = 5; break; + case VM_MODE_P44V64_4K: + vm->pgtable_levels = 5; + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } @@ -365,7 +370,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, pages = vm_adjust_num_guest_pages(mode, pages); vm = vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); @@ -375,10 +380,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint32_t vcpuid = vcpuids ? vcpuids[i] : i; vm_vcpu_add_default(vm, vcpuid, guest_code); - -#ifdef __x86_64__ - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); -#endif } return vm; @@ -1251,15 +1252,13 @@ va_found: * a unique set of pages, with the minimum real allocation being at least * a page. */ -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot) +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) { uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); - virt_pgd_alloc(vm, pgd_memslot); + virt_pgd_alloc(vm); vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, - KVM_UTIL_MIN_PFN * vm->page_size, - data_memslot); + KVM_UTIL_MIN_PFN * vm->page_size, 0); /* * Find an unused range of virtual page addresses of at least @@ -1271,7 +1270,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, for (vm_vaddr_t vaddr = vaddr_start; pages > 0; pages--, vaddr += vm->page_size, paddr += vm->page_size) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); @@ -1281,6 +1280,44 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, } /* + * VM Virtual Address Allocate Pages + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least N system pages worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) +{ + return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); +} + +/* + * VM Virtual Address Allocate Page + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least one system page worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) +{ + return vm_vaddr_alloc_pages(vm, 1); +} + +/* * Map a range of VM virtual address to the VM's physical address * * Input Args: @@ -1298,7 +1335,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, * @npages starting at @vaddr to the page range starting at @paddr. */ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot) + unsigned int npages) { size_t page_size = vm->page_size; size_t size = npages * page_size; @@ -1307,7 +1344,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); vaddr += page_size; paddr += page_size; } @@ -2177,6 +2214,14 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); } +/* Arbitrary minimum physical address used for virtual translation tables. */ +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) +{ + return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); +} + /* * Address Guest Virtual to Host Virtual * @@ -2286,3 +2331,15 @@ unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); return vm_adjust_num_guest_pages(mode, n); } + +int vm_get_stats_fd(struct kvm_vm *vm) +{ + return ioctl(vm->fd, KVM_GET_STATS_FD, NULL); +} + +int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + return ioctl(vcpu->fd, KVM_GET_STATS_FD, NULL); +} diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 7397ca299835..0ef80dbdc116 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -50,11 +50,12 @@ static void guest_code(uint32_t vcpu_id) } struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src) { struct kvm_vm *vm; uint64_t guest_num_pages; + int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); @@ -68,6 +69,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, @@ -95,13 +99,19 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem, - PERF_TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = guest_test_phys_mem + + region_pages * perf_test_args.guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + PERF_TEST_MEM_SLOT_INDEX + i, + region_pages, 0); + } /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); ucall_init(vm, NULL); @@ -140,6 +150,8 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, vcpu_gpa = guest_test_phys_mem; } + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + (vcpu_args->pages * perf_test_args.guest_page_size)); diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 0152f356c099..f87c7137598e 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -9,11 +9,9 @@ #include "kvm_util.h" #include "../kvm_util_internal.h" -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - #define PAGES_PER_REGION 4 -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; @@ -24,7 +22,7 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); vm->pgd = paddr; @@ -36,12 +34,12 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) * a page table (ri == 4). Returns a suitable region/segment table entry * which points to the freshly allocated pages. */ -static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) { uint64_t taddr; taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); return (taddr & REGION_ENTRY_ORIGIN) @@ -49,8 +47,7 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, - uint32_t memslot) +void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) { int ri, idx; uint64_t *entry; @@ -77,7 +74,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) - entry[idx] = virt_alloc_region(vm, ri, memslot); + entry[idx] = virt_alloc_region(vm, ri); entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } @@ -170,7 +167,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) vm->page_size); stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); vm_vcpu_add(vm, vcpuid); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index af1031fed97f..b72429108993 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -11,6 +11,7 @@ #include <stdlib.h> #include <time.h> #include <sys/stat.h> +#include <sys/syscall.h> #include <linux/mman.h> #include "linux/kernel.h" @@ -129,13 +130,16 @@ size_t get_trans_hugepagesz(void) { size_t size; FILE *f; + int ret; TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r"); TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size"); - fscanf(f, "%ld", &size); + ret = fscanf(f, "%ld", &size); + ret = fscanf(f, "%ld", &size); + TEST_ASSERT(ret < 1, "Error reading transparent_hugepage/hpage_pmd_size"); fclose(f); return size; @@ -279,13 +283,22 @@ size_t get_backing_src_pagesz(uint32_t i) } } -void backing_src_help(void) +static void print_available_backing_src_types(const char *prefix) { int i; - printf("Available backing src types:\n"); + printf("%sAvailable backing src types:\n", prefix); + for (i = 0; i < NUM_SRC_TYPES; i++) - printf("\t%s\n", vm_mem_backing_src_alias(i)->name); + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); +} + +void backing_src_help(const char *flag) +{ + printf(" %s: specify the type of memory that should be used to\n" + " back the guest data region. (default: %s)\n", + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); + print_available_backing_src_types(" "); } enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) @@ -296,7 +309,23 @@ enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) return i; - backing_src_help(); + print_available_backing_src_types(""); TEST_FAIL("Unknown backing src type: %s", type_name); return -1; } + +long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + /* Return MIN_RUN_DELAY_NS upon failure just to be safe */ + if (fscanf(fp, "%ld %ld ", &val[0], &val[1]) < 2) + val[1] = MIN_RUN_DELAY_NS; + fclose(fp); + + return val[1]; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c new file mode 100644 index 000000000000..7168e25c194e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/apic.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/processor.c + * + * Copyright (C) 2021, Google LLC. + */ + +#include "apic.h" + +void apic_disable(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void xapic_enable(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + apic_disable(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. + */ + val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + xapic_write_reg(APIC_SPIV, val); +} + +void x2apic_enable(void) +{ + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + x2apic_write_reg(APIC_SPIV, + x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index efe235044421..28cb881f440d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -17,13 +17,10 @@ #define DEFAULT_CODE_SELECTOR 0x8 #define DEFAULT_DATA_SELECTOR 0x10 -/* Minimum physical address used for virtual translation tables. */ -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - vm_vaddr_t exception_handlers; /* Virtual translation table structure declarations */ -struct pageMapL4Entry { +struct pageUpperEntry { uint64_t present:1; uint64_t writable:1; uint64_t user:1; @@ -33,37 +30,7 @@ struct pageMapL4Entry { uint64_t ignored_06:1; uint64_t page_size:1; uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryPointerEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; + uint64_t pfn:40; uint64_t ignored_62_52:11; uint64_t execute_disable:1; }; @@ -79,7 +46,7 @@ struct pageTableEntry { uint64_t reserved_07:1; uint64_t global:1; uint64_t ignored_11_09:3; - uint64_t address:40; + uint64_t pfn:40; uint64_t ignored_62_52:11; uint64_t execute_disable:1; }; @@ -207,96 +174,211 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, } } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. */ if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - vm->pgd = paddr; + vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; } } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, + int level) +{ + uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift); + int index = vaddr >> (vm->page_shift + level * 9) & 0x1ffu; + + return &page_table[index]; +} + +static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t pt_pfn, + uint64_t vaddr, + uint64_t paddr, + int level, + enum x86_page_size page_size) +{ + struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level); + + if (!pte->present) { + pte->writable = true; + pte->present = true; + pte->page_size = (level == page_size); + if (pte->page_size) + pte->pfn = paddr >> vm->page_shift; + else + pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. + */ + TEST_ASSERT(level != page_size, + "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", + page_size, vaddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, vaddr: 0x%lx\n", + level, vaddr); + } + return pte; +} + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + enum x86_page_size page_size) +{ + const uint64_t pg_size = 1ull << ((page_size * 9) + 12); + struct pageUpperEntry *pml4e, *pdpe, *pde; + struct pageTableEntry *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % pg_size) == 0, + "Virtual address not aligned,\n" + "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % pg_size) == 0, + "Physical address not aligned,\n" + " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + /* + * Allocate upper level page tables, if not already present. Return + * early if a hugepage was created. + */ + pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, + vaddr, paddr, 3, page_size); + if (pml4e->page_size) + return; + + pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size); + if (pdpe->page_size) + return; + + pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size); + if (pde->page_size) + return; + + /* Fill in page table entry. */ + pte = virt_get_pte(vm, pde->pfn, vaddr, 0); + TEST_ASSERT(!pte->present, + "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); + pte->pfn = paddr >> vm->page_shift; + pte->writable = true; + pte->present = 1; +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K); +} + +static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, + uint64_t vaddr) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; + struct pageUpperEntry *pml4e, *pdpe, *pde; + struct pageTableEntry *pte; + struct kvm_cpuid_entry2 *entry; + struct kvm_sregs sregs; + int max_phy_addr; + /* Set the bottom 52 bits. */ + uint64_t rsvd_mask = 0x000fffffffffffff; + + entry = kvm_get_supported_cpuid_index(0x80000008, 0); + max_phy_addr = entry->eax & 0x000000ff; + /* Clear the bottom bits of the reserved mask. */ + rsvd_mask = (rsvd_mask >> max_phy_addr) << max_phy_addr; + + /* + * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries + * with 4-Level Paging and 5-Level Paging". + * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1, + * the XD flag (bit 63) is reserved. + */ + vcpu_sregs_get(vm, vcpuid, &sregs); + if ((sregs.efer & EFER_NX) == 0) { + rsvd_mask |= (1ull << 63); + } TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((vaddr % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", - vaddr, vm->page_size); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); + /* + * Based on the mode check above there are 48 bits in the vaddr, so + * shift 16 to sign extend the last bit (bit-47), + */ + TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), + "Canonical check failed. The virtual address is invalid."); index[0] = (vaddr >> 12) & 0x1ffu; index[1] = (vaddr >> 21) & 0x1ffu; index[2] = (vaddr >> 30) & 0x1ffu; index[3] = (vaddr >> 39) & 0x1ffu; - /* Allocate page directory pointer table if not present. */ pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].present = true; - } + TEST_ASSERT(pml4e[index[3]].present, + "Expected pml4e to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) & + (rsvd_mask | (1ull << 7))) == 0, + "Unexpected reserved bits set."); + + pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); + TEST_ASSERT(pdpe[index[2]].present, + "Expected pdpe to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(pdpe[index[2]].page_size == 0, + "Expected pdpe to map a pde not a 1-GByte page."); + TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); + TEST_ASSERT(pde[index[1]].present, + "Expected pde to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(pde[index[1]].page_size == 0, + "Expected pde to map a pte not a 2-MByte page."); + TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); + TEST_ASSERT(pte[index[0]].present, + "Expected pte to be present for gva: 0x%08lx", vaddr); + + return &pte[index[0]]; +} - /* Allocate page directory table if not present. */ - struct pageDirectoryPointerEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].present = true; - } +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) +{ + struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); - /* Allocate page table if not present. */ - struct pageDirectoryEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].present = true; - } + return *(uint64_t *)pte; +} - /* Fill in page table entry. */ - struct pageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].present = 1; +void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, + uint64_t pte) +{ + struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid, + vaddr); + + *(uint64_t *)new_pte = pte; } void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - struct pageMapL4Entry *pml4e, *pml4e_start; - struct pageDirectoryPointerEntry *pdpe, *pdpe_start; - struct pageDirectoryEntry *pde, *pde_start; + struct pageUpperEntry *pml4e, *pml4e_start; + struct pageUpperEntry *pdpe, *pdpe_start; + struct pageUpperEntry *pde, *pde_start; struct pageTableEntry *pte, *pte_start; if (!vm->pgd_created) @@ -307,8 +389,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, - vm->pgd); + pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; if (!pml4e->present) @@ -317,11 +398,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) " %u\n", indent, "", pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, + addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn, pml4e->writable, pml4e->execute_disable); - pdpe_start = addr_gpa2hva(vm, pml4e->address - * vm->page_size); + pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; if (!pdpe->present) @@ -331,11 +411,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - (uint64_t) pdpe->address, pdpe->writable, + (uint64_t) pdpe->pfn, pdpe->writable, pdpe->execute_disable); - pde_start = addr_gpa2hva(vm, - pdpe->address * vm->page_size); + pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; if (!pde->present) @@ -344,11 +423,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) "0x%-12lx 0x%-10lx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - (uint64_t) pde->address, pde->writable, + (uint64_t) pde->pfn, pde->writable, pde->execute_disable); - pte_start = addr_gpa2hva(vm, - pde->address * vm->page_size); + pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; if (!pte->present) @@ -359,7 +437,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) indent, "", pte - pte_start, pte, addr_hva2gpa(vm, pte), - (uint64_t) pte->address, + (uint64_t) pte->pfn, pte->writable, pte->execute_disable, pte->dirty, @@ -480,9 +558,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; - struct pageDirectoryPointerEntry *pdpe; - struct pageDirectoryEntry *pde; + struct pageUpperEntry *pml4e, *pdpe, *pde; struct pageTableEntry *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ -499,43 +575,39 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) if (!pml4e[index[3]].present) goto unmapped_gva; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); if (!pdpe[index[2]].present) goto unmapped_gva; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); if (!pde[index[1]].present) goto unmapped_gva; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); if (!pte[index[0]].present) goto unmapped_gva; - return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu); unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(EXIT_FAILURE); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, - int pgd_memslot) +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { if (!vm->gdt) - vm->gdt = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->gdt = vm_vaddr_alloc_page(vm); dt->base = vm->gdt; dt->limit = getpagesize(); } static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector, int gdt_memslot, - int pgd_memslot) + int selector) { if (!vm->tss) - vm->tss = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->tss = vm_vaddr_alloc_page(vm); memset(segp, 0, sizeof(*segp)); segp->base = vm->tss; @@ -546,7 +618,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +static void vcpu_setup(struct kvm_vm *vm, int vcpuid) { struct kvm_sregs sregs; @@ -555,7 +627,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.idt.limit = 0; - kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + kvm_setup_gdt(vm, &sregs.gdt); switch (vm->mode) { case VM_MODE_PXXV48_4K: @@ -567,7 +639,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); break; default: @@ -584,11 +656,11 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) struct kvm_regs regs; vm_vaddr_t stack_vaddr; stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); /* Create VCPU */ vm_vcpu_add(vm, vcpuid); - vcpu_setup(vm, vcpuid, 0, 0); + vcpu_setup(vm, vcpuid); /* Setup guest general purpose registers */ vcpu_regs_get(vm, vcpuid, ®s); @@ -600,6 +672,9 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) /* Setup the MP state */ mp_state.mp_state = 0; vcpu_set_mp_state(vm, vcpuid, &mp_state); + + /* Setup supported CPUIDs */ + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); } /* @@ -1201,7 +1276,7 @@ static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, void kvm_exit_unexpected_vector(uint32_t value) { - outl(UNEXPECTED_VECTOR_PORT, value); + ucall(UCALL_UNHANDLED, 1, value); } void route_exception(struct ex_regs *regs) @@ -1222,8 +1297,8 @@ void vm_init_descriptor_tables(struct kvm_vm *vm) extern void *idt_handlers; int i; - vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0); - vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0); + vm->idt = vm_vaddr_alloc_page(vm); + vm->handlers = vm_vaddr_alloc_page(vm); /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, @@ -1244,8 +1319,8 @@ void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } -void vm_handle_exception(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) { vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); @@ -1254,16 +1329,13 @@ void vm_handle_exception(struct kvm_vm *vm, int vector, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) { - if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO - && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT - && vcpu_state(vm, vcpuid)->io.size == 4) { - /* Grab pointer to io data */ - uint32_t *data = (void *)vcpu_state(vm, vcpuid) - + vcpu_state(vm, vcpuid)->io.data_offset; - - TEST_ASSERT(false, - "Unexpected vectored event in guest (vector:0x%x)", - *data); + struct ucall uc; + + if (get_ucall(vm, vcpuid, &uc) == UCALL_UNHANDLED) { + uint64_t vector = uc.args[0]; + + TEST_FAIL("Unexpected vectored event in guest (vector:0x%lx)", + vector); } } diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 827fe6028dd4..2ac98d70d02b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -30,17 +30,14 @@ u64 rflags; struct svm_test_data * vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) { - vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); - svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->vmcb = (void *)vm_vaddr_alloc_page(vm); svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); - svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->save_area = (void *)vm_vaddr_alloc_page(vm); svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 2448b30e8efa..d089d8b850b5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -77,50 +77,48 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) struct vmx_pages * vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) { - vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); /* Setup of a region of guest memory for the vmxon region. */ - vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); /* Setup of a region of guest memory for a vmcs. */ - vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); /* Setup of a region of guest memory for the MSR bitmap. */ - vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr = (void *)vm_vaddr_alloc_page(vm); vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); memset(vmx->msr_hva, 0, getpagesize()); /* Setup of a region of guest memory for the shadow VMCS. */ - vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ - vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread = (void *)vm_vaddr_alloc_page(vm); vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); memset(vmx->vmread_hva, 0, getpagesize()); - vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); /* Setup of a region of guest memory for the VP Assist page. */ - vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->vp_assist = (void *)vm_vaddr_alloc_page(vm); vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); /* Setup of a region of guest memory for the enlightened VMCS. */ - vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); vmx->enlightened_vmcs_gpa = @@ -395,7 +393,7 @@ void nested_vmx_check_supported(void) } void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) + uint64_t nested_paddr, uint64_t paddr) { uint16_t index[4]; struct eptPageTableEntry *pml4e; @@ -428,9 +426,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, /* Allocate page directory pointer table if not present. */ pml4e = vmx->eptp_hva; if (!pml4e[index[3]].readable) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pml4e[index[3]].address = vm_alloc_page_table(vm) >> vm->page_shift; pml4e[index[3]].writable = true; pml4e[index[3]].readable = true; pml4e[index[3]].executable = true; @@ -440,9 +436,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pdpe; pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); if (!pdpe[index[2]].readable) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pdpe[index[2]].address = vm_alloc_page_table(vm) >> vm->page_shift; pdpe[index[2]].writable = true; pdpe[index[2]].readable = true; pdpe[index[2]].executable = true; @@ -452,9 +446,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pde; pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); if (!pde[index[1]].readable) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pde[index[1]].address = vm_alloc_page_table(vm) >> vm->page_shift; pde[index[1]].writable = true; pde[index[1]].readable = true; pde[index[1]].executable = true; @@ -494,8 +486,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * page range starting at nested_paddr to the page range starting at paddr. */ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot) + uint64_t nested_paddr, uint64_t paddr, uint64_t size) { size_t page_size = vm->page_size; size_t npages = size / page_size; @@ -504,7 +495,7 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + nested_pg_map(vmx, vm, nested_paddr, paddr); nested_paddr += page_size; paddr += page_size; } @@ -514,7 +505,7 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, * physical pages in VM. */ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot) + uint32_t memslot) { sparsebit_idx_t i, last; struct userspace_mem_region *region = @@ -530,24 +521,21 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, nested_map(vmx, vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, - 1 << vm->page_shift, - eptp_memslot); + 1 << vm->page_shift); } } void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); } -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) { - vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 98351ba0933c..4cfcafea9f5a 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -45,7 +45,6 @@ static void *vcpu_worker(void *data) struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); /* Let the guest access its memory until a stop signal is received */ @@ -105,7 +104,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int vcpu_id; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 11239652d805..d6e381e01db7 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -306,7 +306,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, guest_addr += npages * 4096; } - virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0); + virt_map(data->vm, MEM_GPA, MEM_GPA, mempages); sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); atomic_init(&sync->start_flag, false); diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c new file mode 100644 index 000000000000..4158da0da2bb --- /dev/null +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <syscall.h> +#include <sys/ioctl.h> +#include <sys/sysinfo.h> +#include <asm/barrier.h> +#include <linux/atomic.h> +#include <linux/rseq.h> +#include <linux/unistd.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define VCPU_ID 0 + +static __thread volatile struct rseq __rseq = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, +}; + +/* + * Use an arbitrary, bogus signature for configuring rseq, this test does not + * actually enter an rseq critical section. + */ +#define RSEQ_SIG 0xdeadbeef + +/* + * Any bug related to task migration is likely to be timing-dependent; perform + * a large number of migrations to reduce the odds of a false negative. + */ +#define NR_TASK_MIGRATIONS 100000 + +static pthread_t migration_thread; +static cpu_set_t possible_mask; +static int min_cpu, max_cpu; +static bool done; + +static atomic_t seq_cnt; + +static void guest_code(void) +{ + for (;;) + GUEST_SYNC(0); +} + +static void sys_rseq(int flags) +{ + int r; + + r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG); + TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno)); +} + +static int next_cpu(int cpu) +{ + /* + * Advance to the next CPU, skipping those that weren't in the original + * affinity set. Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's + * data storage is considered as opaque. Note, if this task is pinned + * to a small set of discontigous CPUs, e.g. 2 and 1023, this loop will + * burn a lot cycles and the test will take longer than normal to + * complete. + */ + do { + cpu++; + if (cpu > max_cpu) { + cpu = min_cpu; + TEST_ASSERT(CPU_ISSET(cpu, &possible_mask), + "Min CPU = %d must always be usable", cpu); + break; + } + } while (!CPU_ISSET(cpu, &possible_mask)); + + return cpu; +} + +static void *migration_worker(void *ign) +{ + cpu_set_t allowed_mask; + int r, i, cpu; + + CPU_ZERO(&allowed_mask); + + for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) { + CPU_SET(cpu, &allowed_mask); + + /* + * Bump the sequence count twice to allow the reader to detect + * that a migration may have occurred in between rseq and sched + * CPU ID reads. An odd sequence count indicates a migration + * is in-progress, while a completely different count indicates + * a migration occurred since the count was last read. + */ + atomic_inc(&seq_cnt); + + /* + * Ensure the odd count is visible while sched_getcpu() isn't + * stable, i.e. while changing affinity is in-progress. + */ + smp_wmb(); + r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask); + TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)", + errno, strerror(errno)); + smp_wmb(); + atomic_inc(&seq_cnt); + + CPU_CLR(cpu, &allowed_mask); + + /* + * Wait 1-10us before proceeding to the next iteration and more + * specifically, before bumping seq_cnt again. A delay is + * needed on three fronts: + * + * 1. To allow sched_setaffinity() to prompt migration before + * ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME + * (or TIF_NEED_RESCHED, which indirectly leads to handling + * NOTIFY_RESUME) is handled in KVM context. + * + * If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters + * the guest, the guest will trigger a IO/MMIO exit all the + * way to userspace and the TIF flags will be handled by + * the generic "exit to userspace" logic, not by KVM. The + * exit to userspace is necessary to give the test a chance + * to check the rseq CPU ID (see #2). + * + * Alternatively, guest_code() could include an instruction + * to trigger an exit that is handled by KVM, but any such + * exit requires architecture specific code. + * + * 2. To let ioctl(KVM_RUN) make its way back to the test + * before the next round of migration. The test's check on + * the rseq CPU ID must wait for migration to complete in + * order to avoid false positive, thus any kernel rseq bug + * will be missed if the next migration starts before the + * check completes. + * + * 3. To ensure the read-side makes efficient forward progress, + * e.g. if sched_getcpu() involves a syscall. Stalling the + * read-side means the test will spend more time waiting for + * sched_getcpu() to stabilize and less time trying to hit + * the timing-dependent bug. + * + * Because any bug in this area is likely to be timing-dependent, + * run with a range of delays at 1us intervals from 1us to 10us + * as a best effort to avoid tuning the test to the point where + * it can hit _only_ the original bug and not detect future + * regressions. + * + * The original bug can reproduce with a delay up to ~500us on + * x86-64, but starts to require more iterations to reproduce + * as the delay creeps above ~10us, and the average runtime of + * each iteration obviously increases as well. Cap the delay + * at 10us to keep test runtime reasonable while minimizing + * potential coverage loss. + * + * The lower bound for reproducing the bug is likely below 1us, + * e.g. failures occur on x86-64 with nanosleep(0), but at that + * point the overhead of the syscall likely dominates the delay. + * Use usleep() for simplicity and to avoid unnecessary kernel + * dependencies. + */ + usleep((i % 10) + 1); + } + done = true; + return NULL; +} + +static int calc_min_max_cpu(void) +{ + int i, cnt, nproc; + + if (CPU_COUNT(&possible_mask) < 2) + return -EINVAL; + + /* + * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that + * this task is affined to in order to reduce the time spent querying + * unusable CPUs, e.g. if this task is pinned to a small percentage of + * total CPUs. + */ + nproc = get_nprocs_conf(); + min_cpu = -1; + max_cpu = -1; + cnt = 0; + + for (i = 0; i < nproc; i++) { + if (!CPU_ISSET(i, &possible_mask)) + continue; + if (min_cpu == -1) + min_cpu = i; + max_cpu = i; + cnt++; + } + + return (cnt < 2) ? -EINVAL : 0; +} + +int main(int argc, char *argv[]) +{ + int r, i, snapshot; + struct kvm_vm *vm; + u32 cpu, rseq_cpu; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); + TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, + strerror(errno)); + + if (calc_min_max_cpu()) { + print_skip("Only one usable CPU, task migration not possible"); + exit(KSFT_SKIP); + } + + sys_rseq(0); + + /* + * Create and run a dummy VM that immediately exits to userspace via + * GUEST_SYNC, while concurrently migrating the process by setting its + * CPU affinity. + */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + ucall_init(vm, NULL); + + pthread_create(&migration_thread, NULL, migration_worker, 0); + + for (i = 0; !done; i++) { + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, + "Guest failed?"); + + /* + * Verify rseq's CPU matches sched's CPU. Ensure migration + * doesn't occur between sched_getcpu() and reading the rseq + * cpu_id by rereading both if the sequence count changes, or + * if the count is odd (migration in-progress). + */ + do { + /* + * Drop bit 0 to force a mismatch if the count is odd, + * i.e. if a migration is in-progress. + */ + snapshot = atomic_read(&seq_cnt) & ~1; + + /* + * Ensure reading sched_getcpu() and rseq.cpu_id + * complete in a single "no migration" window, i.e. are + * not reordered across the seq_cnt reads. + */ + smp_rmb(); + cpu = sched_getcpu(); + rseq_cpu = READ_ONCE(__rseq.cpu_id); + smp_rmb(); + } while (snapshot != atomic_read(&seq_cnt)); + + TEST_ASSERT(rseq_cpu == cpu, + "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu); + } + + /* + * Sanity check that the test was able to enter the guest a reasonable + * number of times, e.g. didn't get stalled too often/long waiting for + * sched_getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a + * fairly conservative ratio on x86-64, which can do _more_ KVM_RUNs + * than migrations given the 1us+ delay in the migration task. + */ + TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n", i); + + pthread_join(migration_thread, NULL); + + kvm_vm_free(vm); + + sys_rseq(RSEQ_FLAG_UNREGISTER); + + return 0; +} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index d8812f27648c..72a1c9b4882c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -132,7 +132,7 @@ static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); - virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2); /* Ditto for the host mapping so that both pages can be zeroed. */ hva = addr_gpa2hva(vm, MEM_REGION_GPA); @@ -377,7 +377,8 @@ static void test_add_max_memory_regions(void) (max_mem_slots - 1), MEM_REGION_SIZE >> 10); mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index fcc840088c91..62f2eb9ee3d5 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -10,7 +10,6 @@ #include <sched.h> #include <pthread.h> #include <linux/kernel.h> -#include <sys/syscall.h> #include <asm/kvm.h> #include <asm/kvm_para.h> @@ -20,7 +19,6 @@ #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) -#define MIN_RUN_DELAY_NS 200000UL static void *st_gva[NR_VCPUS]; static uint64_t guest_stolen_time[NR_VCPUS]; @@ -73,8 +71,6 @@ static void steal_time_init(struct kvm_vm *vm) for (i = 0; i < NR_VCPUS; ++i) { int ret; - vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid()); - /* ST_GPA_BASE is identity mapped */ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vm, st_gva[i]); @@ -120,12 +116,12 @@ struct st_time { uint64_t st_time; }; -static int64_t smccc(uint32_t func, uint32_t arg) +static int64_t smccc(uint32_t func, uint64_t arg) { unsigned long ret; asm volatile( - "mov x0, %1\n" + "mov w0, %w1\n" "mov x1, %2\n" "hvc #0\n" "mov %0, x0\n" @@ -219,20 +215,6 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) #endif -static long get_run_delay(void) -{ - char path[64]; - long val[2]; - FILE *fp; - - sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); - fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); - fclose(fp); - - return val[1]; -} - static void *do_steal_time(void *arg) { struct timespec ts, stop; @@ -295,7 +277,7 @@ int main(int ac, char **av) vm = vm_create_default(0, 0, guest_code); gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); ucall_init(vm, NULL); /* Add the rest of the VCPUs */ @@ -322,7 +304,7 @@ int main(int ac, char **av) run_delay = get_run_delay(); pthread_create(&thread, &attr, do_steal_time, NULL); do - pthread_yield(); + sched_yield(); while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS); pthread_join(thread, NULL); run_delay = get_run_delay() - run_delay; diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 6097a8283377..5f078db1bcba 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -8,12 +8,15 @@ #include <string.h> #include "kvm_util.h" #include "processor.h" +#include "apic.h" #define VCPU_ID 0 #define DR6_BD (1 << 13) #define DR7_GD (1 << 13) +#define IRQ_VECTOR 0xAA + /* For testing data access debug BP */ uint32_t guest_value; @@ -21,6 +24,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; static void guest_code(void) { + /* Create a pending interrupt on current vCPU */ + x2apic_enable(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | + APIC_DM_FIXED | IRQ_VECTOR); + /* * Software BP tests. * @@ -38,12 +46,19 @@ static void guest_code(void) "mov %%rax,%0;\n\t write_data:" : "=m" (guest_value) : : "rax"); - /* Single step test, covers 2 basic instructions and 2 emulated */ + /* + * Single step test, covers 2 basic instructions and 2 emulated + * + * Enable interrupts during the single stepping to see that + * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ + */ asm volatile("ss_start: " + "sti\n\t" "xor %%eax,%%eax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" + "cli\n\t" : : : "eax", "ebx", "ecx", "edx"); /* DR6.BD test */ @@ -72,11 +87,13 @@ int main(void) uint64_t cmd; int i; /* Instruction lengths starting at ss_start */ - int ss_size[4] = { + int ss_size[6] = { + 1, /* sti*/ 2, /* xor */ 2, /* cpuid */ 5, /* mov */ 2, /* rdmsr */ + 1, /* cli */ }; if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { @@ -154,7 +171,8 @@ int main(void) for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; CLEAR_DEBUG(); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | + KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; APPLY_DEBUG(); vcpu_run(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c new file mode 100644 index 000000000000..f070ff0224fa --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM_CAP_EXIT_ON_EMULATION_FAILURE capability. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define VCPU_ID 1 +#define PAGE_SIZE 4096 +#define MAXPHYADDR 36 + +#define MEM_REGION_GVA 0x0000123456789000 +#define MEM_REGION_GPA 0x0000000700000000 +#define MEM_REGION_SLOT 10 +#define MEM_REGION_SIZE PAGE_SIZE + +static void guest_code(void) +{ + __asm__ __volatile__("flds (%[addr])" + :: [addr]"r"(MEM_REGION_GVA)); + + GUEST_DONE(); +} + +static void run_guest(struct kvm_vm *vm) +{ + int rc; + + rc = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); +} + +/* + * Accessors to get R/M, REG, and Mod bits described in the SDM vol 2, + * figure 2-2 "Table Interpretation of ModR/M Byte (C8H)". + */ +#define GET_RM(insn_byte) (insn_byte & 0x7) +#define GET_REG(insn_byte) ((insn_byte & 0x38) >> 3) +#define GET_MOD(insn_byte) ((insn_byte & 0xc) >> 6) + +/* Ensure we are dealing with a simple 2-byte flds instruction. */ +static bool is_flds(uint8_t *insn_bytes, uint8_t insn_size) +{ + return insn_size >= 2 && + insn_bytes[0] == 0xd9 && + GET_REG(insn_bytes[1]) == 0x0 && + GET_MOD(insn_bytes[1]) == 0x0 && + /* Ensure there is no SIB byte. */ + GET_RM(insn_bytes[1]) != 0x4 && + /* Ensure there is no displacement byte. */ + GET_RM(insn_bytes[1]) != 0x5; +} + +static void process_exit_on_emulation_error(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_regs regs; + uint8_t *insn_bytes; + uint8_t insn_size; + uint64_t flags; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Unexpected suberror: %u", + run->emulation_failure.suberror); + + if (run->emulation_failure.ndata >= 1) { + flags = run->emulation_failure.flags; + if ((flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES) && + run->emulation_failure.ndata >= 3) { + insn_size = run->emulation_failure.insn_size; + insn_bytes = run->emulation_failure.insn_bytes; + + TEST_ASSERT(insn_size <= 15 && insn_size > 0, + "Unexpected instruction size: %u", + insn_size); + + TEST_ASSERT(is_flds(insn_bytes, insn_size), + "Unexpected instruction. Expected 'flds' (0xd9 /0)"); + + /* + * If is_flds() succeeded then the instruction bytes + * contained an flds instruction that is 2-bytes in + * length (ie: no prefix, no SIB, no displacement). + */ + vcpu_regs_get(vm, VCPU_ID, ®s); + regs.rip += 2; + vcpu_regs_set(vm, VCPU_ID, ®s); + } + } +} + +static void do_guest_assert(struct kvm_vm *vm, struct ucall *uc) +{ + TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], __FILE__, + uc->args[1]); +} + +static void check_for_guest_assert(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + if (run->exit_reason == KVM_EXIT_IO && + get_ucall(vm, VCPU_ID, &uc) == UCALL_ABORT) { + do_guest_assert(vm, &uc); + } +} + +static void process_ucall_done(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + check_for_guest_assert(vm); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + TEST_ASSERT(get_ucall(vm, VCPU_ID, &uc) == UCALL_DONE, + "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", + uc.cmd, UCALL_DONE); +} + +static uint64_t process_ucall(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + do_guest_assert(vm, &uc); + break; + case UCALL_DONE: + process_ucall_done(vm); + break; + default: + TEST_ASSERT(false, "Unexpected ucall"); + } + + return uc.cmd; +} + +int main(int argc, char *argv[]) +{ + struct kvm_enable_cap emul_failure_cap = { + .cap = KVM_CAP_EXIT_ON_EMULATION_FAILURE, + .args[0] = 1, + }; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + struct kvm_vm *vm; + uint64_t gpa, pte; + uint64_t *hva; + int rc; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + if (!kvm_check_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { + printf("module parameter 'allow_smaller_maxphyaddr' is not set. Skipping test.\n"); + return 0; + } + + cpuid = kvm_get_supported_cpuid(); + + entry = kvm_get_supported_cpuid_index(0x80000008, 0); + entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; + set_cpuid(cpuid, entry); + + vcpu_set_cpuid(vm, VCPU_ID, cpuid); + + rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); + TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); + vm_enable_cap(vm, &emul_failure_cap); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / PAGE_SIZE, 0); + gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, + MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, PAGE_SIZE); + pte = vm_get_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA); + vm_set_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA, pte | (1ull << 36)); + + run_guest(vm); + process_exit_on_emulation_error(vm); + run_guest(vm); + + TEST_ASSERT(process_ucall(vm) == UCALL_DONE, "Expected UCALL_DONE"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 63096cea26c6..2b46dcca86a8 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -22,15 +22,6 @@ static int ud_count; -void enable_x2apic(void) -{ - uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4); - - wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | - MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); - wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED); -} - static void guest_ud_handler(struct ex_regs *regs) { ud_count++; @@ -59,7 +50,7 @@ void guest_code(struct vmx_pages *vmx_pages) #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - enable_x2apic(); + x2apic_enable(); GUEST_SYNC(1); GUEST_SYNC(2); @@ -121,14 +112,38 @@ void inject_nmi(struct kvm_vm *vm) vcpu_events_set(vm, VCPU_ID, &events); } +static void save_restore_vm(struct kvm_vm *vm) +{ + struct kvm_regs regs1, regs2; + struct kvm_x86_state *state; + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, VCPU_ID); + vcpu_enable_evmcs(vm, VCPU_ID); + vcpu_load_state(vm, VCPU_ID, state); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); +} + int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0; - struct kvm_regs regs1, regs2; struct kvm_vm *vm; struct kvm_run *run; - struct kvm_x86_state *state; struct ucall uc; int stage; @@ -145,21 +160,18 @@ int main(int argc, char *argv[]) vcpu_set_hv_cpuid(vm, VCPU_ID); vcpu_enable_evmcs(vm, VCPU_ID); - run = vcpu_state(vm, VCPU_ID); - - vcpu_regs_get(vm, VCPU_ID, ®s1); - vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); - vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); pr_info("Running L1 which uses EVMCS to run L2\n"); for (stage = 1;; stage++) { + run = vcpu_state(vm, VCPU_ID); _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", @@ -184,32 +196,23 @@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - state = vcpu_save_state(vm, VCPU_ID); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_hv_cpuid(vm, VCPU_ID); - vcpu_enable_evmcs(vm, VCPU_ID); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); + save_restore_vm(vm); /* Force immediate L2->L1 exit before resuming */ if (stage == 8) { pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); inject_nmi(vm); } + + /* + * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly + * restored VM (before the first KVM_RUN) to check that + * KVM_STATE_NESTED_EVMCS is not lost. + */ + if (stage == 9) { + pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); + save_restore_vm(vm); + } } done: diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c index 8c77537af5a1..a711f83749ea 100644 --- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c @@ -145,8 +145,7 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) { int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); - vm_vaddr_t gva = vm_vaddr_alloc(vm, size, - getpagesize(), 0, 0); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); memcpy(guest_cpuids, cpuid, size); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 7f1d2765572c..e0b2bb1339b1 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -7,6 +7,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "hyperv.h" struct ms_hyperv_tsc_page { volatile u32 tsc_sequence; @@ -15,13 +16,6 @@ struct ms_hyperv_tsc_page { volatile s64 tsc_offset; } __packed; -#define HV_X64_MSR_GUEST_OS_ID 0x40000000 -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 -#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 - /* Simplified mul_u64_u64_shr() */ static inline u64 mul_u64_u64_shr64(u64 a, u64 b) { @@ -220,8 +214,8 @@ int main(void) vcpu_set_hv_cpuid(vm, VCPU_ID); - tsc_page_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); - memset(addr_gpa2hva(vm, tsc_page_gva), 0x0, getpagesize()); + tsc_page_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, "TSC page has to be page aligned\n"); vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c new file mode 100644 index 000000000000..91d88aaa9899 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -0,0 +1,684 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. + * + * Tests for Hyper-V features enablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +#define VCPU_ID 0 +#define LINUX_OS_ID ((u64)0x8100 << 48) + +extern unsigned char rdmsr_start; +extern unsigned char rdmsr_end; + +static u64 do_rdmsr(u32 idx) +{ + u32 lo, hi; + + asm volatile("rdmsr_start: rdmsr;" + "rdmsr_end:" + : "=a"(lo), "=c"(hi) + : "c"(idx)); + + return (((u64) hi) << 32) | lo; +} + +extern unsigned char wrmsr_start; +extern unsigned char wrmsr_end; + +static void do_wrmsr(u32 idx, u64 val) +{ + u32 lo, hi; + + lo = val; + hi = val >> 32; + + asm volatile("wrmsr_start: wrmsr;" + "wrmsr_end:" + : : "a"(lo), "c"(idx), "d"(hi)); +} + +static int nr_gp; +static int nr_ud; + +static inline u64 hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address) +{ + u64 hv_status; + + asm volatile("mov %3, %%r8\n" + "vmcall" + : "=a" (hv_status), + "+c" (control), "+d" (input_address) + : "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + + return hv_status; +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + unsigned char *rip = (unsigned char *)regs->rip; + bool r, w; + + r = rip == &rdmsr_start; + w = rip == &wrmsr_start; + GUEST_ASSERT(r || w); + + nr_gp++; + + if (r) + regs->rip = (uint64_t)&rdmsr_end; + else + regs->rip = (uint64_t)&wrmsr_end; +} + +static void guest_ud_handler(struct ex_regs *regs) +{ + nr_ud++; + regs->rip += 3; +} + +struct msr_data { + uint32_t idx; + bool available; + bool write; + u64 write_val; +}; + +struct hcall_data { + uint64_t control; + uint64_t expect; + bool ud_expected; +}; + +static void guest_msr(struct msr_data *msr) +{ + int i = 0; + + while (msr->idx) { + WRITE_ONCE(nr_gp, 0); + if (!msr->write) + do_rdmsr(msr->idx); + else + do_wrmsr(msr->idx, msr->write_val); + + if (msr->available) + GUEST_ASSERT(READ_ONCE(nr_gp) == 0); + else + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + + GUEST_SYNC(i++); + } + + GUEST_DONE(); +} + +static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) +{ + int i = 0; + u64 res, input, output; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + + while (hcall->control) { + nr_ud = 0; + if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { + input = pgs_gpa; + output = pgs_gpa + 4096; + } else { + input = output = 0; + } + + res = hypercall(hcall->control, input, output); + if (hcall->ud_expected) + GUEST_ASSERT(nr_ud == 1); + else + GUEST_ASSERT(res == hcall->expect); + + GUEST_SYNC(i++); + } + + GUEST_DONE(); +} + +static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *feat, + struct kvm_cpuid_entry2 *recomm, + struct kvm_cpuid_entry2 *dbg) +{ + TEST_ASSERT(set_cpuid(cpuid, feat), + "failed to set KVM_CPUID_FEATURES leaf"); + TEST_ASSERT(set_cpuid(cpuid, recomm), + "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); + TEST_ASSERT(set_cpuid(cpuid, dbg), + "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); + vcpu_set_cpuid(vm, VCPU_ID, cpuid); +} + +static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, + struct kvm_cpuid2 *best) +{ + struct kvm_run *run; + struct ucall uc; + int stage = 0, r; + struct kvm_cpuid_entry2 feat = { + .function = HYPERV_CPUID_FEATURES + }; + struct kvm_cpuid_entry2 recomm = { + .function = HYPERV_CPUID_ENLIGHTMENT_INFO + }; + struct kvm_cpuid_entry2 dbg = { + .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES + }; + struct kvm_enable_cap cap = {0}; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + switch (stage) { + case 0: + /* + * Only available when Hyper-V identification is set + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 0; + msr->available = 0; + break; + case 1: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = 0; + msr->available = 0; + break; + case 2: + feat.eax |= HV_MSR_HYPERCALL_AVAILABLE; + /* + * HV_X64_MSR_GUEST_OS_ID has to be written first to make + * HV_X64_MSR_HYPERCALL available. + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 1; + msr->write_val = LINUX_OS_ID; + msr->available = 1; + break; + case 3: + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 0; + msr->available = 1; + break; + case 4: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = 0; + msr->available = 1; + break; + + case 5: + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = 0; + msr->available = 0; + break; + case 6: + feat.eax |= HV_MSR_VP_RUNTIME_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 7: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 8: + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = 0; + msr->available = 0; + break; + case 9: + feat.eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 10: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 11: + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = 0; + msr->available = 0; + break; + case 12: + feat.eax |= HV_MSR_VP_INDEX_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 13: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 14: + msr->idx = HV_X64_MSR_RESET; + msr->write = 0; + msr->available = 0; + break; + case 15: + feat.eax |= HV_MSR_RESET_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 16: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 17: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = 0; + msr->available = 0; + break; + case 18: + feat.eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 19: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 20: + msr->idx = HV_X64_MSR_EOM; + msr->write = 0; + msr->available = 0; + break; + case 21: + /* + * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 + * capability enabled and guest visible CPUID bit unset. + */ + cap.cap = KVM_CAP_HYPERV_SYNIC2; + vcpu_enable_cap(vm, VCPU_ID, &cap); + break; + case 22: + feat.eax |= HV_MSR_SYNIC_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 23: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 24: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = 0; + msr->available = 0; + break; + case 25: + feat.eax |= HV_MSR_SYNTIMER_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 26: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + case 27: + /* Direct mode test */ + msr->write = 1; + msr->write_val = 1 << 12; + msr->available = 0; + break; + case 28: + feat.edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + msr->available = 1; + break; + + case 29: + msr->idx = HV_X64_MSR_EOI; + msr->write = 0; + msr->available = 0; + break; + case 30: + feat.eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + + case 31: + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = 0; + msr->available = 0; + break; + case 32: + feat.eax |= HV_ACCESS_FREQUENCY_MSRS; + msr->write = 0; + msr->available = 1; + break; + case 33: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 34: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = 0; + msr->available = 0; + break; + case 35: + feat.eax |= HV_ACCESS_REENLIGHTENMENT; + msr->write = 0; + msr->available = 1; + break; + case 36: + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + case 37: + /* Can only write '0' */ + msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS; + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 38: + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = 0; + msr->available = 0; + break; + case 39: + feat.edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 40: + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + + case 41: + msr->idx = HV_X64_MSR_SYNDBG_STATUS; + msr->write = 0; + msr->available = 0; + break; + case 42: + feat.edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + msr->write = 0; + msr->available = 1; + break; + case 43: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 44: + /* END */ + msr->idx = 0; + break; + } + + hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + + if (msr->idx) + pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, + msr->idx, msr->write ? "write" : "read"); + else + pr_debug("Stage %d: finish\n", stage); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + return; + case UCALL_DONE: + return; + } + + stage++; + } +} + +static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall, + void *input, void *output, struct kvm_cpuid2 *best) +{ + struct kvm_run *run; + struct ucall uc; + int stage = 0, r; + struct kvm_cpuid_entry2 feat = { + .function = HYPERV_CPUID_FEATURES, + .eax = HV_MSR_HYPERCALL_AVAILABLE + }; + struct kvm_cpuid_entry2 recomm = { + .function = HYPERV_CPUID_ENLIGHTMENT_INFO + }; + struct kvm_cpuid_entry2 dbg = { + .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES + }; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + switch (stage) { + case 0: + hcall->control = 0xdeadbeef; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + + case 1: + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 2: + feat.ebx |= HV_POST_MESSAGES; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 3: + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 4: + feat.ebx |= HV_SIGNAL_EVENTS; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 5: + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + case 6: + dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 7: + feat.ebx |= HV_DEBUGGING; + hcall->expect = HV_STATUS_OPERATION_DENIED; + break; + + case 8: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 9: + recomm.eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 10: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 11: + recomm.eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 12: + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 13: + recomm.eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + case 14: + /* Nothing in 'sparse banks' -> success */ + hcall->control = HVCALL_SEND_IPI_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 15: + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 16: + recomm.ebx = 0xfff; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 17: + /* XMM fast hypercall */ + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = true; + break; + case 18: + feat.edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + hcall->ud_expected = false; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 19: + /* END */ + hcall->control = 0; + break; + } + + hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + + if (hcall->control) + pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, + hcall->control); + else + pr_debug("Stage %d: finish\n", stage); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + return; + case UCALL_DONE: + return; + } + + stage++; + } +} + +int main(void) +{ + struct kvm_cpuid2 *best; + struct kvm_vm *vm; + vm_vaddr_t msr_gva, hcall_page, hcall_params; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, + .args = {1} + }; + + /* Test MSRs */ + vm = vm_create_default(VCPU_ID, 0, guest_msr); + + msr_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); + vcpu_args_set(vm, VCPU_ID, 1, msr_gva); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + pr_info("Testing access to Hyper-V specific MSRs\n"); + guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva), + best); + kvm_vm_free(vm); + + /* Test hypercalls */ + vm = vm_create_default(VCPU_ID, 0, guest_hcall); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + hcall_params = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + + vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + pr_info("Testing access to Hyper-V hypercalls\n"); + guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params), + addr_gva2hva(vm, hcall_page), + addr_gva2hva(vm, hcall_page) + getpagesize(), + best); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 732b244d6956..04ed975662c9 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -227,7 +227,7 @@ int main(void) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); enter_guest(vm); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index e6480fd5c4bd..8039e1eff938 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -82,7 +82,8 @@ int get_warnings_count(void) FILE *f; f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); - fscanf(f, "%d", &warnings); + if (fscanf(f, "%d", &warnings) < 1) + warnings = 0; fclose(f); return warnings; diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c new file mode 100644 index 000000000000..da2325fcad87 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 1 + +#define MMIO_GPA 0x100000000ull + +static void guest_code(void) +{ + (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); + (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); + + GUEST_ASSERT(0); +} + +static void guest_pf_handler(struct ex_regs *regs) +{ + /* PFEC == RSVD | PRESENT (read, kernel). */ + GUEST_ASSERT(regs->error_code == 0x9); + GUEST_DONE(); +} + +static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) +{ + u32 good_cpuid_val = *cpuid_reg; + struct kvm_run *run; + struct kvm_vm *vm; + uint64_t cmd; + int r; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + run = vcpu_state(vm, VCPU_ID); + + /* Map 1gb page without a backing memlot. */ + __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, X86_PAGE_SIZE_1G); + + r = _vcpu_run(vm, VCPU_ID); + + /* Guest access to the 1gb page should trigger MMIO. */ + TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_MMIO, + "Unexpected exit reason: %u (%s), expected MMIO exit (1gb page w/o memslot)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->mmio.len == 8, "Unexpected exit mmio size = %u", run->mmio.len); + + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA, + "Unexpected exit mmio address = 0x%llx", run->mmio.phys_addr); + + /* + * Effect the CPUID change for the guest and re-enter the guest. Its + * access should now #PF due to the PAGE_SIZE bit being reserved or + * the resulting GPA being invalid. Note, kvm_get_supported_cpuid() + * returns the struct that contains the entry being modified. Eww. + */ + *cpuid_reg = evil_cpuid_val; + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* + * Add a dummy memslot to coerce KVM into bumping the MMIO generation. + * KVM does not "officially" support mucking with CPUID after KVM_RUN, + * and will incorrectly reuse MMIO SPTEs. Don't delete the memslot! + * KVM x86 zaps all shadow pages on memslot deletion. + */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MMIO_GPA << 1, 10, 1, 0); + + /* Set up a #PF handler to eat the RSVD #PF and signal all done! */ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); + + cmd = get_ucall(vm, VCPU_ID, NULL); + TEST_ASSERT(cmd == UCALL_DONE, + "Unexpected guest exit, exit_reason=%s, ucall.cmd = %lu\n", + exit_reason_str(run->exit_reason), cmd); + + /* + * Restore the happy CPUID value for the next test. Yes, changes are + * indeed persistent across VM destruction. + */ + *cpuid_reg = good_cpuid_val; + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *entry; + int opt; + + /* + * All tests are opt-in because TDP doesn't play nice with reserved #PF + * in the GVA->GPA translation. The hardware page walker doesn't let + * software change GBPAGES or MAXPHYADDR, and KVM doesn't manually walk + * the GVA on fault for performance reasons. + */ + bool do_gbpages = false; + bool do_maxphyaddr = false; + + setbuf(stdout, NULL); + + while ((opt = getopt(argc, argv, "gm")) != -1) { + switch (opt) { + case 'g': + do_gbpages = true; + break; + case 'm': + do_maxphyaddr = true; + break; + case 'h': + default: + printf("usage: %s [-g (GBPAGES)] [-m (MAXPHYADDR)]\n", argv[0]); + break; + } + } + + if (!do_gbpages && !do_maxphyaddr) { + print_skip("No sub-tests selected"); + return 0; + } + + entry = kvm_get_supported_cpuid_entry(0x80000001); + if (!(entry->edx & CPUID_GBPAGES)) { + print_skip("1gb hugepages not supported"); + return 0; + } + + if (do_gbpages) { + pr_info("Test MMIO after toggling CPUID.GBPAGES\n\n"); + mmu_role_test(&entry->edx, entry->edx & ~CPUID_GBPAGES); + } + + if (do_maxphyaddr) { + pr_info("Test MMIO after changing CPUID.MAXPHYADDR\n\n"); + entry = kvm_get_supported_cpuid_entry(0x80000008); + mmu_role_test(&entry->eax, (entry->eax & ~0xff) | 0x20); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 12c558fc8074..ae76436af0cc 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -14,16 +14,12 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "apic.h" #define N_VCPU 2 #define VCPU_ID0 0 #define VCPU_ID1 1 -static uint32_t get_bsp_flag(void) -{ - return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; -} - static void guest_bsp_vcpu(void *arg) { GUEST_SYNC(1); @@ -94,7 +90,7 @@ static struct kvm_vm *create_vm(void) pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); return vm; @@ -106,8 +102,6 @@ static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); else vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); - - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); } static void run_vm_bsp(uint32_t bsp_vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 613c42c5a9b8..d0fe2fdce58c 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -53,15 +53,28 @@ static inline void sync_with_host(uint64_t phase) : "+a" (phase)); } -void self_smi(void) +static void self_smi(void) { - wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4), - APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); } -void guest_code(void *arg) +static void l2_guest_code(void) { + sync_with_host(8); + + sync_with_host(10); + + vmcall(); +} + +static void guest_code(void *arg) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); + struct svm_test_data *svm = arg; + struct vmx_pages *vmx_pages = arg; sync_with_host(1); @@ -74,21 +87,50 @@ void guest_code(void *arg) sync_with_host(4); if (arg) { - if (cpu_has_svm()) - generic_svm_setup(arg, NULL, NULL); - else - GUEST_ASSERT(prepare_for_vmx_operation(arg)); + if (cpu_has_svm()) { + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } else { + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } sync_with_host(5); self_smi(); sync_with_host(7); + + if (cpu_has_svm()) { + run_guest(svm->vmcb, svm->vmcb_gpa); + svm->vmcb->save.rip += 3; + run_guest(svm->vmcb, svm->vmcb_gpa); + } else { + vmlaunch(); + vmresume(); + } + + /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */ + sync_with_host(12); } sync_with_host(DONE); } +void inject_smi(struct kvm_vm *vm) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vm, VCPU_ID, &events); + + events.smi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_SMM; + + vcpu_events_set(vm, VCPU_ID, &events); +} + int main(int argc, char *argv[]) { vm_vaddr_t nested_gva = 0; @@ -147,6 +189,22 @@ int main(int argc, char *argv[]) "Unexpected stage: #%x, got %x", stage, stage_reported); + /* + * Enter SMM during L2 execution and check that we correctly + * return from it. Do not perform save/restore while in SMM yet. + */ + if (stage == 8) { + inject_smi(vm); + continue; + } + + /* + * Perform save/restore while the guest is in SMM triggered + * during L2 execution. + */ + if (stage == 10) + inject_smi(vm); + state = vcpu_save_state(vm, VCPU_ID); kvm_vm_release(vm); kvm_vm_restart(vm, O_RDWR); diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c new file mode 100644 index 000000000000..df04f56ce859 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "apic.h" + +#define VCPU_ID 0 + +static struct kvm_vm *vm; + +bool vintr_irq_called; +bool intr_irq_called; + +#define VINTR_IRQ_NUMBER 0x20 +#define INTR_IRQ_NUMBER 0x30 + +static void vintr_irq_handler(struct ex_regs *regs) +{ + vintr_irq_called = true; +} + +static void intr_irq_handler(struct ex_regs *regs) +{ + x2apic_write_reg(APIC_EOI, 0x00); + intr_irq_called = true; +} + +static void l2_guest_code(struct svm_test_data *svm) +{ + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, + * and since L1 didn't enable virtual interrupt masking, + * L2 should receive it and not L1. + * + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ + * so it should also receive it after the following 'sti'. + */ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + GUEST_ASSERT(vintr_irq_called); + GUEST_ASSERT(intr_irq_called); + + __asm__ __volatile__( + "vmcall\n" + ); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + x2apic_enable(); + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* No virtual interrupt masking */ + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* No intercepts for real and virtual interrupts */ + vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR); + + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); + vmcb->control.int_vector = VINTR_IRQ_NUMBER; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t svm_gva; + + nested_svm_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index d672f0a473f8..fc03a150278d 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -24,6 +24,10 @@ #define UCALL_PIO_PORT ((uint16_t)0x1000) +struct ucall uc_none = { + .cmd = UCALL_NONE, +}; + /* * ucall is embedded here to protect against compiler reshuffling registers * before calling a function. In this test we only need to get KVM_EXIT_IO @@ -34,7 +38,8 @@ void guest_code(void) asm volatile("1: in %[port], %%al\n" "add $0x1, %%rbx\n" "jmp 1b" - : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx"); + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none) + : "rax", "rbx"); } static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index e357d8e222d4..5a6a662f2e59 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -18,15 +18,6 @@ #define rounded_rdmsr(x) ROUND(rdmsr(x)) #define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x)) -#define GUEST_ASSERT_EQ(a, b) do { \ - __typeof(a) _a = (a); \ - __typeof(b) _b = (b); \ - if (_a != _b) \ - ucall(UCALL_ABORT, 4, \ - "Failed guest assert: " \ - #a " == " #b, __LINE__, _a, _b); \ - } while(0) - static void guest_code(void) { u64 val = 0; diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index 72c0d0797522..e3e20e8848d0 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -574,7 +574,7 @@ static void test_msr_filter_allow(void) { vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); @@ -588,12 +588,12 @@ static void test_msr_filter_allow(void) { run_guest_then_process_wrmsr(vm, MSR_NON_EXISTENT); run_guest_then_process_rdmsr(vm, MSR_NON_EXISTENT); - vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); run_guest(vm); - vm_handle_exception(vm, UD_VECTOR, NULL); + vm_install_exception_handler(vm, UD_VECTOR, NULL); if (process_ucall(vm) != UCALL_DONE) { - vm_handle_exception(vm, GP_VECTOR, guest_fep_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); /* Process emulated rdmsr and wrmsr instructions. */ run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index d14888b34adb..d438c4d3228a 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) } vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - prepare_virtualize_apic_accesses(vmx, vm, 0); + prepare_virtualize_apic_accesses(vmx, vm); vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); while (!done) { diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 537de1068554..68f26a8b4f42 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) * Add an identity map for GVA range [0xc0000000, 0xc0002000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0); + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); /* * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to @@ -107,11 +107,11 @@ int main(int argc, char *argv[]) * meaning after the last call to virt_map. */ prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); - bmap = bitmap_alloc(TEST_MEM_PAGES); + bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c new file mode 100644 index 000000000000..280c01fd2412 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include <time.h> + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + + +#define VCPU_ID 0 + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. + */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. + */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void tsc_scaling_check_supported(void) +{ + if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { + print_skip("TSC scaling not supported by the HW"); + exit(KSFT_SKIP); + } +} + +static void stable_tsc_check_supported(void) +{ + FILE *fp; + char buf[4]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp == NULL) + goto skip_test; + + if (fgets(buf, sizeof(buf), fp) == NULL) + goto skip_test; + + if (strncmp(buf, "tsc", sizeof(buf))) + goto skip_test; + + return; +skip_test: + print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable"); + exit(KSFT_SKIP); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + nested_vmx_check_supported(); + tsc_scaling_check_supported(); + stable_tsc_check_supported(); + + /* + * We set L1's scale factor to be a random number from 2 to 10. + * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. + */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, + (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *) uc.args[0]); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 2f964cdc273c..afbbc40df884 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -42,8 +42,6 @@ #define HALTER_VCPU_ID 0 #define SENDER_VCPU_ID 1 -volatile uint32_t *apic_base = (volatile uint32_t *)APIC_DEFAULT_GPA; - /* * Vector for IPI from sender vCPU to halting vCPU. * Value is arbitrary and was chosen for the alternating bit pattern. Any @@ -86,45 +84,6 @@ struct thread_params { uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ }; -uint32_t read_apic_reg(uint reg) -{ - return apic_base[reg >> 2]; -} - -void write_apic_reg(uint reg, uint32_t val) -{ - apic_base[reg >> 2] = val; -} - -void disable_apic(void) -{ - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) & - ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); -} - -void enable_xapic(void) -{ - uint64_t val = rdmsr(MSR_IA32_APICBASE); - - /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ - if (val & MSR_IA32_APICBASE_EXTD) { - disable_apic(); - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); - } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { - wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); - } - - /* - * Per SDM: reset value of spurious interrupt vector register has the - * APIC software enabled bit=0. It must be enabled in addition to the - * enable bit in the MSR. - */ - val = read_apic_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; - write_apic_reg(APIC_SPIV, val); -} - void verify_apic_base_addr(void) { uint64_t msr = rdmsr(MSR_IA32_APICBASE); @@ -136,10 +95,10 @@ void verify_apic_base_addr(void) static void halter_guest_code(struct test_data_page *data) { verify_apic_base_addr(); - enable_xapic(); + xapic_enable(); - data->halter_apic_id = GET_APIC_ID_FIELD(read_apic_reg(APIC_ID)); - data->halter_lvr = read_apic_reg(APIC_LVR); + data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + data->halter_lvr = xapic_read_reg(APIC_LVR); /* * Loop forever HLTing and recording halts & wakes. Disable interrupts @@ -150,8 +109,8 @@ static void halter_guest_code(struct test_data_page *data) * TPR and PPR for diagnostic purposes in case the test fails. */ for (;;) { - data->halter_tpr = read_apic_reg(APIC_TASKPRI); - data->halter_ppr = read_apic_reg(APIC_PROCPRI); + data->halter_tpr = xapic_read_reg(APIC_TASKPRI); + data->halter_ppr = xapic_read_reg(APIC_PROCPRI); data->hlt_count++; asm volatile("sti; hlt; cli"); data->wake_count++; @@ -166,7 +125,7 @@ static void halter_guest_code(struct test_data_page *data) static void guest_ipi_handler(struct ex_regs *regs) { ipis_rcvd++; - write_apic_reg(APIC_EOI, 77); + xapic_write_reg(APIC_EOI, 77); } static void sender_guest_code(struct test_data_page *data) @@ -179,7 +138,7 @@ static void sender_guest_code(struct test_data_page *data) uint64_t tsc_start; verify_apic_base_addr(); - enable_xapic(); + xapic_enable(); /* * Init interrupt command register for sending IPIs @@ -206,8 +165,8 @@ static void sender_guest_code(struct test_data_page *data) * First IPI can be sent unconditionally because halter vCPU * starts earlier. */ - write_apic_reg(APIC_ICR2, icr2_val); - write_apic_reg(APIC_ICR, icr_val); + xapic_write_reg(APIC_ICR2, icr2_val); + xapic_write_reg(APIC_ICR, icr_val); data->ipis_sent++; /* @@ -462,13 +421,13 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID); - vm_handle_exception(vm, IPI_VECTOR, guest_ipi_handler); + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA, 0); + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code); - test_data_page_vaddr = vm_vaddr_alloc(vm, 0x1000, 0x1000, 0, 0); + test_data_page_vaddr = vm_vaddr_alloc_page(vm); data = (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr); memset(data, 0, sizeof(*data)); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 1f4a0599683c..eda0d2a51224 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -14,7 +14,6 @@ #include <stdint.h> #include <time.h> #include <sched.h> -#include <sys/syscall.h> #define VCPU_ID 5 @@ -98,20 +97,6 @@ static void guest_code(void) GUEST_DONE(); } -static long get_run_delay(void) -{ - char path[64]; - long val[2]; - FILE *fp; - - sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); - fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); - fclose(fp); - - return val[1]; -} - static int cmp_timespec(struct timespec *a, struct timespec *b) { if (a->tv_sec > b->tv_sec) @@ -146,7 +131,7 @@ int main(int argc, char *argv[]) /* Map a region for the shared_info page */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); - virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2); struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index 8389e0bfd711..adc94452b57c 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -103,7 +103,7 @@ int main(int argc, char *argv[]) /* Map a region for the hypercall pages */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); - virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2, 0); + virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 0af84ad48aa7..fe7ee2b0f29c 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -48,6 +48,7 @@ ARCH ?= $(SUBARCH) # When local build is done, headers are installed in the default # INSTALL_HDR_PATH usr/include. .PHONY: khdr +.NOTPARALLEL: khdr: ifndef KSFT_KHDR_INSTALL_DONE ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) @@ -100,6 +101,7 @@ define INSTALL_RULE $(eval INSTALL_LIST = $(TEST_CUSTOM_PROGS)) $(INSTALL_SINGLE_RULE) $(eval INSTALL_LIST = $(TEST_GEN_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE) $(eval INSTALL_LIST = $(TEST_GEN_FILES)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(wildcard config settings)) $(INSTALL_SINGLE_RULE) endef install: all diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index a105f094676e..ee71fc99d5b5 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,6 +4,6 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh strscpy.sh +TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh strscpy.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index b80ee3f6e265..645839b50b0a 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -1,4 +1,5 @@ CONFIG_TEST_PRINTF=m +CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m diff --git a/tools/testing/selftests/lib/scanf.sh b/tools/testing/selftests/lib/scanf.sh new file mode 100755 index 000000000000..b59b8ba561c3 --- /dev/null +++ b/tools/testing/selftests/lib/scanf.sh @@ -0,0 +1,4 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Tests the scanf infrastructure using test_scanf kernel module. +$(dirname $0)/../kselftest/module.sh "scanf" test_scanf diff --git a/tools/testing/selftests/lkdtm/config b/tools/testing/selftests/lkdtm/config index d874990e442b..38edea25631b 100644 --- a/tools/testing/selftests/lkdtm/config +++ b/tools/testing/selftests/lkdtm/config @@ -1 +1,10 @@ CONFIG_LKDTM=y +CONFIG_DEBUG_LIST=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_TRAP=y diff --git a/tools/testing/selftests/lkdtm/run.sh b/tools/testing/selftests/lkdtm/run.sh index bb7a1775307b..e95e79bd3126 100755 --- a/tools/testing/selftests/lkdtm/run.sh +++ b/tools/testing/selftests/lkdtm/run.sh @@ -76,10 +76,14 @@ fi # Save existing dmesg so we can detect new content below dmesg > "$DMESG" -# Most shells yell about signals and we're expecting the "cat" process -# to usually be killed by the kernel. So we have to run it in a sub-shell -# and silence errors. -($SHELL -c 'cat <(echo '"$test"') >'"$TRIGGER" 2>/dev/null) || true +# Since the kernel is likely killing the process writing to the trigger +# file, it must not be the script's shell itself. i.e. we cannot do: +# echo "$test" >"$TRIGGER" +# Instead, use "cat" to take the signal. Since the shell will yell about +# the signal that killed the subprocess, we must ignore the failure and +# continue. However we don't silence stderr since there might be other +# useful details reported there in the case of other unexpected conditions. +echo "$test" | cat >"$TRIGGER" || true # Record and dump the results dmesg | comm --nocheck-order -13 "$DMESG" - > "$LOG" || true diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh index b1b8a5097cbb..1b4d95d575f8 100755 --- a/tools/testing/selftests/lkdtm/stack-entropy.sh +++ b/tools/testing/selftests/lkdtm/stack-entropy.sh @@ -30,6 +30,7 @@ rm -f "$log" # We would expect any functional stack randomization to be at least 5 bits. if [ "$bits" -lt 5 ]; then + echo "Stack entropy is low! Booted without 'randomize_kstack_offset=y'?" exit 1 else exit 0 diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 11ef159be0fd..09f7bfa383cc 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -7,19 +7,23 @@ EXCEPTION #EXHAUST_STACK Corrupts memory on failure #CORRUPT_STACK Crashes entire system on success #CORRUPT_STACK_STRONG Crashes entire system on success +ARRAY_BOUNDS CORRUPT_LIST_ADD list_add corruption CORRUPT_LIST_DEL list_del corruption STACK_GUARD_PAGE_LEADING STACK_GUARD_PAGE_TRAILING -UNSET_SMEP CR4 bits went missing +UNSET_SMEP pinned CR4 bits changed: DOUBLE_FAULT CORRUPT_PAC UNALIGNED_LOAD_STORE_WRITE -#OVERWRITE_ALLOCATION Corrupts memory on failure +SLAB_LINEAR_OVERFLOW +VMALLOC_LINEAR_OVERFLOW #WRITE_AFTER_FREE Corrupts memory on failure -READ_AFTER_FREE +READ_AFTER_FREE call trace:|Memory correctly poisoned #WRITE_BUDDY_AFTER_FREE Corrupts memory on failure -READ_BUDDY_AFTER_FREE +READ_BUDDY_AFTER_FREE call trace:|Memory correctly poisoned +SLAB_INIT_ON_ALLOC Memory appears initialized +BUDDY_INIT_ON_ALLOC Memory appears initialized SLAB_FREE_DOUBLE SLAB_FREE_CROSS SLAB_FREE_PAGE @@ -69,4 +73,6 @@ USERCOPY_KERNEL STACKLEAK_ERASING OK: the rest of the thread stack is properly erased CFI_FORWARD_PROTO FORTIFIED_STRSCPY +FORTIFIED_OBJECT +FORTIFIED_SUBOBJECT PPC_SLB_MULTIHIT Recovered diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 74baab83fec3..192a2899bae8 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -56,7 +56,7 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) static int mfd_assert_reopen_fd(int fd_in) { - int r, fd; + int fd; char path[100]; sprintf(path, "/proc/self/fd/%d", fd_in); diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index b37585e6aa38..46a97f318f58 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -282,7 +282,9 @@ done # echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do - offline_memory_expect_fail $memory + if [ $((RANDOM % 100)) -lt $ratio ]; then + offline_memory_expect_fail $memory + fi done echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 4e94e566e040..f31205f04ee0 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -136,6 +136,10 @@ struct mount_attr { #define MOUNT_ATTR_IDMAP 0x00100000 #endif +#ifndef MOUNT_ATTR_NOSYMFOLLOW +#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 +#endif + static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, struct mount_attr *attr, size_t size) { @@ -235,6 +239,10 @@ static int prepare_unpriv_mountns(void) return 0; } +#ifndef ST_NOSYMFOLLOW +#define ST_NOSYMFOLLOW 0x2000 /* do not follow symlinks */ +#endif + static int read_mnt_flags(const char *path) { int ret; @@ -245,9 +253,9 @@ static int read_mnt_flags(const char *path) if (ret != 0) return -EINVAL; - if (stat.f_flag & - ~(ST_RDONLY | ST_NOSUID | ST_NODEV | ST_NOEXEC | ST_NOATIME | - ST_NODIRATIME | ST_RELATIME | ST_SYNCHRONOUS | ST_MANDLOCK)) + if (stat.f_flag & ~(ST_RDONLY | ST_NOSUID | ST_NODEV | ST_NOEXEC | + ST_NOATIME | ST_NODIRATIME | ST_RELATIME | + ST_SYNCHRONOUS | ST_MANDLOCK | ST_NOSYMFOLLOW)) return -EINVAL; mnt_flags = 0; @@ -269,6 +277,8 @@ static int read_mnt_flags(const char *path) mnt_flags |= MS_SYNCHRONOUS; if (stat.f_flag & ST_MANDLOCK) mnt_flags |= ST_MANDLOCK; + if (stat.f_flag & ST_NOSYMFOLLOW) + mnt_flags |= ST_NOSYMFOLLOW; return mnt_flags; } @@ -368,8 +378,13 @@ static bool mount_setattr_supported(void) FIXTURE(mount_setattr) { }; +#define NOSYMFOLLOW_TARGET "/mnt/A/AA/data" +#define NOSYMFOLLOW_SYMLINK "/mnt/A/AA/symlink" + FIXTURE_SETUP(mount_setattr) { + int fd = -EBADF; + if (!mount_setattr_supported()) SKIP(return, "mount_setattr syscall not supported"); @@ -412,6 +427,11 @@ FIXTURE_SETUP(mount_setattr) ASSERT_EQ(mount("testing", "/tmp/B/BB", "devpts", MS_RELATIME | MS_NOEXEC | MS_RDONLY, 0), 0); + + fd = creat(NOSYMFOLLOW_TARGET, O_RDWR | O_CLOEXEC); + ASSERT_GT(fd, 0); + ASSERT_EQ(symlink(NOSYMFOLLOW_TARGET, NOSYMFOLLOW_SYMLINK), 0); + ASSERT_EQ(close(fd), 0); } FIXTURE_TEARDOWN(mount_setattr) @@ -1421,4 +1441,66 @@ TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid) ASSERT_EQ(expected_uid_gid(open_tree_fd, "B/BB/b", 0, 0, 0), 0); } +TEST_F(mount_setattr, mount_attr_nosymfollow) +{ + int fd; + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_NOSYMFOLLOW, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + fd = open(NOSYMFOLLOW_SYMLINK, O_RDWR | O_CLOEXEC); + ASSERT_GT(fd, 0); + ASSERT_EQ(close(fd), 0); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags = old_flags; + expected_flags |= ST_NOSYMFOLLOW; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + fd = open(NOSYMFOLLOW_SYMLINK, O_RDWR | O_CLOEXEC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ELOOP); + + attr.attr_set &= ~MOUNT_ATTR_NOSYMFOLLOW; + attr.attr_clr |= MOUNT_ATTR_NOSYMFOLLOW; + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags &= ~ST_NOSYMFOLLOW; + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + fd = open(NOSYMFOLLOW_SYMLINK, O_RDWR | O_CLOEXEC); + ASSERT_GT(fd, 0); + ASSERT_EQ(close(fd), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/move_mount_set_group/.gitignore b/tools/testing/selftests/move_mount_set_group/.gitignore new file mode 100644 index 000000000000..f5e339268720 --- /dev/null +++ b/tools/testing/selftests/move_mount_set_group/.gitignore @@ -0,0 +1 @@ +move_mount_set_group_test diff --git a/tools/testing/selftests/move_mount_set_group/Makefile b/tools/testing/selftests/move_mount_set_group/Makefile new file mode 100644 index 000000000000..80c2d86812b0 --- /dev/null +++ b/tools/testing/selftests/move_mount_set_group/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for mount selftests. +CFLAGS = -g -I../../../../usr/include/ -Wall -O2 + +TEST_GEN_FILES += move_mount_set_group_test + +include ../lib.mk diff --git a/tools/testing/selftests/move_mount_set_group/config b/tools/testing/selftests/move_mount_set_group/config new file mode 100644 index 000000000000..416bd53ce982 --- /dev/null +++ b/tools/testing/selftests/move_mount_set_group/config @@ -0,0 +1 @@ +CONFIG_USER_NS=y diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c new file mode 100644 index 000000000000..860198f83a53 --- /dev/null +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/wait.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdarg.h> +#include <sys/syscall.h> + +#include "../kselftest_harness.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif + +#ifndef MS_SHARED +#define MS_SHARED (1 << 20) +#endif + +#ifndef MS_PRIVATE +#define MS_PRIVATE (1<<18) +#endif + +#ifndef MOVE_MOUNT_SET_GROUP +#define MOVE_MOUNT_SET_GROUP 0x00000100 +#endif + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 +#endif + +#ifndef MOVE_MOUNT_T_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 +#endif + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static int write_file(const char *path, const void *buf, size_t count) +{ + int fd; + ssize_t ret; + + fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW); + if (fd < 0) + return -1; + + ret = write_nointr(fd, buf, count); + close(fd); + if (ret < 0 || (size_t)ret != count) + return -1; + + return 0; +} + +static int create_and_enter_userns(void) +{ + uid_t uid; + gid_t gid; + char map[100]; + + uid = getuid(); + gid = getgid(); + + if (unshare(CLONE_NEWUSER)) + return -1; + + if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) && + errno != ENOENT) + return -1; + + snprintf(map, sizeof(map), "0 %d 1", uid); + if (write_file("/proc/self/uid_map", map, strlen(map))) + return -1; + + + snprintf(map, sizeof(map), "0 %d 1", gid); + if (write_file("/proc/self/gid_map", map, strlen(map))) + return -1; + + if (setgid(0)) + return -1; + + if (setuid(0)) + return -1; + + return 0; +} + +static int prepare_unpriv_mountns(void) +{ + if (create_and_enter_userns()) + return -1; + + if (unshare(CLONE_NEWNS)) + return -1; + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + return -1; + + return 0; +} + +static char *get_field(char *src, int nfields) +{ + int i; + char *p = src; + + for (i = 0; i < nfields; i++) { + while (*p && *p != ' ' && *p != '\t') + p++; + + if (!*p) + break; + + p++; + } + + return p; +} + +static void null_endofword(char *word) +{ + while (*word && *word != ' ' && *word != '\t') + word++; + *word = '\0'; +} + +static bool is_shared_mount(const char *path) +{ + size_t len = 0; + char *line = NULL; + FILE *f = NULL; + + f = fopen("/proc/self/mountinfo", "re"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + char *opts, *target; + + target = get_field(line, 4); + if (!target) + continue; + + opts = get_field(target, 2); + if (!opts) + continue; + + null_endofword(target); + + if (strcmp(target, path) != 0) + continue; + + null_endofword(opts); + if (strstr(opts, "shared:")) + return true; + } + + free(line); + fclose(f); + + return false; +} + +/* Attempt to de-conflict with the selftests tree. */ +#ifndef SKIP +#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) +#endif + +#define SET_GROUP_FROM "/tmp/move_mount_set_group_supported_from" +#define SET_GROUP_TO "/tmp/move_mount_set_group_supported_to" + +static int move_mount_set_group_supported(void) +{ + int ret; + + if (mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700")) + return -1; + + if (mount(NULL, "/tmp", NULL, MS_PRIVATE, 0)) + return -1; + + if (mkdir(SET_GROUP_FROM, 0777)) + return -1; + + if (mkdir(SET_GROUP_TO, 0777)) + return -1; + + if (mount("testing", SET_GROUP_FROM, "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700")) + return -1; + + if (mount(SET_GROUP_FROM, SET_GROUP_TO, NULL, MS_BIND, NULL)) + return -1; + + if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0)) + return -1; + + ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM, + AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP); + umount2("/tmp", MNT_DETACH); + + return ret < 0 ? false : true; +} + +FIXTURE(move_mount_set_group) { +}; + +#define SET_GROUP_A "/tmp/A" + +FIXTURE_SETUP(move_mount_set_group) +{ + int ret; + + ASSERT_EQ(prepare_unpriv_mountns(), 0); + + ret = move_mount_set_group_supported(); + ASSERT_GE(ret, 0); + if (!ret) + SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported"); + + umount2("/tmp", MNT_DETACH); + + ASSERT_EQ(mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir(SET_GROUP_A, 0777), 0); + + ASSERT_EQ(mount("testing", SET_GROUP_A, "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); +} + +FIXTURE_TEARDOWN(move_mount_set_group) +{ + int ret; + + ret = move_mount_set_group_supported(); + ASSERT_GE(ret, 0); + if (!ret) + SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported"); + + umount2("/tmp", MNT_DETACH); +} + +#define __STACK_SIZE (8 * 1024 * 1024) +static pid_t do_clone(int (*fn)(void *), void *arg, int flags) +{ + void *stack; + + stack = malloc(__STACK_SIZE); + if (!stack) + return -ENOMEM; + +#ifdef __ia64__ + return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#else + return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#endif +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +struct child_args { + int unsfd; + int mntnsfd; + bool shared; + int mntfd; +}; + +static int get_nestedns_mount_cb(void *data) +{ + struct child_args *ca = (struct child_args *)data; + int ret; + + ret = prepare_unpriv_mountns(); + if (ret) + return 1; + + if (ca->shared) { + ret = mount(NULL, SET_GROUP_A, NULL, MS_SHARED, 0); + if (ret) + return 1; + } + + ret = open("/proc/self/ns/user", O_RDONLY); + if (ret < 0) + return 1; + ca->unsfd = ret; + + ret = open("/proc/self/ns/mnt", O_RDONLY); + if (ret < 0) + return 1; + ca->mntnsfd = ret; + + ret = open(SET_GROUP_A, O_RDONLY); + if (ret < 0) + return 1; + ca->mntfd = ret; + + return 0; +} + +TEST_F(move_mount_set_group, complex_sharing_copying) +{ + struct child_args ca_from = { + .shared = true, + }; + struct child_args ca_to = { + .shared = false, + }; + pid_t pid; + int ret; + + ret = move_mount_set_group_supported(); + ASSERT_GE(ret, 0); + if (!ret) + SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported"); + + pid = do_clone(get_nestedns_mount_cb, (void *)&ca_from, CLONE_VFORK | + CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); + ASSERT_EQ(wait_for_pid(pid), 0); + + pid = do_clone(get_nestedns_mount_cb, (void *)&ca_to, CLONE_VFORK | + CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); + ASSERT_EQ(wait_for_pid(pid), 0); + + ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "", + ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP + | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), + 0); + + ASSERT_EQ(setns(ca_to.mntnsfd, CLONE_NEWNS), 0); + ASSERT_EQ(is_shared_mount(SET_GROUP_A), 1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/nci/nci_dev.c b/tools/testing/selftests/nci/nci_dev.c index 57b505cb1561..162c41e9bcae 100644 --- a/tools/testing/selftests/nci/nci_dev.c +++ b/tools/testing/selftests/nci/nci_dev.c @@ -57,6 +57,29 @@ const __u8 nci_init_rsp_v2[] = {0x40, 0x01, 0x1c, 0x00, 0x1a, 0x7e, 0x06, const __u8 nci_rf_disc_map_rsp[] = {0x41, 0x00, 0x01, 0x00}; const __u8 nci_rf_disc_rsp[] = {0x41, 0x03, 0x01, 0x00}; const __u8 nci_rf_deact_rsp[] = {0x41, 0x06, 0x01, 0x00}; +const __u8 nci_rf_deact_ntf[] = {0x61, 0x06, 0x02, 0x00, 0x00}; +const __u8 nci_rf_activate_ntf[] = {0x61, 0x05, 0x1D, 0x01, 0x02, 0x04, 0x00, + 0xFF, 0xFF, 0x0C, 0x44, 0x03, 0x07, 0x04, + 0x62, 0x26, 0x11, 0x80, 0x1D, 0x80, 0x01, + 0x20, 0x00, 0x00, 0x00, 0x06, 0x05, 0x75, + 0x77, 0x81, 0x02, 0x80}; +const __u8 nci_t4t_select_cmd[] = {0x00, 0x00, 0x0C, 0x00, 0xA4, 0x04, 0x00, + 0x07, 0xD2, 0x76, 0x00, 0x00, 0x85, 0x01, 0x01}; +const __u8 nci_t4t_select_cmd2[] = {0x00, 0x00, 0x07, 0x00, 0xA4, 0x00, 0x0C, 0x02, + 0xE1, 0x03}; +const __u8 nci_t4t_select_cmd3[] = {0x00, 0x00, 0x07, 0x00, 0xA4, 0x00, 0x0C, 0x02, + 0xE1, 0x04}; +const __u8 nci_t4t_read_cmd[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x00, 0x0F}; +const __u8 nci_t4t_read_rsp[] = {0x00, 0x00, 0x11, 0x00, 0x0F, 0x20, 0x00, 0x3B, + 0x00, 0x34, 0x04, 0x06, 0xE1, 0x04, 0x08, 0x00, + 0x00, 0x00, 0x90, 0x00}; +const __u8 nci_t4t_read_cmd2[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x00, 0x02}; +const __u8 nci_t4t_read_rsp2[] = {0x00, 0x00, 0x04, 0x00, 0x0F, 0x90, 0x00}; +const __u8 nci_t4t_read_cmd3[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x02, 0x0F}; +const __u8 nci_t4t_read_rsp3[] = {0x00, 0x00, 0x11, 0xD1, 0x01, 0x0B, 0x54, 0x02, + 0x65, 0x6E, 0x4E, 0x46, 0x43, 0x20, 0x54, 0x45, + 0x53, 0x54, 0x90, 0x00}; +const __u8 nci_t4t_rsp_ok[] = {0x00, 0x00, 0x02, 0x90, 0x00}; struct msgtemplate { struct nlmsghdr n; @@ -87,7 +110,7 @@ error: static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, __u8 genl_cmd, int nla_num, __u16 nla_type[], - void *nla_data[], int nla_len[]) + void *nla_data[], int nla_len[], __u16 flags) { struct sockaddr_nl nladdr; struct msgtemplate msg; @@ -98,7 +121,7 @@ static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); msg.n.nlmsg_type = nlmsg_type; - msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_flags = flags; msg.n.nlmsg_seq = 0; msg.n.nlmsg_pid = nlmsg_pid; msg.g.cmd = genl_cmd; @@ -110,11 +133,11 @@ static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, na->nla_type = nla_type[cnt]; na->nla_len = nla_len[cnt] + NLA_HDRLEN; - if (nla_len > 0) + if (nla_len[cnt] > 0) memcpy(NLA_DATA(na), nla_data[cnt], nla_len[cnt]); - msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); - prv_len = na->nla_len; + prv_len = NLA_ALIGN(nla_len[cnt]) + NLA_HDRLEN; + msg.n.nlmsg_len += prv_len; } buf = (char *)&msg; @@ -146,11 +169,11 @@ static int send_get_nfc_family(int sd, __u32 pid) nla_get_family_data = family_name; return send_cmd_mt_nla(sd, GENL_ID_CTRL, pid, CTRL_CMD_GETFAMILY, - 1, &nla_get_family_type, - &nla_get_family_data, &nla_get_family_len); + 1, &nla_get_family_type, &nla_get_family_data, + &nla_get_family_len, NLM_F_REQUEST); } -static int get_family_id(int sd, __u32 pid) +static int get_family_id(int sd, __u32 pid, __u32 *event_group) { struct { struct nlmsghdr n; @@ -158,8 +181,9 @@ static int get_family_id(int sd, __u32 pid) char buf[512]; } ans; struct nlattr *na; - int rep_len; + int resp_len; __u16 id; + int len; int rc; rc = send_get_nfc_family(sd, pid); @@ -167,17 +191,49 @@ static int get_family_id(int sd, __u32 pid) if (rc < 0) return 0; - rep_len = recv(sd, &ans, sizeof(ans), 0); + resp_len = recv(sd, &ans, sizeof(ans), 0); - if (ans.n.nlmsg_type == NLMSG_ERROR || rep_len < 0 || - !NLMSG_OK(&ans.n, rep_len)) + if (ans.n.nlmsg_type == NLMSG_ERROR || resp_len < 0 || + !NLMSG_OK(&ans.n, resp_len)) return 0; + len = 0; + resp_len = GENLMSG_PAYLOAD(&ans.n); na = (struct nlattr *)GENLMSG_DATA(&ans); - na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len)); - if (na->nla_type == CTRL_ATTR_FAMILY_ID) - id = *(__u16 *)NLA_DATA(na); + while (len < resp_len) { + len += NLA_ALIGN(na->nla_len); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) { + id = *(__u16 *)NLA_DATA(na); + } else if (na->nla_type == CTRL_ATTR_MCAST_GROUPS) { + struct nlattr *nested_na; + struct nlattr *group_na; + int group_attr_len; + int group_attr; + + nested_na = (struct nlattr *)((char *)na + NLA_HDRLEN); + group_na = (struct nlattr *)((char *)nested_na + NLA_HDRLEN); + group_attr_len = 0; + + for (group_attr = CTRL_ATTR_MCAST_GRP_UNSPEC; + group_attr < CTRL_ATTR_MCAST_GRP_MAX; group_attr++) { + if (group_na->nla_type == CTRL_ATTR_MCAST_GRP_ID) { + *event_group = *(__u32 *)((char *)group_na + + NLA_HDRLEN); + break; + } + + group_attr_len += NLA_ALIGN(group_na->nla_len) + + NLA_HDRLEN; + if (group_attr_len >= nested_na->nla_len) + break; + + group_na = (struct nlattr *)((char *)group_na + + NLA_ALIGN(group_na->nla_len)); + } + } + na = (struct nlattr *)(GENLMSG_DATA(&ans) + len); + } return id; } @@ -189,12 +245,12 @@ static int send_cmd_with_idx(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, int nla_len = 4; return send_cmd_mt_nla(sd, nlmsg_type, nlmsg_pid, genl_cmd, 1, - &nla_type, &nla_data, &nla_len); + &nla_type, &nla_data, &nla_len, NLM_F_REQUEST); } static int get_nci_devid(int sd, __u16 fid, __u32 pid, int dev_id, struct msgtemplate *msg) { - int rc, rep_len; + int rc, resp_len; rc = send_cmd_with_idx(sd, fid, pid, NFC_CMD_GET_DEVICE, dev_id); if (rc < 0) { @@ -202,14 +258,14 @@ static int get_nci_devid(int sd, __u16 fid, __u32 pid, int dev_id, struct msgtem goto error; } - rep_len = recv(sd, msg, sizeof(*msg), 0); - if (rep_len < 0) { + resp_len = recv(sd, msg, sizeof(*msg), 0); + if (resp_len < 0) { rc = -2; goto error; } if (msg->n.nlmsg_type == NLMSG_ERROR || - !NLMSG_OK(&msg->n, rep_len)) { + !NLMSG_OK(&msg->n, resp_len)) { rc = -3; goto error; } @@ -222,21 +278,21 @@ error: static __u8 get_dev_enable_state(struct msgtemplate *msg) { struct nlattr *na; - int rep_len; + int resp_len; int len; - rep_len = GENLMSG_PAYLOAD(&msg->n); + resp_len = GENLMSG_PAYLOAD(&msg->n); na = (struct nlattr *)GENLMSG_DATA(msg); len = 0; - while (len < rep_len) { + while (len < resp_len) { len += NLA_ALIGN(na->nla_len); if (na->nla_type == NFC_ATTR_DEVICE_POWERED) return *(char *)NLA_DATA(na); na = (struct nlattr *)(GENLMSG_DATA(msg) + len); } - return rep_len; + return resp_len; } FIXTURE(NCI) { @@ -270,8 +326,7 @@ static void *virtual_dev_open(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -280,8 +335,7 @@ static void *virtual_dev_open(void *data) goto error; write(dev_fd, nci_reset_rsp, sizeof(nci_reset_rsp)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_init_cmd)) @@ -290,8 +344,7 @@ static void *virtual_dev_open(void *data) goto error; write(dev_fd, nci_init_rsp, sizeof(nci_init_rsp)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_disc_map_cmd)) @@ -313,8 +366,7 @@ static void *virtual_dev_open_v2(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -324,8 +376,7 @@ static void *virtual_dev_open_v2(void *data) write(dev_fd, nci_reset_rsp_v2, sizeof(nci_reset_rsp_v2)); write(dev_fd, nci_reset_ntf, sizeof(nci_reset_ntf)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_init_cmd_v2)) @@ -334,8 +385,7 @@ static void *virtual_dev_open_v2(void *data) goto error; write(dev_fd, nci_init_rsp_v2, sizeof(nci_init_rsp_v2)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_disc_map_cmd)) @@ -353,6 +403,7 @@ FIXTURE_SETUP(NCI) { struct msgtemplate msg; pthread_t thread_t; + __u32 event_group; int status; int rc; @@ -364,12 +415,16 @@ FIXTURE_SETUP(NCI) ASSERT_NE(self->sd, -1); self->pid = getpid(); - self->fid = get_family_id(self->sd, self->pid); + self->fid = get_family_id(self->sd, self->pid, &event_group); ASSERT_NE(self->fid, -1); self->virtual_nci_fd = open("/dev/virtual_nci", O_RDWR); ASSERT_GT(self->virtual_nci_fd, -1); + rc = setsockopt(self->sd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &event_group, + sizeof(event_group)); + ASSERT_NE(rc, -1); + rc = ioctl(self->virtual_nci_fd, IOCTL_GET_NCIDEV_IDX, &self->dev_idex); ASSERT_EQ(rc, 0); @@ -402,8 +457,7 @@ static void *virtual_deinit(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -425,8 +479,7 @@ static void *virtual_deinit_v2(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -489,16 +542,14 @@ static void *virtual_poll_start(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_discovery_cmd)) goto error; if (memcmp(nci_rf_discovery_cmd, buf, len)) goto error; - write(dev_fd, nci_rf_disc_rsp, sizeof(nci_rf_disc_rsp)) - ; + write(dev_fd, nci_rf_disc_rsp, sizeof(nci_rf_disc_rsp)); return (void *)0; error: @@ -513,8 +564,7 @@ static void *virtual_poll_stop(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_deact_cmd)) @@ -528,38 +578,282 @@ error: return (void *)-1; } -TEST_F(NCI, start_poll) +int start_polling(int dev_idx, int proto, int virtual_fd, int sd, int fid, int pid) { __u16 nla_start_poll_type[2] = {NFC_ATTR_DEVICE_INDEX, NFC_ATTR_PROTOCOLS}; - void *nla_start_poll_data[2] = {&self->dev_idex, &self->proto}; + void *nla_start_poll_data[2] = {&dev_idx, &proto}; int nla_start_poll_len[2] = {4, 4}; pthread_t thread_t; int status; int rc; rc = pthread_create(&thread_t, NULL, virtual_poll_start, - (void *)&self->virtual_nci_fd); - ASSERT_GT(rc, -1); + (void *)&virtual_fd); + if (rc < 0) + return rc; - rc = send_cmd_mt_nla(self->sd, self->fid, self->pid, - NFC_CMD_START_POLL, 2, nla_start_poll_type, - nla_start_poll_data, nla_start_poll_len); - EXPECT_EQ(rc, 0); + rc = send_cmd_mt_nla(sd, fid, pid, NFC_CMD_START_POLL, 2, nla_start_poll_type, + nla_start_poll_data, nla_start_poll_len, NLM_F_REQUEST); + if (rc != 0) + return rc; pthread_join(thread_t, (void **)&status); - ASSERT_EQ(status, 0); + return status; +} + +int stop_polling(int dev_idx, int virtual_fd, int sd, int fid, int pid) +{ + pthread_t thread_t; + int status; + int rc; rc = pthread_create(&thread_t, NULL, virtual_poll_stop, - (void *)&self->virtual_nci_fd); - ASSERT_GT(rc, -1); + (void *)&virtual_fd); + if (rc < 0) + return rc; - rc = send_cmd_with_idx(self->sd, self->fid, self->pid, - NFC_CMD_STOP_POLL, self->dev_idex); - EXPECT_EQ(rc, 0); + rc = send_cmd_with_idx(sd, fid, pid, + NFC_CMD_STOP_POLL, dev_idx); + if (rc != 0) + return rc; + + pthread_join(thread_t, (void **)&status); + return status; +} + +TEST_F(NCI, start_poll) +{ + int status; + + status = start_polling(self->dev_idex, self->proto, self->virtual_nci_fd, + self->sd, self->fid, self->pid); + EXPECT_EQ(status, 0); + + status = stop_polling(self->dev_idex, self->virtual_nci_fd, self->sd, + self->fid, self->pid); + EXPECT_EQ(status, 0); +} + +int get_taginfo(int dev_idx, int sd, int fid, int pid) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[512]; + } ans; + + struct nlattr *na; + __u32 protocol; + int targetidx; + __u8 sel_res; + int resp_len; + int len; + + __u16 tagid_type; + void *tagid_type_data; + int tagid_len; + + tagid_type = NFC_ATTR_DEVICE_INDEX; + tagid_type_data = &dev_idx; + tagid_len = 4; + + send_cmd_mt_nla(sd, fid, pid, NFC_CMD_GET_TARGET, 1, &tagid_type, + &tagid_type_data, &tagid_len, NLM_F_REQUEST | NLM_F_DUMP); + resp_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || resp_len < 0 || + !NLMSG_OK(&ans.n, resp_len)) + return -1; + + resp_len = GENLMSG_PAYLOAD(&ans.n); + na = (struct nlattr *)GENLMSG_DATA(&ans); + + len = 0; + targetidx = -1; + protocol = -1; + sel_res = -1; + + while (len < resp_len) { + len += NLA_ALIGN(na->nla_len); + + if (na->nla_type == NFC_ATTR_TARGET_INDEX) + targetidx = *(int *)((char *)na + NLA_HDRLEN); + else if (na->nla_type == NFC_ATTR_TARGET_SEL_RES) + sel_res = *(__u8 *)((char *)na + NLA_HDRLEN); + else if (na->nla_type == NFC_ATTR_PROTOCOLS) + protocol = *(__u32 *)((char *)na + NLA_HDRLEN); + + na = (struct nlattr *)(GENLMSG_DATA(&ans) + len); + } + + if (targetidx == -1 || sel_res != 0x20 || protocol != NFC_PROTO_ISO14443_MASK) + return -1; + + return targetidx; +} + +int connect_socket(int dev_idx, int target_idx) +{ + struct sockaddr_nfc addr; + int sock; + int err = 0; + + sock = socket(AF_NFC, SOCK_SEQPACKET, NFC_SOCKPROTO_RAW); + if (sock == -1) + return -1; + + addr.sa_family = AF_NFC; + addr.dev_idx = dev_idx; + addr.target_idx = target_idx; + addr.nfc_protocol = NFC_PROTO_ISO14443; + err = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (err) { + close(sock); + return -1; + } + + return sock; +} + +int connect_tag(int dev_idx, int virtual_fd, int sd, int fid, int pid) +{ + struct genlmsghdr *genlhdr; + struct nlattr *na; + char evt_data[255]; + int target_idx; + int resp_len; + int evt_dev; + + write(virtual_fd, nci_rf_activate_ntf, sizeof(nci_rf_activate_ntf)); + resp_len = recv(sd, evt_data, sizeof(evt_data), 0); + if (resp_len < 0) + return -1; + + genlhdr = (struct genlmsghdr *)((struct nlmsghdr *)evt_data + 1); + na = (struct nlattr *)(genlhdr + 1); + evt_dev = *(int *)((char *)na + NLA_HDRLEN); + if (dev_idx != evt_dev) + return -1; + + target_idx = get_taginfo(dev_idx, sd, fid, pid); + if (target_idx == -1) + return -1; + return connect_socket(dev_idx, target_idx); +} + +int read_write_nci_cmd(int nfc_sock, int virtual_fd, const __u8 *cmd, __u32 cmd_len, + const __u8 *rsp, __u32 rsp_len) +{ + char buf[256]; + int len; + + send(nfc_sock, &cmd[3], cmd_len - 3, 0); + len = read(virtual_fd, buf, cmd_len); + if (len < 0 || memcmp(buf, cmd, cmd_len)) + return -1; + + write(virtual_fd, rsp, rsp_len); + len = recv(nfc_sock, buf, rsp_len - 2, 0); + if (len < 0 || memcmp(&buf[1], &rsp[3], rsp_len - 3)) + return -1; + + return 0; +} + +int read_tag(int nfc_sock, int virtual_fd) +{ + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd, + sizeof(nci_t4t_select_cmd), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd2, + sizeof(nci_t4t_select_cmd2), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd, + sizeof(nci_t4t_read_cmd), nci_t4t_read_rsp, + sizeof(nci_t4t_read_rsp))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd3, + sizeof(nci_t4t_select_cmd3), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd2, + sizeof(nci_t4t_read_cmd2), nci_t4t_read_rsp2, + sizeof(nci_t4t_read_rsp2))) + return -1; + + return read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd3, + sizeof(nci_t4t_read_cmd3), nci_t4t_read_rsp3, + sizeof(nci_t4t_read_rsp3)); +} + +static void *virtual_deactivate_proc(void *data) +{ + int virtual_fd; + char buf[256]; + int deactcmd_len; + int len; + + virtual_fd = *(int *)data; + deactcmd_len = sizeof(nci_rf_deact_cmd); + len = read(virtual_fd, buf, deactcmd_len); + if (len != deactcmd_len || memcmp(buf, nci_rf_deact_cmd, deactcmd_len)) + return (void *)-1; + + write(virtual_fd, nci_rf_deact_rsp, sizeof(nci_rf_deact_rsp)); + write(virtual_fd, nci_rf_deact_ntf, sizeof(nci_rf_deact_ntf)); + + return (void *)0; +} + +int disconnect_tag(int nfc_sock, int virtual_fd) +{ + pthread_t thread_t; + char buf[256]; + int status; + int len; + + send(nfc_sock, &nci_t4t_select_cmd3[3], sizeof(nci_t4t_select_cmd3) - 3, 0); + len = read(virtual_fd, buf, sizeof(nci_t4t_select_cmd3)); + if (len < 0 || memcmp(buf, nci_t4t_select_cmd3, sizeof(nci_t4t_select_cmd3))) + return -1; + + len = recv(nfc_sock, buf, sizeof(nci_t4t_rsp_ok), 0); + if (len != -1) + return -1; + + status = pthread_create(&thread_t, NULL, virtual_deactivate_proc, + (void *)&virtual_fd); + + close(nfc_sock); pthread_join(thread_t, (void **)&status); + return status; +} + +TEST_F(NCI, t4t_tag_read) +{ + int nfc_sock; + int status; + + status = start_polling(self->dev_idex, self->proto, self->virtual_nci_fd, + self->sd, self->fid, self->pid); + EXPECT_EQ(status, 0); + + nfc_sock = connect_tag(self->dev_idex, self->virtual_nci_fd, self->sd, + self->fid, self->pid); + ASSERT_GT(nfc_sock, -1); + + status = read_tag(nfc_sock, self->virtual_nci_fd); ASSERT_EQ(status, 0); + + status = disconnect_tag(nfc_sock, self->virtual_nci_fd); + EXPECT_EQ(status, 0); } TEST_F(NCI, deinit) diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 61ae899cfc17..19deb9cdf72f 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -30,3 +30,4 @@ hwtstamp_config rxtimestamp timestamping txtimestamp +so_netns_cookie diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3915bb7bfc39..492b273743b4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -25,19 +25,25 @@ TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh TEST_PROGS += veth.sh +TEST_PROGS += ioam6.sh +TEST_PROGS += gro.sh +TEST_PROGS += gre_gso.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag -TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr +TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr so_netns_cookie TEST_GEN_FILES += tcp_fastopen_backup_key TEST_GEN_FILES += fin_ack_lat TEST_GEN_FILES += reuseaddr_ports_exhausted TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp TEST_GEN_FILES += ipsec +TEST_GEN_FILES += ioam6_parser +TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +TEST_GEN_FILES += toeplitz TEST_FILES := settings diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile new file mode 100644 index 000000000000..df341648f818 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -0,0 +1,2 @@ +TEST_GEN_PROGS := test_unix_oob +include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c new file mode 100644 index 000000000000..3dece8b29253 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/test_unix_oob.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <unistd.h> +#include <string.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <netinet/tcp.h> +#include <sys/un.h> +#include <sys/signal.h> +#include <sys/poll.h> + +static int pipefd[2]; +static int signal_recvd; +static pid_t producer_id; +static char sock_name[32]; + +static void sig_hand(int sn, siginfo_t *si, void *p) +{ + signal_recvd = sn; +} + +static int set_sig_handler(int signal) +{ + struct sigaction sa; + + sa.sa_sigaction = sig_hand; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_RESTART; + + return sigaction(signal, &sa, NULL); +} + +static void set_filemode(int fd, int set) +{ + int flags = fcntl(fd, F_GETFL, 0); + + if (set) + flags &= ~O_NONBLOCK; + else + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); +} + +static void signal_producer(int fd) +{ + char cmd; + + cmd = 'S'; + write(fd, &cmd, sizeof(cmd)); +} + +static void wait_for_signal(int fd) +{ + char buf[5]; + + read(fd, buf, 5); +} + +static void die(int status) +{ + fflush(NULL); + unlink(sock_name); + kill(producer_id, SIGTERM); + exit(status); +} + +int is_sioctatmark(int fd) +{ + int ans = -1; + + if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) { +#ifdef DEBUG + perror("SIOCATMARK Failed"); +#endif + } + return ans; +} + +void read_oob(int fd, char *c) +{ + + *c = ' '; + if (recv(fd, c, sizeof(*c), MSG_OOB) < 0) { +#ifdef DEBUG + perror("Reading MSG_OOB Failed"); +#endif + } +} + +int read_data(int pfd, char *buf, int size) +{ + int len = 0; + + memset(buf, size, '0'); + len = read(pfd, buf, size); +#ifdef DEBUG + if (len < 0) + perror("read failed"); +#endif + return len; +} + +static void wait_for_data(int pfd, int event) +{ + struct pollfd pfds[1]; + + pfds[0].fd = pfd; + pfds[0].events = event; + poll(pfds, 1, -1); +} + +void producer(struct sockaddr_un *consumer_addr) +{ + int cfd; + char buf[64]; + int i; + + memset(buf, 'x', sizeof(buf)); + cfd = socket(AF_UNIX, SOCK_STREAM, 0); + + wait_for_signal(pipefd[0]); + if (connect(cfd, (struct sockaddr *)consumer_addr, + sizeof(struct sockaddr)) != 0) { + perror("Connect failed"); + kill(0, SIGTERM); + exit(1); + } + + for (i = 0; i < 2; i++) { + /* Test 1: Test for SIGURG and OOB */ + wait_for_signal(pipefd[0]); + memset(buf, 'x', sizeof(buf)); + buf[63] = '@'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + wait_for_signal(pipefd[0]); + + /* Test 2: Test for OOB being overwitten */ + memset(buf, 'x', sizeof(buf)); + buf[63] = '%'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + buf[63] = '#'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + wait_for_signal(pipefd[0]); + + /* Test 3: Test for SIOCATMARK */ + memset(buf, 'x', sizeof(buf)); + buf[63] = '@'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + buf[63] = '%'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + send(cfd, buf, sizeof(buf), 0); + + wait_for_signal(pipefd[0]); + + /* Test 4: Test for 1byte OOB msg */ + memset(buf, 'x', sizeof(buf)); + buf[0] = '@'; + send(cfd, buf, 1, MSG_OOB); + } +} + +int +main(int argc, char **argv) +{ + int lfd, pfd; + struct sockaddr_un consumer_addr, paddr; + socklen_t len = sizeof(consumer_addr); + char buf[1024]; + int on = 0; + char oob; + int flags; + int atmark; + char *tmp_file; + + lfd = socket(AF_UNIX, SOCK_STREAM, 0); + memset(&consumer_addr, 0, sizeof(consumer_addr)); + consumer_addr.sun_family = AF_UNIX; + sprintf(sock_name, "unix_oob_%d", getpid()); + unlink(sock_name); + strcpy(consumer_addr.sun_path, sock_name); + + if ((bind(lfd, (struct sockaddr *)&consumer_addr, + sizeof(consumer_addr))) != 0) { + perror("socket bind failed"); + exit(1); + } + + pipe(pipefd); + + listen(lfd, 1); + + producer_id = fork(); + if (producer_id == 0) { + producer(&consumer_addr); + exit(0); + } + + set_sig_handler(SIGURG); + signal_producer(pipefd[1]); + + pfd = accept(lfd, (struct sockaddr *) &paddr, &len); + fcntl(pfd, F_SETOWN, getpid()); + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 1: + * veriyf that SIGURG is + * delivered and 63 bytes are + * read and oob is '@' + */ + wait_for_data(pfd, POLLIN | POLLPRI); + read_oob(pfd, &oob); + len = read_data(pfd, buf, 1024); + if (!signal_recvd || len != 63 || oob != '@') { + fprintf(stderr, "Test 1 failed sigurg %d len %d %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 2: + * Verify that the first OOB is over written by + * the 2nd one and the first OOB is returned as + * part of the read, and sigurg is received. + */ + wait_for_data(pfd, POLLIN | POLLPRI); + len = 0; + while (len < 70) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + read_oob(pfd, &oob); + if (!signal_recvd || len != 127 || oob != '#') { + fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 3: + * verify that 2nd oob over writes + * the first one and read breaks at + * oob boundary returning 127 bytes + * and sigurg is received and atmark + * is set. + * oob is '%' and second read returns + * 64 bytes. + */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 150) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + read_oob(pfd, &oob); + + if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) { + fprintf(stderr, + "Test 3 failed, sigurg %d len %d OOB %c atmark %d\n", + signal_recvd, len, oob, atmark); + die(1); + } + + signal_recvd = 0; + + len = read_data(pfd, buf, 1024); + if (len != 64) { + fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 4: + * verify that a single byte + * oob message is delivered. + * set non blocking mode and + * check proper error is + * returned and sigurg is + * received and correct + * oob is read. + */ + + set_filemode(pfd, 0); + + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + if ((len == -1) && (errno == 11)) + len = 0; + + read_oob(pfd, &oob); + + if (!signal_recvd || len != 0 || oob != '@') { + fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + set_filemode(pfd, 1); + + /* Inline Testing */ + + on = 1; + if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) { + perror("SO_OOBINLINE"); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 1 -- Inline: + * Check that SIGURG is + * delivered and 63 bytes are + * read and oob is '@' + */ + + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + + if (!signal_recvd || len != 63) { + fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n", + signal_recvd, len); + die(1); + } + + len = read_data(pfd, buf, 1024); + + if (len != 1) { + fprintf(stderr, + "Test 1.1 Inline failed, sigurg %d len %d oob %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 2 -- Inline: + * Verify that the first OOB is over written by + * the 2nd one and read breaks correctly on + * 2nd OOB boundary with the first OOB returned as + * part of the read, and sigurg is delivered and + * siocatmark returns true. + * next read returns one byte, the oob byte + * and siocatmark returns false. + */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 70) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 127 || atmark != 1 || !signal_recvd) { + fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n", + len, atmark); + die(1); + } + + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 1 || buf[0] != '#' || atmark == 1) { + fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n", + len, buf[0], atmark); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 3 -- Inline: + * verify that 2nd oob over writes + * the first one and read breaks at + * oob boundary returning 127 bytes + * and sigurg is received and siocatmark + * is true after the read. + * subsequent read returns 65 bytes + * because of oob which should be '%'. + */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 126) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (!signal_recvd || len != 127 || !atmark) { + fprintf(stderr, + "Test 3 Inline failed, sigurg %d len %d data %c\n", + signal_recvd, len, buf[0]); + die(1); + } + + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 65 || buf[0] != '%' || atmark != 0) { + fprintf(stderr, + "Test 3.1 Inline failed, len %d oob %c atmark %d\n", + len, buf[0], atmark); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 4 -- Inline: + * verify that a single + * byte oob message is delivered + * and read returns one byte, the oob + * byte and sigurg is received + */ + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + if (!signal_recvd || len != 1 || buf[0] != '@') { + fprintf(stderr, + "Test 4 Inline failed, signal %d len %d data %c\n", + signal_recvd, len, buf[0]); + die(1); + } + die(0); +} diff --git a/tools/testing/selftests/net/altnames.sh b/tools/testing/selftests/net/altnames.sh index 4254ddc3f70b..1ef9e4159bba 100755 --- a/tools/testing/selftests/net/altnames.sh +++ b/tools/testing/selftests/net/altnames.sh @@ -45,7 +45,7 @@ altnames_test() check_err $? "Got unexpected long alternative name from link show JSON" ip link property del $DUMMY_DEV altname $SHORT_NAME - check_err $? "Failed to add short alternative name" + check_err $? "Failed to delete short alternative name" ip -j -p link show $SHORT_NAME &>/dev/null check_fail $? "Unexpected success while trying to do link show with deleted short alternative name" diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 614d5477365a..86ab429fe7f3 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -1,4 +1,5 @@ CONFIG_USER_NS=y +CONFIG_NET_NS=y CONFIG_BPF_SYSCALL=y CONFIG_TEST_BPF=m CONFIG_NUMA=y @@ -41,3 +42,5 @@ CONFIG_NET_CLS_FLOWER=m CONFIG_NET_ACT_TUNNEL_KEY=m CONFIG_NET_ACT_MIRRED=m CONFIG_BAREUDP=m +CONFIG_IPV6_IOAM6_LWTUNNEL=y +CONFIG_CRYPTO_SM4=y diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py index 834066d465fc..2b5d6ff87373 100755 --- a/tools/testing/selftests/net/devlink_port_split.py +++ b/tools/testing/selftests/net/devlink_port_split.py @@ -18,6 +18,8 @@ import sys # +# Kselftest framework requirement - SKIP code is 4 +KSFT_SKIP=4 Port = collections.namedtuple('Port', 'bus_info name') @@ -239,7 +241,11 @@ def main(cmdline=None): assert stderr == "" devs = json.loads(stdout)['dev'] - dev = list(devs.keys())[0] + if devs: + dev = list(devs.keys())[0] + else: + print("no devlink device was found, test skipped") + sys.exit(KSFT_SKIP) cmd = "devlink dev show %s" % dev stdout, stderr = run_command(cmd) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index a8ad92850e63..3313566ce906 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -37,6 +37,9 @@ # # server / client nomenclature relative to ns-A +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + VERBOSE=0 NSA_DEV=eth1 @@ -286,6 +289,12 @@ set_sysctl() run_cmd sysctl -q -w $* } +# get sysctl values in NS-A +get_sysctl() +{ + ${NSA_CMD} sysctl -n $* +} + ################################################################################ # Setup for tests @@ -436,10 +445,13 @@ cleanup() ip -netns ${NSA} link set dev ${NSA_DEV} down ip -netns ${NSA} link del dev ${NSA_DEV} + ip netns pids ${NSA} | xargs kill 2>/dev/null ip netns del ${NSA} fi + ip netns pids ${NSB} | xargs kill 2>/dev/null ip netns del ${NSB} + ip netns pids ${NSC} | xargs kill 2>/dev/null ip netns del ${NSC} >/dev/null 2>&1 } @@ -1000,6 +1012,60 @@ ipv4_tcp_md5() run_cmd nettest -s -I ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET} log_test $? 1 "MD5: VRF: Device must be a VRF - prefix" + test_ipv4_md5_vrf__vrf_server__no_bind_ifindex + test_ipv4_md5_vrf__global_server__bind_ifindex0 +} + +test_ipv4_md5_vrf__vrf_server__no_bind_ifindex() +{ + log_start + show_hint "Simulates applications using VRF without TCP_MD5SIG_FLAG_IFINDEX" + run_cmd nettest -s -I ${VRF} -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: VRF-bound server, unbound key accepts connection" + + log_start + show_hint "Binding both the socket and the key is not required but it works" + run_cmd nettest -s -I ${VRF} -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: VRF-bound server, bound key accepts connection" +} + +test_ipv4_md5_vrf__global_server__bind_ifindex0() +{ + # This particular test needs tcp_l3mdev_accept=1 for Global server to accept VRF connections + local old_tcp_l3mdev_accept + old_tcp_l3mdev_accept=$(get_sysctl net.ipv4.tcp_l3mdev_accept) + set_sysctl net.ipv4.tcp_l3mdev_accept=1 + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 2 "MD5: VRF: Global server, Key bound to ifindex=0 rejects VRF connection" + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsc nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key bound to ifindex=0 accepts non-VRF connection" + log_start + + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key not bound to ifindex accepts VRF connection" + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsc nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key not bound to ifindex accepts non-VRF connection" + + # restore value + set_sysctl net.ipv4.tcp_l3mdev_accept="$old_tcp_l3mdev_accept" } ipv4_tcp_novrf() @@ -3879,6 +3945,32 @@ use_case_ping_lla_multi() log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C" } +# Perform IPv{4,6} SNAT on ns-A, and verify TCP connection is successfully +# established with ns-B. +use_case_snat_on_vrf() +{ + setup "yes" + + local port="12345" + + run_cmd iptables -t nat -A POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP} -o ${VRF} + run_cmd ip6tables -t nat -A POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP6} -o ${VRF} + + run_cmd_nsb nettest -s -l ${NSB_IP} -p ${port} & + sleep 1 + run_cmd nettest -d ${VRF} -r ${NSB_IP} -p ${port} + log_test $? 0 "IPv4 TCP connection over VRF with SNAT" + + run_cmd_nsb nettest -6 -s -l ${NSB_IP6} -p ${port} & + sleep 1 + run_cmd nettest -6 -d ${VRF} -r ${NSB_IP6} -p ${port} + log_test $? 0 "IPv6 TCP connection over VRF with SNAT" + + # Cleanup + run_cmd iptables -t nat -D POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP} -o ${VRF} + run_cmd ip6tables -t nat -D POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP6} -o ${VRF} +} + use_cases() { log_section "Use cases" @@ -3886,6 +3978,8 @@ use_cases() use_case_br log_subsection "Ping LLA with multiple interfaces" use_case_ping_lla_multi + log_subsection "SNAT on VRF" + use_case_snat_on_vrf } ################################################################################ @@ -3946,7 +4040,7 @@ fi which nettest >/dev/null if [ $? -ne 0 ]; then echo "'nettest' command not found; skipping tests" - exit 0 + exit $ksft_skip fi declare -i nfail=0 diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 49774a8a7736..0d293391e9a4 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -925,6 +925,14 @@ ipv6_fcnal_runtime() run_cmd "$IP nexthop add id 86 via 2001:db8:91::2 dev veth1" run_cmd "$IP ro add 2001:db8:101::1/128 nhid 81" + # route can not use prefsrc with nexthops + run_cmd "$IP ro add 2001:db8:101::2/128 nhid 86 from 2001:db8:91::1" + log_test $? 2 "IPv6 route can not use src routing with external nexthop" + + # check cleanup path on invalid metric + run_cmd "$IP ro add 2001:db8:101::2/128 nhid 86 congctl lock foo" + log_test $? 2 "IPv6 route with invalid metric" + # rpfilter and default route $IP nexthop flush >/dev/null 2>&1 run_cmd "ip netns exec me ip6tables -t mangle -I PREROUTING 1 -m rpfilter --invert -j DROP" @@ -1366,6 +1374,10 @@ ipv4_fcnal_runtime() run_cmd "$IP nexthop replace id 22 via 172.16.2.2 dev veth3" log_test $? 2 "Nexthop replace with invalid scope for existing route" + # check cleanup path on invalid metric + run_cmd "$IP ro add 172.16.101.2/32 nhid 22 congctl lock foo" + log_test $? 2 "IPv4 route with invalid metric" + # # add route with nexthop and check traffic # diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index a93e6b690e06..43ea8407a82e 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -3,6 +3,9 @@ # This test is for checking IPv4 and IPv6 FIB rules API +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ret=0 PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} @@ -238,12 +241,12 @@ run_fibrule_tests() if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" - exit 0 + exit $ksft_skip fi if [ ! -x "$(command -v ip)" ]; then echo "SKIP: Could not run test without ip tool" - exit 0 + exit $ksft_skip fi # start clean diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index d97bd6889446..72ee644d47bf 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -9,6 +9,7 @@ TEST_PROGS = bridge_igmp.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath.sh \ + ip6_forward_instats_vrf.sh \ ip6gre_inner_v4_multipath.sh \ ip6gre_inner_v6_multipath.sh \ ipip_flat_gre_key.sh \ diff --git a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh new file mode 100755 index 000000000000..a15d21dc035a --- /dev/null +++ b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh @@ -0,0 +1,364 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test traffic distribution between two paths when using custom hash policy. +# +# +--------------------------------+ +# | H1 | +# | $h1 + | +# | 198.51.100.{2-253}/24 | | +# | 2001:db8:1::{2-fd}/64 | | +# +-------------------------|------+ +# | +# +-------------------------|-------------------------+ +# | SW1 | | +# | $rp1 + | +# | 198.51.100.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | | +# | $rp11 + + $rp12 | +# | 192.0.2.1/28 | | 192.0.2.17/28 | +# | 2001:db8:2::1/64 | | 2001:db8:3::1/64 | +# +------------------|-------------|------------------+ +# | | +# +------------------|-------------|------------------+ +# | SW2 | | | +# | | | | +# | $rp21 + + $rp22 | +# | 192.0.2.2/28 192.0.2.18/28 | +# | 2001:db8:2::2/64 2001:db8:3::2/64 | +# | | +# | | +# | $rp2 + | +# | 203.0.113.1/24 | | +# | 2001:db8:4::1/64 | | +# +-------------------------|-------------------------+ +# | +# +-------------------------|------+ +# | H2 | | +# | $h2 + | +# | 203.0.113.{2-253}/24 | +# | 2001:db8:4::{2-fd}/64 | +# +--------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + custom_hash +" + +NUM_NETIFS=8 +source lib.sh + +h1_create() +{ + simple_if_init $h1 198.51.100.2/24 2001:db8:1::2/64 + ip route add vrf v$h1 default via 198.51.100.1 dev $h1 + ip -6 route add vrf v$h1 default via 2001:db8:1::1 dev $h1 +} + +h1_destroy() +{ + ip -6 route del vrf v$h1 default + ip route del vrf v$h1 default + simple_if_fini $h1 198.51.100.2/24 2001:db8:1::2/64 +} + +sw1_create() +{ + simple_if_init $rp1 198.51.100.1/24 2001:db8:1::1/64 + __simple_if_init $rp11 v$rp1 192.0.2.1/28 2001:db8:2::1/64 + __simple_if_init $rp12 v$rp1 192.0.2.17/28 2001:db8:3::1/64 + + ip route add vrf v$rp1 203.0.113.0/24 \ + nexthop via 192.0.2.2 dev $rp11 \ + nexthop via 192.0.2.18 dev $rp12 + + ip -6 route add vrf v$rp1 2001:db8:4::/64 \ + nexthop via 2001:db8:2::2 dev $rp11 \ + nexthop via 2001:db8:3::2 dev $rp12 +} + +sw1_destroy() +{ + ip -6 route del vrf v$rp1 2001:db8:4::/64 + + ip route del vrf v$rp1 203.0.113.0/24 + + __simple_if_fini $rp12 192.0.2.17/28 2001:db8:3::1/64 + __simple_if_fini $rp11 192.0.2.1/28 2001:db8:2::1/64 + simple_if_fini $rp1 198.51.100.1/24 2001:db8:1::1/64 +} + +sw2_create() +{ + simple_if_init $rp2 203.0.113.1/24 2001:db8:4::1/64 + __simple_if_init $rp21 v$rp2 192.0.2.2/28 2001:db8:2::2/64 + __simple_if_init $rp22 v$rp2 192.0.2.18/28 2001:db8:3::2/64 + + ip route add vrf v$rp2 198.51.100.0/24 \ + nexthop via 192.0.2.1 dev $rp21 \ + nexthop via 192.0.2.17 dev $rp22 + + ip -6 route add vrf v$rp2 2001:db8:1::/64 \ + nexthop via 2001:db8:2::1 dev $rp21 \ + nexthop via 2001:db8:3::1 dev $rp22 +} + +sw2_destroy() +{ + ip -6 route del vrf v$rp2 2001:db8:1::/64 + + ip route del vrf v$rp2 198.51.100.0/24 + + __simple_if_fini $rp22 192.0.2.18/28 2001:db8:3::2/64 + __simple_if_fini $rp21 192.0.2.2/28 2001:db8:2::2/64 + simple_if_fini $rp2 203.0.113.1/24 2001:db8:4::1/64 +} + +h2_create() +{ + simple_if_init $h2 203.0.113.2/24 2001:db8:4::2/64 + ip route add vrf v$h2 default via 203.0.113.1 dev $h2 + ip -6 route add vrf v$h2 default via 2001:db8:4::1 dev $h2 +} + +h2_destroy() +{ + ip -6 route del vrf v$h2 default + ip route del vrf v$h2 default + simple_if_fini $h2 203.0.113.2/24 2001:db8:4::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + + rp1=${NETIFS[p2]} + + rp11=${NETIFS[p3]} + rp21=${NETIFS[p4]} + + rp12=${NETIFS[p5]} + rp22=${NETIFS[p6]} + + rp2=${NETIFS[p7]} + + h2=${NETIFS[p8]} + + vrf_prepare + h1_create + sw1_create + sw2_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 203.0.113.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:4::2 +} + +send_src_ipv4() +{ + $MZ $h1 -q -p 64 -A "198.51.100.2-198.51.100.253" -B 203.0.113.2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B "203.0.113.2-203.0.113.253" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_src_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +send_src_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A "2001:db8:1::2-2001:db8:1::fd" -B 2001:db8:4::2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B "2001:db8:4::2-2001:db8:4::fd" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_flowlabel() +{ + # Generate 16384 echo requests, each with a random flow label. + for _ in $(seq 1 16384); do + ip vrf exec v$h1 \ + $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1 + done +} + +send_src_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:4::2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:4::2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +custom_hash_test() +{ + local field="$1"; shift + local balanced="$1"; shift + local send_flows="$@" + + RET=0 + + local t0_rp11=$(link_stats_tx_packets_get $rp11) + local t0_rp12=$(link_stats_tx_packets_get $rp12) + + $send_flows + + local t1_rp11=$(link_stats_tx_packets_get $rp11) + local t1_rp12=$(link_stats_tx_packets_get $rp12) + + local d_rp11=$((t1_rp11 - t0_rp11)) + local d_rp12=$((t1_rp12 - t0_rp12)) + + local diff=$((d_rp12 - d_rp11)) + local sum=$((d_rp11 + d_rp12)) + + local pct=$(echo "$diff / $sum * 100" | bc -l) + local is_balanced=$(echo "-20 <= $pct && $pct <= 20" | bc) + + [[ ( $is_balanced -eq 1 && $balanced == "balanced" ) || + ( $is_balanced -eq 0 && $balanced == "unbalanced" ) ]] + check_err $? "Expected traffic to be $balanced, but it is not" + + log_test "Multipath hash field: $field ($balanced)" + log_info "Packets sent on path1 / path2: $d_rp11 / $d_rp12" +} + +custom_hash_v4() +{ + log_info "Running IPv4 custom multipath hash tests" + + sysctl_set net.ipv4.fib_multipath_hash_policy 3 + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv4.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0001 + custom_hash_test "Source IP" "balanced" send_src_ipv4 + custom_hash_test "Source IP" "unbalanced" send_dst_ipv4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0002 + custom_hash_test "Destination IP" "balanced" send_dst_ipv4 + custom_hash_test "Destination IP" "unbalanced" send_src_ipv4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0010 + custom_hash_test "Source port" "balanced" send_src_udp4 + custom_hash_test "Source port" "unbalanced" send_dst_udp4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0020 + custom_hash_test "Destination port" "balanced" send_dst_udp4 + custom_hash_test "Destination port" "unbalanced" send_src_udp4 + + sysctl_restore net.ipv4.neigh.default.gc_thresh3 + sysctl_restore net.ipv4.neigh.default.gc_thresh2 + sysctl_restore net.ipv4.neigh.default.gc_thresh1 + + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +custom_hash_v6() +{ + log_info "Running IPv6 custom multipath hash tests" + + sysctl_set net.ipv6.fib_multipath_hash_policy 3 + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv6.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0001 + custom_hash_test "Source IP" "balanced" send_src_ipv6 + custom_hash_test "Source IP" "unbalanced" send_dst_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0002 + custom_hash_test "Destination IP" "balanced" send_dst_ipv6 + custom_hash_test "Destination IP" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0008 + custom_hash_test "Flowlabel" "balanced" send_flowlabel + custom_hash_test "Flowlabel" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0010 + custom_hash_test "Source port" "balanced" send_src_udp6 + custom_hash_test "Source port" "unbalanced" send_dst_udp6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0020 + custom_hash_test "Destination port" "balanced" send_dst_udp6 + custom_hash_test "Destination port" "unbalanced" send_src_udp6 + + sysctl_restore net.ipv6.neigh.default.gc_thresh3 + sysctl_restore net.ipv6.neigh.default.gc_thresh2 + sysctl_restore net.ipv6.neigh.default.gc_thresh1 + + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +custom_hash() +{ + # Test that when the hash policy is set to custom, traffic is + # distributed only according to the fields set in the + # fib_multipath_hash_fields sysctl. + # + # Each time set a different field and make sure traffic is only + # distributed when the field is changed in the packet stream. + custom_hash_v4 + custom_hash_v6 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 9c12c4fd3afc..2c14a86adaaa 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -1,6 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ############################################################################## # Defines @@ -9,15 +12,21 @@ if [[ ! -v DEVLINK_DEV ]]; then | jq -r '.port | keys[]' | cut -d/ -f-2) if [ -z "$DEVLINK_DEV" ]; then echo "SKIP: ${NETIFS[p1]} has no devlink device registered for it" - exit 1 + exit $ksft_skip fi if [[ "$(echo $DEVLINK_DEV | grep -c pci)" -eq 0 ]]; then echo "SKIP: devlink device's bus is not PCI" - exit 1 + exit $ksft_skip fi DEVLINK_VIDDID=$(lspci -s $(echo $DEVLINK_DEV | cut -d"/" -f2) \ -n | cut -d" " -f3) +elif [[ ! -z "$DEVLINK_DEV" ]]; then + devlink dev show $DEVLINK_DEV &> /dev/null + if [ $? -ne 0 ]; then + echo "SKIP: devlink device \"$DEVLINK_DEV\" not found" + exit $ksft_skip + fi fi ############################################################################## @@ -26,19 +35,19 @@ fi devlink help 2>&1 | grep resource &> /dev/null if [ $? -ne 0 ]; then echo "SKIP: iproute2 too old, missing devlink resource support" - exit 1 + exit $ksft_skip fi devlink help 2>&1 | grep trap &> /dev/null if [ $? -ne 0 ]; then echo "SKIP: iproute2 too old, missing devlink trap support" - exit 1 + exit $ksft_skip fi devlink dev help 2>&1 | grep info &> /dev/null if [ $? -ne 0 ]; then echo "SKIP: iproute2 too old, missing devlink dev info support" - exit 1 + exit $ksft_skip fi ############################################################################## @@ -318,6 +327,14 @@ devlink_trap_rx_bytes_get() | jq '.[][][]["stats"]["rx"]["bytes"]' } +devlink_trap_drop_packets_get() +{ + local trap_name=$1; shift + + devlink -js trap show $DEVLINK_DEV trap $trap_name \ + | jq '.[][][]["stats"]["rx"]["dropped"]' +} + devlink_trap_stats_idle_test() { local trap_name=$1; shift @@ -339,6 +356,24 @@ devlink_trap_stats_idle_test() fi } +devlink_trap_drop_stats_idle_test() +{ + local trap_name=$1; shift + local t0_packets t0_bytes + + t0_packets=$(devlink_trap_drop_packets_get $trap_name) + + sleep 1 + + t1_packets=$(devlink_trap_drop_packets_get $trap_name) + + if [[ $t0_packets -eq $t1_packets ]]; then + return 0 + else + return 1 + fi +} + devlink_traps_enable_all() { local trap_name diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index b802c14d2950..e5e2fbeca22e 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -39,3 +39,5 @@ NETIF_CREATE=yes # Timeout (in seconds) before ping exits regardless of how many packets have # been sent or received PING_TIMEOUT=5 +# IPv6 traceroute utility name. +TROUTE6=traceroute6 diff --git a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh new file mode 100755 index 000000000000..a73f52efcb6c --- /dev/null +++ b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh @@ -0,0 +1,456 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test traffic distribution when there are multiple paths between an IPv4 GRE +# tunnel. The tunnel carries IPv4 and IPv6 traffic between multiple hosts. +# Multiple routes are in the underlay network. With the default multipath +# policy, SW2 will only look at the outer IP addresses, hence only a single +# route would be used. +# +# +--------------------------------+ +# | H1 | +# | $h1 + | +# | 198.51.100.{2-253}/24 | | +# | 2001:db8:1::{2-fd}/64 | | +# +-------------------------|------+ +# | +# +-------------------------|------------------+ +# | SW1 | | +# | $ol1 + | +# | 198.51.100.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + g1 (gre) | +# | loc=192.0.2.1 | +# | rem=192.0.2.2 --. | +# | tos=inherit | | +# | v | +# | + $ul1 | +# | | 192.0.2.17/28 | +# +---------------------|----------------------+ +# | +# +---------------------|----------------------+ +# | SW2 | | +# | $ul21 + | +# | 192.0.2.18/28 | | +# | | | +# ! __________________+___ | +# | / \ | +# | | | | +# | + $ul22.111 (vlan) + $ul22.222 (vlan) | +# | | 192.0.2.33/28 | 192.0.2.49/28 | +# | | | | +# +--|----------------------|------------------+ +# | | +# +--|----------------------|------------------+ +# | | | | +# | + $ul32.111 (vlan) + $ul32.222 (vlan) | +# | | 192.0.2.34/28 | 192.0.2.50/28 | +# | | | | +# | \__________________+___/ | +# | | | +# | | | +# | $ul31 + | +# | 192.0.2.65/28 | SW3 | +# +---------------------|----------------------+ +# | +# +---------------------|----------------------+ +# | + $ul4 | +# | ^ 192.0.2.66/28 | +# | | | +# | + g2 (gre) | | +# | loc=192.0.2.2 | | +# | rem=192.0.2.1 --' | +# | tos=inherit | +# | | +# | $ol4 + | +# | 203.0.113.1/24 | | +# | 2001:db8:2::1/64 | SW4 | +# +-------------------------|------------------+ +# | +# +-------------------------|------+ +# | | | +# | $h2 + | +# | 203.0.113.{2-253}/24 | +# | 2001:db8:2::{2-fd}/64 H2 | +# +--------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + custom_hash +" + +NUM_NETIFS=10 +source lib.sh + +h1_create() +{ + simple_if_init $h1 198.51.100.2/24 2001:db8:1::2/64 + ip route add vrf v$h1 default via 198.51.100.1 dev $h1 + ip -6 route add vrf v$h1 default via 2001:db8:1::1 dev $h1 +} + +h1_destroy() +{ + ip -6 route del vrf v$h1 default + ip route del vrf v$h1 default + simple_if_fini $h1 198.51.100.2/24 2001:db8:1::2/64 +} + +sw1_create() +{ + simple_if_init $ol1 198.51.100.1/24 2001:db8:1::1/64 + __simple_if_init $ul1 v$ol1 192.0.2.17/28 + + tunnel_create g1 gre 192.0.2.1 192.0.2.2 tos inherit dev v$ol1 + __simple_if_init g1 v$ol1 192.0.2.1/32 + ip route add vrf v$ol1 192.0.2.2/32 via 192.0.2.18 + + ip route add vrf v$ol1 203.0.113.0/24 dev g1 + ip -6 route add vrf v$ol1 2001:db8:2::/64 dev g1 +} + +sw1_destroy() +{ + ip -6 route del vrf v$ol1 2001:db8:2::/64 + ip route del vrf v$ol1 203.0.113.0/24 + + ip route del vrf v$ol1 192.0.2.2/32 + __simple_if_fini g1 192.0.2.1/32 + tunnel_destroy g1 + + __simple_if_fini $ul1 192.0.2.17/28 + simple_if_fini $ol1 198.51.100.1/24 2001:db8:1::1/64 +} + +sw2_create() +{ + simple_if_init $ul21 192.0.2.18/28 + __simple_if_init $ul22 v$ul21 + vlan_create $ul22 111 v$ul21 192.0.2.33/28 + vlan_create $ul22 222 v$ul21 192.0.2.49/28 + + ip route add vrf v$ul21 192.0.2.1/32 via 192.0.2.17 + ip route add vrf v$ul21 192.0.2.2/32 \ + nexthop via 192.0.2.34 \ + nexthop via 192.0.2.50 +} + +sw2_destroy() +{ + ip route del vrf v$ul21 192.0.2.2/32 + ip route del vrf v$ul21 192.0.2.1/32 + + vlan_destroy $ul22 222 + vlan_destroy $ul22 111 + __simple_if_fini $ul22 + simple_if_fini $ul21 192.0.2.18/28 +} + +sw3_create() +{ + simple_if_init $ul31 192.0.2.65/28 + __simple_if_init $ul32 v$ul31 + vlan_create $ul32 111 v$ul31 192.0.2.34/28 + vlan_create $ul32 222 v$ul31 192.0.2.50/28 + + ip route add vrf v$ul31 192.0.2.2/32 via 192.0.2.66 + ip route add vrf v$ul31 192.0.2.1/32 \ + nexthop via 192.0.2.33 \ + nexthop via 192.0.2.49 + + tc qdisc add dev $ul32 clsact + tc filter add dev $ul32 ingress pref 111 prot 802.1Q \ + flower vlan_id 111 action pass + tc filter add dev $ul32 ingress pref 222 prot 802.1Q \ + flower vlan_id 222 action pass +} + +sw3_destroy() +{ + tc qdisc del dev $ul32 clsact + + ip route del vrf v$ul31 192.0.2.1/32 + ip route del vrf v$ul31 192.0.2.2/32 + + vlan_destroy $ul32 222 + vlan_destroy $ul32 111 + __simple_if_fini $ul32 + simple_if_fini $ul31 192.0.2.65/28 +} + +sw4_create() +{ + simple_if_init $ol4 203.0.113.1/24 2001:db8:2::1/64 + __simple_if_init $ul4 v$ol4 192.0.2.66/28 + + tunnel_create g2 gre 192.0.2.2 192.0.2.1 tos inherit dev v$ol4 + __simple_if_init g2 v$ol4 192.0.2.2/32 + ip route add vrf v$ol4 192.0.2.1/32 via 192.0.2.65 + + ip route add vrf v$ol4 198.51.100.0/24 dev g2 + ip -6 route add vrf v$ol4 2001:db8:1::/64 dev g2 +} + +sw4_destroy() +{ + ip -6 route del vrf v$ol4 2001:db8:1::/64 + ip route del vrf v$ol4 198.51.100.0/24 + + ip route del vrf v$ol4 192.0.2.1/32 + __simple_if_fini g2 192.0.2.2/32 + tunnel_destroy g2 + + __simple_if_fini $ul4 192.0.2.66/28 + simple_if_fini $ol4 203.0.113.1/24 2001:db8:2::1/64 +} + +h2_create() +{ + simple_if_init $h2 203.0.113.2/24 2001:db8:2::2/64 + ip route add vrf v$h2 default via 203.0.113.1 dev $h2 + ip -6 route add vrf v$h2 default via 2001:db8:2::1 dev $h2 +} + +h2_destroy() +{ + ip -6 route del vrf v$h2 default + ip route del vrf v$h2 default + simple_if_fini $h2 203.0.113.2/24 2001:db8:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + + ol1=${NETIFS[p2]} + ul1=${NETIFS[p3]} + + ul21=${NETIFS[p4]} + ul22=${NETIFS[p5]} + + ul32=${NETIFS[p6]} + ul31=${NETIFS[p7]} + + ul4=${NETIFS[p8]} + ol4=${NETIFS[p9]} + + h2=${NETIFS[p10]} + + vrf_prepare + h1_create + sw1_create + sw2_create + sw3_create + sw4_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw4_destroy + sw3_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 203.0.113.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +send_src_ipv4() +{ + $MZ $h1 -q -p 64 -A "198.51.100.2-198.51.100.253" -B 203.0.113.2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B "203.0.113.2-203.0.113.253" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_src_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +send_src_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A "2001:db8:1::2-2001:db8:1::fd" -B 2001:db8:2::2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B "2001:db8:2::2-2001:db8:2::fd" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_flowlabel() +{ + # Generate 16384 echo requests, each with a random flow label. + for _ in $(seq 1 16384); do + ip vrf exec v$h1 \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 + done +} + +send_src_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +custom_hash_test() +{ + local field="$1"; shift + local balanced="$1"; shift + local send_flows="$@" + + RET=0 + + local t0_111=$(tc_rule_stats_get $ul32 111 ingress) + local t0_222=$(tc_rule_stats_get $ul32 222 ingress) + + $send_flows + + local t1_111=$(tc_rule_stats_get $ul32 111 ingress) + local t1_222=$(tc_rule_stats_get $ul32 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + + local diff=$((d222 - d111)) + local sum=$((d111 + d222)) + + local pct=$(echo "$diff / $sum * 100" | bc -l) + local is_balanced=$(echo "-20 <= $pct && $pct <= 20" | bc) + + [[ ( $is_balanced -eq 1 && $balanced == "balanced" ) || + ( $is_balanced -eq 0 && $balanced == "unbalanced" ) ]] + check_err $? "Expected traffic to be $balanced, but it is not" + + log_test "Multipath hash field: $field ($balanced)" + log_info "Packets sent on path1 / path2: $d111 / $d222" +} + +custom_hash_v4() +{ + log_info "Running IPv4 overlay custom multipath hash tests" + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv4.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0040 + custom_hash_test "Inner source IP" "balanced" send_src_ipv4 + custom_hash_test "Inner source IP" "unbalanced" send_dst_ipv4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0080 + custom_hash_test "Inner destination IP" "balanced" send_dst_ipv4 + custom_hash_test "Inner destination IP" "unbalanced" send_src_ipv4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0400 + custom_hash_test "Inner source port" "balanced" send_src_udp4 + custom_hash_test "Inner source port" "unbalanced" send_dst_udp4 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0800 + custom_hash_test "Inner destination port" "balanced" send_dst_udp4 + custom_hash_test "Inner destination port" "unbalanced" send_src_udp4 + + sysctl_restore net.ipv4.neigh.default.gc_thresh3 + sysctl_restore net.ipv4.neigh.default.gc_thresh2 + sysctl_restore net.ipv4.neigh.default.gc_thresh1 +} + +custom_hash_v6() +{ + log_info "Running IPv6 overlay custom multipath hash tests" + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv6.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0040 + custom_hash_test "Inner source IP" "balanced" send_src_ipv6 + custom_hash_test "Inner source IP" "unbalanced" send_dst_ipv6 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0080 + custom_hash_test "Inner destination IP" "balanced" send_dst_ipv6 + custom_hash_test "Inner destination IP" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0200 + custom_hash_test "Inner flowlabel" "balanced" send_flowlabel + custom_hash_test "Inner flowlabel" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0400 + custom_hash_test "Inner source port" "balanced" send_src_udp6 + custom_hash_test "Inner source port" "unbalanced" send_dst_udp6 + + sysctl_set net.ipv4.fib_multipath_hash_fields 0x0800 + custom_hash_test "Inner destination port" "balanced" send_dst_udp6 + custom_hash_test "Inner destination port" "unbalanced" send_src_udp6 + + sysctl_restore net.ipv6.neigh.default.gc_thresh3 + sysctl_restore net.ipv6.neigh.default.gc_thresh2 + sysctl_restore net.ipv6.neigh.default.gc_thresh1 +} + +custom_hash() +{ + # Test that when the hash policy is set to custom, traffic is + # distributed only according to the fields set in the + # fib_multipath_hash_fields sysctl. + # + # Each time set a different field and make sure traffic is only + # distributed when the field is changed in the packet stream. + + sysctl_set net.ipv4.fib_multipath_hash_policy 3 + + custom_hash_v4 + custom_hash_v6 + + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh new file mode 100755 index 000000000000..9f5b3e2e5e95 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test ipv6 stats on the incoming if when forwarding with VRF + +ALL_TESTS=" + ipv6_ping + ipv6_in_too_big_err + ipv6_in_hdr_err + ipv6_in_addr_err + ipv6_in_discard +" + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 2001:1:1::2/64 + ip -6 route add vrf v$h1 2001:1:2::/64 via 2001:1:1::1 +} + +h1_destroy() +{ + ip -6 route del vrf v$h1 2001:1:2::/64 via 2001:1:1::1 + simple_if_fini $h1 2001:1:1::2/64 +} + +router_create() +{ + vrf_create router + __simple_if_init $rtr1 router 2001:1:1::1/64 + __simple_if_init $rtr2 router 2001:1:2::1/64 + mtu_set $rtr2 1280 +} + +router_destroy() +{ + mtu_restore $rtr2 + __simple_if_fini $rtr2 2001:1:2::1/64 + __simple_if_fini $rtr1 2001:1:1::1/64 + vrf_destroy router +} + +h2_create() +{ + simple_if_init $h2 2001:1:2::2/64 + ip -6 route add vrf v$h2 2001:1:1::/64 via 2001:1:2::1 + mtu_set $h2 1280 +} + +h2_destroy() +{ + mtu_restore $h2 + ip -6 route del vrf v$h2 2001:1:1::/64 via 2001:1:2::1 + simple_if_fini $h2 2001:1:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rtr1=${NETIFS[p2]} + + rtr2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + h1_create + router_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + router_destroy + h1_destroy + vrf_cleanup +} + +ipv6_in_too_big_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors) + local vrf_name=$(master_name_get $h1) + + # Send too big packets + ip vrf exec $vrf_name \ + $PING6 -s 1300 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + + local t1=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InTooBigErrors" +} + +ipv6_in_hdr_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InHdrErrors) + local vrf_name=$(master_name_get $h1) + + # Send packets with hop limit 1, easiest with traceroute6 as some ping6 + # doesn't allow hop limit to be specified + ip vrf exec $vrf_name \ + $TROUTE6 2001:1:2::2 &> /dev/null + + local t1=$(ipv6_stats_get $rtr1 Ip6InHdrErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InHdrErrors" +} + +ipv6_in_addr_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InAddrErrors) + local vrf_name=$(master_name_get $h1) + + # Disable forwarding temporary while sending the packet + sysctl -qw net.ipv6.conf.all.forwarding=0 + ip vrf exec $vrf_name \ + $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + sysctl -qw net.ipv6.conf.all.forwarding=1 + + local t1=$(ipv6_stats_get $rtr1 Ip6InAddrErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InAddrErrors" +} + +ipv6_in_discard() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InDiscards) + local vrf_name=$(master_name_get $h1) + + # Add a policy to discard + ip xfrm policy add dst 2001:1:2::2/128 dir fwd action block + ip vrf exec $vrf_name \ + $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + ip xfrm policy del dst 2001:1:2::2/128 dir fwd + + local t1=$(ipv6_stats_get $rtr1 Ip6InDiscards) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InDiscards" +} +ipv6_ping() +{ + RET=0 + + ping6_test $h1 2001:1:2::2 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh new file mode 100755 index 000000000000..8fea2c2e0b25 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh @@ -0,0 +1,458 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test traffic distribution when there are multiple paths between an IPv6 GRE +# tunnel. The tunnel carries IPv4 and IPv6 traffic between multiple hosts. +# Multiple routes are in the underlay network. With the default multipath +# policy, SW2 will only look at the outer IP addresses, hence only a single +# route would be used. +# +# +--------------------------------+ +# | H1 | +# | $h1 + | +# | 198.51.100.{2-253}/24 | | +# | 2001:db8:1::{2-fd}/64 | | +# +-------------------------|------+ +# | +# +-------------------------|-------------------+ +# | SW1 | | +# | $ol1 + | +# | 198.51.100.1/24 | +# | 2001:db8:1::1/64 | +# | | +# |+ g1 (ip6gre) | +# | loc=2001:db8:3::1 | +# | rem=2001:db8:3::2 -. | +# | tos=inherit | | +# | v | +# | + $ul1 | +# | | 2001:db8:10::1/64 | +# +---------------------|-----------------------+ +# | +# +---------------------|-----------------------+ +# | SW2 | | +# | $ul21 + | +# | 2001:db8:10::2/64 | | +# | | | +# ! __________________+___ | +# | / \ | +# | | | | +# | + $ul22.111 (vlan) + $ul22.222 (vlan) | +# | | 2001:db8:11::1/64 | 2001:db8:12::1/64 | +# | | | | +# +--|----------------------|-------------------+ +# | | +# +--|----------------------|-------------------+ +# | | | | +# | + $ul32.111 (vlan) + $ul32.222 (vlan) | +# | | 2001:db8:11::2/64 | 2001:db8:12::2/64 | +# | | | | +# | \__________________+___/ | +# | | | +# | | | +# | $ul31 + | +# | 2001:db8:13::1/64 | SW3 | +# +---------------------|-----------------------+ +# | +# +---------------------|-----------------------+ +# | + $ul4 | +# | ^ 2001:db8:13::2/64 | +# | | | +# |+ g2 (ip6gre) | | +# | loc=2001:db8:3::2 | | +# | rem=2001:db8:3::1 -' | +# | tos=inherit | +# | | +# | $ol4 + | +# | 203.0.113.1/24 | | +# | 2001:db8:2::1/64 | SW4 | +# +-------------------------|-------------------+ +# | +# +-------------------------|------+ +# | | | +# | $h2 + | +# | 203.0.113.{2-253}/24 | +# | 2001:db8:2::{2-fd}/64 H2 | +# +--------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + custom_hash +" + +NUM_NETIFS=10 +source lib.sh + +h1_create() +{ + simple_if_init $h1 198.51.100.2/24 2001:db8:1::2/64 + ip route add vrf v$h1 default via 198.51.100.1 dev $h1 + ip -6 route add vrf v$h1 default via 2001:db8:1::1 dev $h1 +} + +h1_destroy() +{ + ip -6 route del vrf v$h1 default + ip route del vrf v$h1 default + simple_if_fini $h1 198.51.100.2/24 2001:db8:1::2/64 +} + +sw1_create() +{ + simple_if_init $ol1 198.51.100.1/24 2001:db8:1::1/64 + __simple_if_init $ul1 v$ol1 2001:db8:10::1/64 + + tunnel_create g1 ip6gre 2001:db8:3::1 2001:db8:3::2 tos inherit \ + dev v$ol1 + __simple_if_init g1 v$ol1 2001:db8:3::1/128 + ip route add vrf v$ol1 2001:db8:3::2/128 via 2001:db8:10::2 + + ip route add vrf v$ol1 203.0.113.0/24 dev g1 + ip -6 route add vrf v$ol1 2001:db8:2::/64 dev g1 +} + +sw1_destroy() +{ + ip -6 route del vrf v$ol1 2001:db8:2::/64 + ip route del vrf v$ol1 203.0.113.0/24 + + ip route del vrf v$ol1 2001:db8:3::2/128 + __simple_if_fini g1 2001:db8:3::1/128 + tunnel_destroy g1 + + __simple_if_fini $ul1 2001:db8:10::1/64 + simple_if_fini $ol1 198.51.100.1/24 2001:db8:1::1/64 +} + +sw2_create() +{ + simple_if_init $ul21 2001:db8:10::2/64 + __simple_if_init $ul22 v$ul21 + vlan_create $ul22 111 v$ul21 2001:db8:11::1/64 + vlan_create $ul22 222 v$ul21 2001:db8:12::1/64 + + ip -6 route add vrf v$ul21 2001:db8:3::1/128 via 2001:db8:10::1 + ip -6 route add vrf v$ul21 2001:db8:3::2/128 \ + nexthop via 2001:db8:11::2 \ + nexthop via 2001:db8:12::2 +} + +sw2_destroy() +{ + ip -6 route del vrf v$ul21 2001:db8:3::2/128 + ip -6 route del vrf v$ul21 2001:db8:3::1/128 + + vlan_destroy $ul22 222 + vlan_destroy $ul22 111 + __simple_if_fini $ul22 + simple_if_fini $ul21 2001:db8:10::2/64 +} + +sw3_create() +{ + simple_if_init $ul31 2001:db8:13::1/64 + __simple_if_init $ul32 v$ul31 + vlan_create $ul32 111 v$ul31 2001:db8:11::2/64 + vlan_create $ul32 222 v$ul31 2001:db8:12::2/64 + + ip -6 route add vrf v$ul31 2001:db8:3::2/128 via 2001:db8:13::2 + ip -6 route add vrf v$ul31 2001:db8:3::1/128 \ + nexthop via 2001:db8:11::1 \ + nexthop via 2001:db8:12::1 + + tc qdisc add dev $ul32 clsact + tc filter add dev $ul32 ingress pref 111 prot 802.1Q \ + flower vlan_id 111 action pass + tc filter add dev $ul32 ingress pref 222 prot 802.1Q \ + flower vlan_id 222 action pass +} + +sw3_destroy() +{ + tc qdisc del dev $ul32 clsact + + ip -6 route del vrf v$ul31 2001:db8:3::1/128 + ip -6 route del vrf v$ul31 2001:db8:3::2/128 + + vlan_destroy $ul32 222 + vlan_destroy $ul32 111 + __simple_if_fini $ul32 + simple_if_fini $ul31 2001:db8:13::1/64 +} + +sw4_create() +{ + simple_if_init $ol4 203.0.113.1/24 2001:db8:2::1/64 + __simple_if_init $ul4 v$ol4 2001:db8:13::2/64 + + tunnel_create g2 ip6gre 2001:db8:3::2 2001:db8:3::1 tos inherit \ + dev v$ol4 + __simple_if_init g2 v$ol4 2001:db8:3::2/128 + ip -6 route add vrf v$ol4 2001:db8:3::1/128 via 2001:db8:13::1 + + ip route add vrf v$ol4 198.51.100.0/24 dev g2 + ip -6 route add vrf v$ol4 2001:db8:1::/64 dev g2 +} + +sw4_destroy() +{ + ip -6 route del vrf v$ol4 2001:db8:1::/64 + ip route del vrf v$ol4 198.51.100.0/24 + + ip -6 route del vrf v$ol4 2001:db8:3::1/128 + __simple_if_fini g2 2001:db8:3::2/128 + tunnel_destroy g2 + + __simple_if_fini $ul4 2001:db8:13::2/64 + simple_if_fini $ol4 203.0.113.1/24 2001:db8:2::1/64 +} + +h2_create() +{ + simple_if_init $h2 203.0.113.2/24 2001:db8:2::2/64 + ip route add vrf v$h2 default via 203.0.113.1 dev $h2 + ip -6 route add vrf v$h2 default via 2001:db8:2::1 dev $h2 +} + +h2_destroy() +{ + ip -6 route del vrf v$h2 default + ip route del vrf v$h2 default + simple_if_fini $h2 203.0.113.2/24 2001:db8:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + + ol1=${NETIFS[p2]} + ul1=${NETIFS[p3]} + + ul21=${NETIFS[p4]} + ul22=${NETIFS[p5]} + + ul32=${NETIFS[p6]} + ul31=${NETIFS[p7]} + + ul4=${NETIFS[p8]} + ol4=${NETIFS[p9]} + + h2=${NETIFS[p10]} + + vrf_prepare + h1_create + sw1_create + sw2_create + sw3_create + sw4_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw4_destroy + sw3_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 203.0.113.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +send_src_ipv4() +{ + $MZ $h1 -q -p 64 -A "198.51.100.2-198.51.100.253" -B 203.0.113.2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B "203.0.113.2-203.0.113.253" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_src_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp4() +{ + $MZ $h1 -q -p 64 -A 198.51.100.2 -B 203.0.113.2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +send_src_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A "2001:db8:1::2-2001:db8:1::fd" -B 2001:db8:2::2 \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_dst_ipv6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B "2001:db8:2::2-2001:db8:2::fd" \ + -d 1msec -c 50 -t udp "sp=20000,dp=30000" +} + +send_flowlabel() +{ + # Generate 16384 echo requests, each with a random flow label. + for _ in $(seq 1 16384); do + ip vrf exec v$h1 \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 + done +} + +send_src_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=0-32768,dp=30000" +} + +send_dst_udp6() +{ + $MZ -6 $h1 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=20000,dp=0-32768" +} + +custom_hash_test() +{ + local field="$1"; shift + local balanced="$1"; shift + local send_flows="$@" + + RET=0 + + local t0_111=$(tc_rule_stats_get $ul32 111 ingress) + local t0_222=$(tc_rule_stats_get $ul32 222 ingress) + + $send_flows + + local t1_111=$(tc_rule_stats_get $ul32 111 ingress) + local t1_222=$(tc_rule_stats_get $ul32 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + + local diff=$((d222 - d111)) + local sum=$((d111 + d222)) + + local pct=$(echo "$diff / $sum * 100" | bc -l) + local is_balanced=$(echo "-20 <= $pct && $pct <= 20" | bc) + + [[ ( $is_balanced -eq 1 && $balanced == "balanced" ) || + ( $is_balanced -eq 0 && $balanced == "unbalanced" ) ]] + check_err $? "Expected traffic to be $balanced, but it is not" + + log_test "Multipath hash field: $field ($balanced)" + log_info "Packets sent on path1 / path2: $d111 / $d222" +} + +custom_hash_v4() +{ + log_info "Running IPv4 overlay custom multipath hash tests" + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv4.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv4.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0040 + custom_hash_test "Inner source IP" "balanced" send_src_ipv4 + custom_hash_test "Inner source IP" "unbalanced" send_dst_ipv4 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0080 + custom_hash_test "Inner destination IP" "balanced" send_dst_ipv4 + custom_hash_test "Inner destination IP" "unbalanced" send_src_ipv4 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0400 + custom_hash_test "Inner source port" "balanced" send_src_udp4 + custom_hash_test "Inner source port" "unbalanced" send_dst_udp4 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0800 + custom_hash_test "Inner destination port" "balanced" send_dst_udp4 + custom_hash_test "Inner destination port" "unbalanced" send_src_udp4 + + sysctl_restore net.ipv4.neigh.default.gc_thresh3 + sysctl_restore net.ipv4.neigh.default.gc_thresh2 + sysctl_restore net.ipv4.neigh.default.gc_thresh1 +} + +custom_hash_v6() +{ + log_info "Running IPv6 overlay custom multipath hash tests" + + # Prevent the neighbour table from overflowing, as different neighbour + # entries will be created on $ol4 when using different destination IPs. + sysctl_set net.ipv6.neigh.default.gc_thresh1 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh2 1024 + sysctl_set net.ipv6.neigh.default.gc_thresh3 1024 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0040 + custom_hash_test "Inner source IP" "balanced" send_src_ipv6 + custom_hash_test "Inner source IP" "unbalanced" send_dst_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0080 + custom_hash_test "Inner destination IP" "balanced" send_dst_ipv6 + custom_hash_test "Inner destination IP" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0200 + custom_hash_test "Inner flowlabel" "balanced" send_flowlabel + custom_hash_test "Inner flowlabel" "unbalanced" send_src_ipv6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0400 + custom_hash_test "Inner source port" "balanced" send_src_udp6 + custom_hash_test "Inner source port" "unbalanced" send_dst_udp6 + + sysctl_set net.ipv6.fib_multipath_hash_fields 0x0800 + custom_hash_test "Inner destination port" "balanced" send_dst_udp6 + custom_hash_test "Inner destination port" "unbalanced" send_src_udp6 + + sysctl_restore net.ipv6.neigh.default.gc_thresh3 + sysctl_restore net.ipv6.neigh.default.gc_thresh2 + sysctl_restore net.ipv6.neigh.default.gc_thresh1 +} + +custom_hash() +{ + # Test that when the hash policy is set to custom, traffic is + # distributed only according to the fields set in the + # fib_multipath_hash_fields sysctl. + # + # Each time set a different field and make sure traffic is only + # distributed when the field is changed in the packet stream. + + sysctl_set net.ipv6.fib_multipath_hash_policy 3 + + custom_hash_v4 + custom_hash_v6 + + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 42e28c983d41..92087d423bcf 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -4,6 +4,9 @@ ############################################################################## # Defines +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + # Can be overridden by the configuration file. PING=${PING:=ping} PING6=${PING6:=ping6} @@ -38,7 +41,7 @@ check_tc_version() tc -j &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc is missing JSON support" - exit 1 + exit $ksft_skip fi } @@ -51,7 +54,7 @@ check_tc_mpls_support() matchall action pipe &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc is missing MPLS support" - return 1 + return $ksft_skip fi tc filter del dev $dev ingress protocol mpls_uc pref 1 handle 1 \ matchall @@ -69,7 +72,7 @@ check_tc_mpls_lse_stats() if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc-flower is missing extended MPLS support" - return 1 + return $ksft_skip fi tc -j filter show dev $dev ingress protocol mpls_uc | jq . &> /dev/null @@ -79,7 +82,7 @@ check_tc_mpls_lse_stats() if [[ $ret -ne 0 ]]; then echo "SKIP: iproute2 too old; tc-flower produces invalid json output for extended MPLS filters" - return 1 + return $ksft_skip fi } @@ -88,7 +91,7 @@ check_tc_shblock_support() tc filter help 2>&1 | grep block &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc is missing shared block support" - exit 1 + exit $ksft_skip fi } @@ -97,7 +100,7 @@ check_tc_chain_support() tc help 2>&1|grep chain &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc is missing chain support" - exit 1 + exit $ksft_skip fi } @@ -106,7 +109,7 @@ check_tc_action_hw_stats_support() tc actions help 2>&1 | grep -q hw_stats if [[ $? -ne 0 ]]; then echo "SKIP: iproute2 too old; tc is missing action hw_stats support" - exit 1 + exit $ksft_skip fi } @@ -115,13 +118,13 @@ check_ethtool_lanes_support() ethtool --help 2>&1| grep lanes &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: ethtool too old; it is missing lanes support" - exit 1 + exit $ksft_skip fi } if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" - exit 0 + exit $ksft_skip fi if [[ "$CHECK_TC" = "yes" ]]; then @@ -134,7 +137,7 @@ require_command() if [[ ! -x "$(command -v "$cmd")" ]]; then echo "SKIP: $cmd not installed" - exit 1 + exit $ksft_skip fi } @@ -143,7 +146,7 @@ require_command $MZ if [[ ! -v NUM_NETIFS ]]; then echo "SKIP: importer does not define \"NUM_NETIFS\"" - exit 1 + exit $ksft_skip fi ############################################################################## @@ -203,7 +206,7 @@ for ((i = 1; i <= NUM_NETIFS; ++i)); do ip link show dev ${NETIFS[p$i]} &> /dev/null if [[ $? -ne 0 ]]; then echo "SKIP: could not find all required interfaces" - exit 1 + exit $ksft_skip fi done @@ -748,6 +751,14 @@ qdisc_parent_stats_get() | jq '.[] | select(.parent == "'"$parent"'") | '"$selector" } +ipv6_stats_get() +{ + local dev=$1; shift + local stat=$1; shift + + cat /proc/net/dev_snmp6/$dev | grep "^$stat" | cut -f2 +} + humanize() { local speed=$1; shift diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index 55eeacf59241..64fbd211d907 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -75,7 +75,9 @@ switch_destroy() tc qdisc del dev $swp2 clsact tc qdisc del dev $swp1 clsact + ip link set dev $swp2 down ip link set dev $swp2 nomaster + ip link set dev $swp1 down ip link set dev $swp1 nomaster ip link del dev br1 } diff --git a/tools/testing/selftests/net/forwarding/pedit_l4port.sh b/tools/testing/selftests/net/forwarding/pedit_l4port.sh index 5f20d289ee43..10e594c55117 100755 --- a/tools/testing/selftests/net/forwarding/pedit_l4port.sh +++ b/tools/testing/selftests/net/forwarding/pedit_l4port.sh @@ -71,7 +71,9 @@ switch_destroy() tc qdisc del dev $swp2 clsact tc qdisc del dev $swp1 clsact + ip link set dev $swp2 down ip link set dev $swp2 nomaster + ip link set dev $swp1 down ip link set dev $swp1 nomaster ip link del dev br1 } diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 76efb1f8375e..a0d612e04990 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -411,7 +411,7 @@ ping_ipv6() ip nexthop ls >/dev/null 2>&1 if [ $? -ne 0 ]; then echo "Nexthop objects not supported; skipping tests" - exit 0 + exit $ksft_skip fi trap cleanup EXIT diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index 4898dd4118f1..cb08ffe2356a 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -386,7 +386,7 @@ ping_ipv6() ip nexthop ls >/dev/null 2>&1 if [ $? -ne 0 ]; then echo "Nexthop objects not supported; skipping tests" - exit 0 + exit $ksft_skip fi trap cleanup EXIT diff --git a/tools/testing/selftests/net/forwarding/skbedit_priority.sh b/tools/testing/selftests/net/forwarding/skbedit_priority.sh index e3bd8a6bb8b4..bde11dc27873 100755 --- a/tools/testing/selftests/net/forwarding/skbedit_priority.sh +++ b/tools/testing/selftests/net/forwarding/skbedit_priority.sh @@ -72,7 +72,9 @@ switch_destroy() tc qdisc del dev $swp2 clsact tc qdisc del dev $swp1 clsact + ip link set dev $swp2 down ip link set dev $swp2 nomaster + ip link set dev $swp1 down ip link set dev $swp1 nomaster ip link del dev br1 } diff --git a/tools/testing/selftests/net/gre_gso.sh b/tools/testing/selftests/net/gre_gso.sh new file mode 100755 index 000000000000..facbb0c80443 --- /dev/null +++ b/tools/testing/selftests/net/gre_gso.sh @@ -0,0 +1,236 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test is for checking GRE GSO. + +ret=0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# all tests in this script. Can be overridden with -t option +TESTS="gre_gso" + +VERBOSE=0 +PAUSE_ON_FAIL=no +PAUSE=no +IP="ip -netns ns1" +NS_EXEC="ip netns exec ns1" +TMPFILE=`mktemp` +PID= + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf " TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf " TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi +} + +setup() +{ + set -e + ip netns add ns1 + ip netns set ns1 auto + $IP link set dev lo up + + ip link add veth0 type veth peer name veth1 + ip link set veth0 up + ip link set veth1 netns ns1 + $IP link set veth1 name veth0 + $IP link set veth0 up + + dd if=/dev/urandom of=$TMPFILE bs=1024 count=2048 &>/dev/null + set +e +} + +cleanup() +{ + rm -rf $TMPFILE + [ -n "$PID" ] && kill $PID + ip link del dev gre1 &> /dev/null + ip link del dev veth0 &> /dev/null + ip netns del ns1 +} + +get_linklocal() +{ + local dev=$1 + local ns=$2 + local addr + + [ -n "$ns" ] && ns="-netns $ns" + + addr=$(ip -6 -br $ns addr show dev ${dev} | \ + awk '{ + for (i = 3; i <= NF; ++i) { + if ($i ~ /^fe80/) + print $i + } + }' + ) + addr=${addr/\/*} + + [ -z "$addr" ] && return 1 + + echo $addr + + return 0 +} + +gre_create_tun() +{ + local a1=$1 + local a2=$2 + local mode + + [[ $a1 =~ ^[0-9.]*$ ]] && mode=gre || mode=ip6gre + + ip tunnel add gre1 mode $mode local $a1 remote $a2 dev veth0 + ip link set gre1 up + $IP tunnel add gre1 mode $mode local $a2 remote $a1 dev veth0 + $IP link set gre1 up +} + +gre_gst_test_checks() +{ + local name=$1 + local addr=$2 + + $NS_EXEC nc -kl $port >/dev/null & + PID=$! + while ! $NS_EXEC ss -ltn | grep -q $port; do ((i++)); sleep 0.01; done + + cat $TMPFILE | timeout 1 nc $addr $port + log_test $? 0 "$name - copy file w/ TSO" + + ethtool -K veth0 tso off + + cat $TMPFILE | timeout 1 nc $addr $port + log_test $? 0 "$name - copy file w/ GSO" + + ethtool -K veth0 tso on + + kill $PID + PID= +} + +gre6_gso_test() +{ + local port=7777 + + setup + + a1=$(get_linklocal veth0) + a2=$(get_linklocal veth0 ns1) + + gre_create_tun $a1 $a2 + + ip addr add 172.16.2.1/24 dev gre1 + $IP addr add 172.16.2.2/24 dev gre1 + + ip -6 addr add 2001:db8:1::1/64 dev gre1 nodad + $IP -6 addr add 2001:db8:1::2/64 dev gre1 nodad + + sleep 2 + + gre_gst_test_checks GREv6/v4 172.16.2.2 + gre_gst_test_checks GREv6/v6 2001:db8:1::2 + + cleanup +} + +gre_gso_test() +{ + gre6_gso_test +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -t <test> Test(s) to run (default: all) + (options: $TESTS) + -p Pause on fail + -P Pause after each test before cleanup + -v verbose mode (show commands and output) +EOF +} + +################################################################################ +# main + +while getopts :t:pPhv o +do + case $o in + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + P) PAUSE=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +PEER_CMD="ip netns exec ${PEER_NS}" + +# make sure we don't pause twice +[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v nc)" ]; then + echo "SKIP: Could not run test without nc tool" + exit $ksft_skip +fi + +# start clean +cleanup &> /dev/null + +for t in $TESTS +do + case $t in + gre_gso) gre_gso_test;; + + help) echo "Test names: $TESTS"; exit 0;; + esac +done + +if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} +fi + +exit $ret diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c new file mode 100644 index 000000000000..cf37ce86b0fd --- /dev/null +++ b/tools/testing/selftests/net/gro.c @@ -0,0 +1,1095 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This testsuite provides conformance testing for GRO coalescing. + * + * Test cases: + * 1.data + * Data packets of the same size and same header setup with correct + * sequence numbers coalesce. The one exception being the last data + * packet coalesced: it can be smaller than the rest and coalesced + * as long as it is in the same flow. + * 2.ack + * Pure ACK does not coalesce. + * 3.flags + * Specific test cases: no packets with PSH, SYN, URG, RST set will + * be coalesced. + * 4.tcp + * Packets with incorrect checksum, non-consecutive seqno and + * different TCP header options shouldn't coalesce. Nit: given that + * some extension headers have paddings, such as timestamp, headers + * that are padding differently would not be coalesced. + * 5.ip: + * Packets with different (ECN, TTL, TOS) header, ip options or + * ip fragments (ipv6) shouldn't coalesce. + * 6.large: + * Packets larger than GRO_MAX_SIZE packets shouldn't coalesce. + * + * MSS is defined as 4096 - header because if it is too small + * (i.e. 1500 MTU - header), it will result in many packets, + * increasing the "large" test case's flakiness. This is because + * due to time sensitivity in the coalescing window, the receiver + * may not coalesce all of the packets. + * + * Note the timing issue applies to all of the test cases, so some + * flakiness is to be expected. + * + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <getopt.h> +#include <linux/filter.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> + +#define DPORT 8000 +#define SPORT 1500 +#define PAYLOAD_LEN 100 +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define NUM_PACKETS 4 +#define START_SEQ 100 +#define START_ACK 100 +#define SIP6 "fdaa::2" +#define DIP6 "fdaa::1" +#define SIP4 "192.168.1.200" +#define DIP4 "192.168.1.100" +#define ETH_P_NONE 0 +#define TOTAL_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) +#define MSS (4096 - sizeof(struct tcphdr) - sizeof(struct ipv6hdr)) +#define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr)) +#define NUM_LARGE_PKT (MAX_PAYLOAD / MSS) +#define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) + +static int proto = -1; +static uint8_t src_mac[ETH_ALEN], dst_mac[ETH_ALEN]; +static char *testname = "data"; +static char *ifname = "eth0"; +static char *smac = "aa:00:00:00:00:02"; +static char *dmac = "aa:00:00:00:00:01"; +static bool verbose; +static bool tx_socket = true; +static int tcp_offset = -1; +static int total_hdr_len = -1; +static int ethhdr_proto = -1; + +static void vlog(const char *fmt, ...) +{ + va_list args; + + if (verbose) { + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + } +} + +static void setup_sock_filter(int fd) +{ + const int dport_off = tcp_offset + offsetof(struct tcphdr, dest); + const int ethproto_off = offsetof(struct ethhdr, h_proto); + int optlen = 0; + int ipproto_off; + int next_off; + + if (proto == PF_INET) + next_off = offsetof(struct iphdr, protocol); + else + next_off = offsetof(struct ipv6hdr, nexthdr); + ipproto_off = ETH_HLEN + next_off; + + if (strcmp(testname, "ip") == 0) { + if (proto == PF_INET) + optlen = sizeof(struct ip_timestamp); + else + optlen = sizeof(struct ip6_frag); + } + + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 7), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off + optlen), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 0, 1), + BPF_STMT(BPF_RET + BPF_K, 0xFFFFFFFF), + BPF_STMT(BPF_RET + BPF_K, 0), + }; + + struct sock_fprog bpf = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)) < 0) + error(1, errno, "error setting filter"); +} + +static uint32_t checksum_nofold(void *data, size_t len, uint32_t sum) +{ + uint16_t *words = data; + int i; + + for (i = 0; i < len / 2; i++) + sum += words[i]; + if (len & 1) + sum += ((char *)data)[len - 1]; + return sum; +} + +static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) +{ + sum = checksum_nofold(data, len, sum); + while (sum > 0xFFFF) + sum = (sum & 0xFFFF) + (sum >> 16); + return ~sum; +} + +static uint16_t tcp_checksum(void *buf, int payload_len) +{ + struct pseudo_header6 { + struct in6_addr saddr; + struct in6_addr daddr; + uint16_t protocol; + uint16_t payload_len; + } ph6; + struct pseudo_header4 { + struct in_addr saddr; + struct in_addr daddr; + uint16_t protocol; + uint16_t payload_len; + } ph4; + uint32_t sum = 0; + + if (proto == PF_INET6) { + if (inet_pton(AF_INET6, SIP6, &ph6.saddr) != 1) + error(1, errno, "inet_pton6 source ip pseudo"); + if (inet_pton(AF_INET6, DIP6, &ph6.daddr) != 1) + error(1, errno, "inet_pton6 dest ip pseudo"); + ph6.protocol = htons(IPPROTO_TCP); + ph6.payload_len = htons(sizeof(struct tcphdr) + payload_len); + + sum = checksum_nofold(&ph6, sizeof(ph6), 0); + } else if (proto == PF_INET) { + if (inet_pton(AF_INET, SIP4, &ph4.saddr) != 1) + error(1, errno, "inet_pton source ip pseudo"); + if (inet_pton(AF_INET, DIP4, &ph4.daddr) != 1) + error(1, errno, "inet_pton dest ip pseudo"); + ph4.protocol = htons(IPPROTO_TCP); + ph4.payload_len = htons(sizeof(struct tcphdr) + payload_len); + + sum = checksum_nofold(&ph4, sizeof(ph4), 0); + } + + return checksum_fold(buf, sizeof(struct tcphdr) + payload_len, sum); +} + +static void read_MAC(uint8_t *mac_addr, char *mac) +{ + if (sscanf(mac, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac_addr[0], &mac_addr[1], &mac_addr[2], + &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6) + error(1, 0, "sscanf"); +} + +static void fill_datalinklayer(void *buf) +{ + struct ethhdr *eth = buf; + + memcpy(eth->h_dest, dst_mac, ETH_ALEN); + memcpy(eth->h_source, src_mac, ETH_ALEN); + eth->h_proto = ethhdr_proto; +} + +static void fill_networklayer(void *buf, int payload_len) +{ + struct ipv6hdr *ip6h = buf; + struct iphdr *iph = buf; + + if (proto == PF_INET6) { + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(sizeof(struct tcphdr) + payload_len); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->hop_limit = 8; + if (inet_pton(AF_INET6, SIP6, &ip6h->saddr) != 1) + error(1, errno, "inet_pton source ip6"); + if (inet_pton(AF_INET6, DIP6, &ip6h->daddr) != 1) + error(1, errno, "inet_pton dest ip6"); + } else if (proto == PF_INET) { + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->ttl = 8; + iph->protocol = IPPROTO_TCP; + iph->tot_len = htons(sizeof(struct tcphdr) + + payload_len + sizeof(struct iphdr)); + iph->frag_off = htons(0x4000); /* DF = 1, MF = 0 */ + if (inet_pton(AF_INET, SIP4, &iph->saddr) != 1) + error(1, errno, "inet_pton source ip"); + if (inet_pton(AF_INET, DIP4, &iph->daddr) != 1) + error(1, errno, "inet_pton dest ip"); + iph->check = checksum_fold(buf, sizeof(struct iphdr), 0); + } +} + +static void fill_transportlayer(void *buf, int seq_offset, int ack_offset, + int payload_len, int fin) +{ + struct tcphdr *tcph = buf; + + memset(tcph, 0, sizeof(*tcph)); + + tcph->source = htons(SPORT); + tcph->dest = htons(DPORT); + tcph->seq = ntohl(START_SEQ + seq_offset); + tcph->ack_seq = ntohl(START_ACK + ack_offset); + tcph->ack = 1; + tcph->fin = fin; + tcph->doff = 5; + tcph->window = htons(TCP_MAXWIN); + tcph->urg_ptr = 0; + tcph->check = tcp_checksum(tcph, payload_len); +} + +static void write_packet(int fd, char *buf, int len, struct sockaddr_ll *daddr) +{ + int ret = -1; + + ret = sendto(fd, buf, len, 0, (struct sockaddr *)daddr, sizeof(*daddr)); + if (ret == -1) + error(1, errno, "sendto failure"); + if (ret != len) + error(1, errno, "sendto wrong length"); +} + +static void create_packet(void *buf, int seq_offset, int ack_offset, + int payload_len, int fin) +{ + memset(buf, 0, total_hdr_len); + memset(buf + total_hdr_len, 'a', payload_len); + fill_transportlayer(buf + tcp_offset, seq_offset, ack_offset, + payload_len, fin); + fill_networklayer(buf + ETH_HLEN, payload_len); + fill_datalinklayer(buf); +} + +/* send one extra flag, not first and not last pkt */ +static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, + int rst, int urg) +{ + static char flag_buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + int payload_len, pkt_size, flag, i; + struct tcphdr *tcph; + + payload_len = PAYLOAD_LEN * psh; + pkt_size = total_hdr_len + payload_len; + flag = NUM_PACKETS / 2; + + create_packet(flag_buf, flag * payload_len, 0, payload_len, 0); + + tcph = (struct tcphdr *)(flag_buf + tcp_offset); + tcph->psh = psh; + tcph->syn = syn; + tcph->rst = rst; + tcph->urg = urg; + tcph->check = 0; + tcph->check = tcp_checksum(tcph, payload_len); + + for (i = 0; i < NUM_PACKETS + 1; i++) { + if (i == flag) { + write_packet(fd, flag_buf, pkt_size, daddr); + continue; + } + create_packet(buf, i * PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +/* Test for data of same length, smaller than previous + * and of different lengths + */ +static void send_data_pkts(int fd, struct sockaddr_ll *daddr, + int payload_len1, int payload_len2) +{ + static char buf[ETH_HLEN + IP_MAXPACKET]; + + create_packet(buf, 0, 0, payload_len1, 0); + write_packet(fd, buf, total_hdr_len + payload_len1, daddr); + create_packet(buf, payload_len1, 0, payload_len2, 0); + write_packet(fd, buf, total_hdr_len + payload_len2, daddr); +} + +/* If incoming segments make tracked segment length exceed + * legal IP datagram length, do not coalesce + */ +static void send_large(int fd, struct sockaddr_ll *daddr, int remainder) +{ + static char pkts[NUM_LARGE_PKT][TOTAL_HDR_LEN + MSS]; + static char last[TOTAL_HDR_LEN + MSS]; + static char new_seg[TOTAL_HDR_LEN + MSS]; + int i; + + for (i = 0; i < NUM_LARGE_PKT; i++) + create_packet(pkts[i], i * MSS, 0, MSS, 0); + create_packet(last, NUM_LARGE_PKT * MSS, 0, remainder, 0); + create_packet(new_seg, (NUM_LARGE_PKT + 1) * MSS, 0, remainder, 0); + + for (i = 0; i < NUM_LARGE_PKT; i++) + write_packet(fd, pkts[i], total_hdr_len + MSS, daddr); + write_packet(fd, last, total_hdr_len + remainder, daddr); + write_packet(fd, new_seg, total_hdr_len + remainder, daddr); +} + +/* Pure acks and dup acks don't coalesce */ +static void send_ack(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN]; + + create_packet(buf, 0, 0, 0, 0); + write_packet(fd, buf, total_hdr_len, daddr); + write_packet(fd, buf, total_hdr_len, daddr); + create_packet(buf, 0, 1, 0, 0); + write_packet(fd, buf, total_hdr_len, daddr); +} + +static void recompute_packet(char *buf, char *no_ext, int extlen) +{ + struct tcphdr *tcphdr = (struct tcphdr *)(buf + tcp_offset); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + memmove(buf, no_ext, total_hdr_len); + memmove(buf + total_hdr_len + extlen, + no_ext + total_hdr_len, PAYLOAD_LEN); + + tcphdr->doff = tcphdr->doff + (extlen / 4); + tcphdr->check = 0; + tcphdr->check = tcp_checksum(tcphdr, PAYLOAD_LEN + extlen); + if (proto == PF_INET) { + iph->tot_len = htons(ntohs(iph->tot_len) + extlen); + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else { + ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen); + } +} + +static void tcp_write_options(char *buf, int kind, int ts) +{ + struct tcp_option_ts { + uint8_t kind; + uint8_t len; + uint32_t tsval; + uint32_t tsecr; + } *opt_ts = (void *)buf; + struct tcp_option_window { + uint8_t kind; + uint8_t len; + uint8_t shift; + } *opt_window = (void *)buf; + + switch (kind) { + case TCPOPT_NOP: + buf[0] = TCPOPT_NOP; + break; + case TCPOPT_WINDOW: + memset(opt_window, 0, sizeof(struct tcp_option_window)); + opt_window->kind = TCPOPT_WINDOW; + opt_window->len = TCPOLEN_WINDOW; + opt_window->shift = 0; + break; + case TCPOPT_TIMESTAMP: + memset(opt_ts, 0, sizeof(struct tcp_option_ts)); + opt_ts->kind = TCPOPT_TIMESTAMP; + opt_ts->len = TCPOLEN_TIMESTAMP; + opt_ts->tsval = ts; + opt_ts->tsecr = 0; + break; + default: + error(1, 0, "unimplemented TCP option"); + break; + } +} + +/* TCP with options is always a permutation of {TS, NOP, NOP}. + * Implement different orders to verify coalescing stops. + */ +static void add_standard_tcp_options(char *buf, char *no_ext, int ts, int order) +{ + switch (order) { + case 0: + tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 1, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 2 /* two NOP opts */, + TCPOPT_TIMESTAMP, ts); + break; + case 1: + tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 1, + TCPOPT_TIMESTAMP, ts); + tcp_write_options(buf + total_hdr_len + 1 + TCPOLEN_TIMESTAMP, + TCPOPT_NOP, 0); + break; + case 2: + tcp_write_options(buf + total_hdr_len, TCPOPT_TIMESTAMP, ts); + tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 1, + TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 2, + TCPOPT_NOP, 0); + break; + default: + error(1, 0, "unknown order"); + break; + } + recompute_packet(buf, no_ext, TCPOLEN_TSTAMP_APPA); +} + +/* Packets with invalid checksum don't coalesce. */ +static void send_changed_checksum(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + tcph->check = tcph->check - 1; + write_packet(fd, buf, pkt_size, daddr); +} + + /* Packets with non-consecutive sequence number don't coalesce.*/ +static void send_changed_seq(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + tcph->seq = ntohl(htonl(tcph->seq) + 1); + tcph->check = 0; + tcph->check = tcp_checksum(tcph, PAYLOAD_LEN); + write_packet(fd, buf, pkt_size, daddr); +} + + /* Packet with different timestamp option or different timestamps + * don't coalesce. + */ +static void send_changed_ts(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt[sizeof(buf) + TCPOLEN_TSTAMP_APPA]; + int pkt_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 0, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 0, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 1); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 4, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 2); + write_packet(fd, extpkt, pkt_size, daddr); +} + +/* Packet with different tcp options don't coalesce. */ +static void send_diff_opt(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt1[sizeof(buf) + TCPOLEN_TSTAMP_APPA]; + static char extpkt2[sizeof(buf) + TCPOLEN_MAXSEG]; + int extpkt1_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA; + int extpkt2_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_MAXSEG; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt1, buf, 0, 0); + write_packet(fd, extpkt1, extpkt1_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt1, buf, 0, 0); + write_packet(fd, extpkt1, extpkt1_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + tcp_write_options(extpkt2 + MAX_HDR_LEN, TCPOPT_NOP, 0); + tcp_write_options(extpkt2 + MAX_HDR_LEN + 1, TCPOPT_WINDOW, 0); + recompute_packet(extpkt2, buf, TCPOLEN_WINDOW + 1); + write_packet(fd, extpkt2, extpkt2_size, daddr); +} + +static void add_ipv4_ts_option(void *buf, void *optpkt) +{ + struct ip_timestamp *ts = (struct ip_timestamp *)(optpkt + tcp_offset); + int optlen = sizeof(struct ip_timestamp); + struct iphdr *iph; + + if (optlen % 4) + error(1, 0, "ipv4 timestamp length is not a multiple of 4B"); + + ts->ipt_code = IPOPT_TS; + ts->ipt_len = optlen; + ts->ipt_ptr = 5; + ts->ipt_flg = IPOPT_TS_TSONLY; + + memcpy(optpkt, buf, tcp_offset); + memcpy(optpkt + tcp_offset + optlen, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + + iph = (struct iphdr *)(optpkt + ETH_HLEN); + iph->ihl = 5 + (optlen / 4); + iph->tot_len = htons(ntohs(iph->tot_len) + optlen); + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0); +} + +/* IPv4 options shouldn't coalesce */ +static void send_ip_options(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char optpkt[sizeof(buf) + sizeof(struct ip_timestamp)]; + int optlen = sizeof(struct ip_timestamp); + int pkt_size = total_hdr_len + PAYLOAD_LEN + optlen; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); + + create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0); + add_ipv4_ts_option(buf, optpkt); + write_packet(fd, optpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); +} + +/* IPv4 fragments shouldn't coalesce */ +static void send_fragment4(int fd, struct sockaddr_ll *daddr) +{ + static char buf[IP_MAXPACKET]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + /* Once fragmented, packet would retain the total_len. + * Tcp header is prepared as if rest of data is in follow-up frags, + * but follow up frags aren't actually sent. + */ + memset(buf + total_hdr_len, 'a', PAYLOAD_LEN * 2); + fill_transportlayer(buf + tcp_offset, PAYLOAD_LEN, 0, PAYLOAD_LEN * 2, 0); + fill_networklayer(buf + ETH_HLEN, PAYLOAD_LEN); + fill_datalinklayer(buf); + + iph->frag_off = htons(0x6000); // DF = 1, MF = 1 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv4 packets with different ttl don't coalesce.*/ +static void send_changed_ttl(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + iph->ttl = 7; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different tos don't coalesce.*/ +static void send_changed_tos(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + iph->tos = 1; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else if (proto == PF_INET6) { + ip6h->priority = 0xf; + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different ECN don't coalesce.*/ +static void send_changed_ECN(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + buf[ETH_HLEN + 1] ^= 0x2; // ECN set to 10 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else { + buf[ETH_HLEN + 1] ^= 0x20; // ECN set to 10 + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv6 fragments and packets with extensions don't coalesce.*/ +static void send_fragment6(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt[MAX_HDR_LEN + PAYLOAD_LEN + + sizeof(struct ip6_frag)]; + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + struct ip6_frag *frag = (void *)(extpkt + tcp_offset); + int extlen = sizeof(struct ip6_frag); + int bufpkt_len = total_hdr_len + PAYLOAD_LEN; + int extpkt_len = bufpkt_len + extlen; + int i; + + for (i = 0; i < 2; i++) { + create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); + } + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + memset(extpkt, 0, extpkt_len); + + ip6h->nexthdr = IPPROTO_FRAGMENT; + ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen); + frag->ip6f_nxt = IPPROTO_TCP; + + memcpy(extpkt, buf, tcp_offset); + memcpy(extpkt + tcp_offset + extlen, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + write_packet(fd, extpkt, extpkt_len, daddr); + + create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); +} + +static void bind_packetsocket(int fd) +{ + struct sockaddr_ll daddr = {}; + + daddr.sll_family = AF_PACKET; + daddr.sll_protocol = ethhdr_proto; + daddr.sll_ifindex = if_nametoindex(ifname); + if (daddr.sll_ifindex == 0) + error(1, errno, "if_nametoindex"); + + if (bind(fd, (void *)&daddr, sizeof(daddr)) < 0) + error(1, errno, "could not bind socket"); +} + +static void set_timeout(int fd) +{ + struct timeval timeout; + + timeout.tv_sec = 120; + timeout.tv_usec = 0; + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, + sizeof(timeout)) < 0) + error(1, errno, "cannot set timeout, setsockopt failed"); +} + +static void check_recv_pkts(int fd, int *correct_payload, + int correct_num_pkts) +{ + static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; + struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + struct tcphdr *tcph; + bool bad_packet = false; + int tcp_ext_len = 0; + int ip_ext_len = 0; + int pkt_size = -1; + int data_len = 0; + int num_pkt = 0; + int i; + + vlog("Expected {"); + for (i = 0; i < correct_num_pkts; i++) + vlog("%d ", correct_payload[i]); + vlog("}, Total %d packets\nReceived {", correct_num_pkts); + + while (1) { + pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + if (pkt_size < 0) + error(1, errno, "could not receive"); + + if (iph->version == 4) + ip_ext_len = (iph->ihl - 5) * 4; + else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP) + ip_ext_len = sizeof(struct ip6_frag); + + tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len); + + if (tcph->fin) + break; + + tcp_ext_len = (tcph->doff - 5) * 4; + data_len = pkt_size - total_hdr_len - tcp_ext_len - ip_ext_len; + /* Min ethernet frame payload is 46(ETH_ZLEN - ETH_HLEN) by RFC 802.3. + * Ipv4/tcp packets without at least 6 bytes of data will be padded. + * Packet sockets are protocol agnostic, and will not trim the padding. + */ + if (pkt_size == ETH_ZLEN && iph->version == 4) { + data_len = ntohs(iph->tot_len) + - sizeof(struct tcphdr) - sizeof(struct iphdr); + } + vlog("%d ", data_len); + if (data_len != correct_payload[num_pkt]) { + vlog("[!=%d]", correct_payload[num_pkt]); + bad_packet = true; + } + num_pkt++; + } + vlog("}, Total %d packets.\n", num_pkt); + if (num_pkt != correct_num_pkts) + error(1, 0, "incorrect number of packets"); + if (bad_packet) + error(1, 0, "incorrect packet geometry"); + + printf("Test succeeded\n\n"); +} + +static void gro_sender(void) +{ + static char fin_pkt[MAX_HDR_LEN]; + struct sockaddr_ll daddr = {}; + int txfd = -1; + + txfd = socket(PF_PACKET, SOCK_RAW, IPPROTO_RAW); + if (txfd < 0) + error(1, errno, "socket creation"); + + memset(&daddr, 0, sizeof(daddr)); + daddr.sll_ifindex = if_nametoindex(ifname); + if (daddr.sll_ifindex == 0) + error(1, errno, "if_nametoindex"); + daddr.sll_family = AF_PACKET; + memcpy(daddr.sll_addr, dst_mac, ETH_ALEN); + daddr.sll_halen = ETH_ALEN; + create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1); + + if (strcmp(testname, "data") == 0) { + send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ack") == 0) { + send_ack(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "flags") == 0) { + send_flags(txfd, &daddr, 1, 0, 0, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 1, 0, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 0, 1, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 0, 0, 1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "tcp") == 0) { + send_changed_checksum(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_seq(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_ts(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_diff_opt(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip") == 0) { + send_changed_ECN(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_tos(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + if (proto == PF_INET) { + /* Modified packets may be received out of order. + * Sleep function added to enforce test boundaries + * so that fin pkts are not received prior to other pkts. + */ + sleep(1); + send_changed_ttl(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + send_ip_options(txfd, &daddr); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + send_fragment4(txfd, &daddr); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (proto == PF_INET6) { + send_fragment6(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } + } else if (strcmp(testname, "large") == 0) { + /* 20 is the difference between min iphdr size + * and min ipv6hdr size. Like MAX_HDR_SIZE, + * MAX_PAYLOAD is defined with the larger header of the two. + */ + int offset = proto == PF_INET ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; + + send_large(txfd, &daddr, remainder); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_large(txfd, &daddr, remainder + 1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else { + error(1, 0, "Unknown testcase"); + } + + if (close(txfd)) + error(1, errno, "socket close"); +} + +static void gro_receiver(void) +{ + static int correct_payload[NUM_PACKETS]; + int rxfd = -1; + + rxfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_NONE)); + if (rxfd < 0) + error(1, 0, "socket creation"); + setup_sock_filter(rxfd); + set_timeout(rxfd); + bind_packetsocket(rxfd); + + memset(correct_payload, 0, sizeof(correct_payload)); + + if (strcmp(testname, "data") == 0) { + printf("pure data packet of same size: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("large data packets followed by a smaller one: "); + correct_payload[0] = PAYLOAD_LEN * 1.5; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("small data packets followed by a larger one: "); + correct_payload[0] = PAYLOAD_LEN / 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ack") == 0) { + printf("duplicate ack and pure ack: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "flags") == 0) { + correct_payload[0] = PAYLOAD_LEN * 3; + correct_payload[1] = PAYLOAD_LEN * 2; + + printf("psh flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 2); + + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; + printf("syn flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + + printf("rst flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + + printf("urg flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "tcp") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + correct_payload[3] = PAYLOAD_LEN; + + printf("changed checksum does not coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("Wrong Seq number doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("Different timestamp doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 4); + + printf("Different options doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + + printf("different ECN doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("different tos doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + if (proto == PF_INET) { + printf("different ttl doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("ip options doesn't coalesce: "); + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + + printf("fragmented ip4 doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (proto == PF_INET6) { + /* GRO doesn't check for ipv6 hop limit when flushing. + * Hence no corresponding test to the ipv4 case. + */ + printf("fragmented ip6 doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 2); + } + } else if (strcmp(testname, "large") == 0) { + int offset = proto == PF_INET ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; + + correct_payload[0] = (MAX_PAYLOAD + offset); + correct_payload[1] = remainder; + printf("Shouldn't coalesce if exceed IP max pkt size: "); + check_recv_pkts(rxfd, correct_payload, 2); + + /* last segment sent individually, doesn't start new segment */ + correct_payload[0] = correct_payload[0] - remainder; + correct_payload[1] = remainder + 1; + correct_payload[2] = remainder + 1; + check_recv_pkts(rxfd, correct_payload, 3); + } else { + error(1, 0, "Test case error, should never trigger"); + } + + if (close(rxfd)) + error(1, 0, "socket close"); +} + +static void parse_args(int argc, char **argv) +{ + static const struct option opts[] = { + { "dmac", required_argument, NULL, 'D' }, + { "iface", required_argument, NULL, 'i' }, + { "ipv4", no_argument, NULL, '4' }, + { "ipv6", no_argument, NULL, '6' }, + { "rx", no_argument, NULL, 'r' }, + { "smac", required_argument, NULL, 'S' }, + { "test", required_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { 0, 0, 0, 0 } + }; + int c; + + while ((c = getopt_long(argc, argv, "46D:i:rS:t:v", opts, NULL)) != -1) { + switch (c) { + case '4': + proto = PF_INET; + ethhdr_proto = htons(ETH_P_IP); + break; + case '6': + proto = PF_INET6; + ethhdr_proto = htons(ETH_P_IPV6); + break; + case 'D': + dmac = optarg; + break; + case 'i': + ifname = optarg; + break; + case 'r': + tx_socket = false; + break; + case 'S': + smac = optarg; + break; + case 't': + testname = optarg; + break; + case 'v': + verbose = true; + break; + default: + error(1, 0, "%s invalid option %c\n", __func__, c); + break; + } + } +} + +int main(int argc, char **argv) +{ + parse_args(argc, argv); + + if (proto == PF_INET) { + tcp_offset = ETH_HLEN + sizeof(struct iphdr); + total_hdr_len = tcp_offset + sizeof(struct tcphdr); + } else if (proto == PF_INET6) { + tcp_offset = ETH_HLEN + sizeof(struct ipv6hdr); + total_hdr_len = MAX_HDR_LEN; + } else { + error(1, 0, "Protocol family is not ipv4 or ipv6"); + } + + read_MAC(src_mac, smac); + read_MAC(dst_mac, dmac); + + if (tx_socket) + gro_sender(); + else + gro_receiver(); + return 0; +} diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh new file mode 100755 index 000000000000..342ad27f631b --- /dev/null +++ b/tools/testing/selftests/net/gro.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +readonly SERVER_MAC="aa:00:00:00:00:02" +readonly CLIENT_MAC="aa:00:00:00:00:01" +readonly TESTS=("data" "ack" "flags" "tcp" "ip" "large") +readonly PROTOS=("ipv4" "ipv6") +dev="" +test="all" +proto="ipv4" + +run_test() { + local server_pid=0 + local exit_code=0 + local protocol=$1 + local test=$2 + local ARGS=( "--${protocol}" "--dmac" "${SERVER_MAC}" \ + "--smac" "${CLIENT_MAC}" "--test" "${test}" "--verbose" ) + + setup_ns + # Each test is run 3 times to deflake, because given the receive timing, + # not all packets that should coalesce will be considered in the same flow + # on every try. + for tries in {1..3}; do + # Actual test starts here + ip netns exec server_ns ./gro "${ARGS[@]}" "--rx" "--iface" "server" \ + 1>>log.txt & + server_pid=$! + sleep 0.5 # to allow for socket init + ip netns exec client_ns ./gro "${ARGS[@]}" "--iface" "client" \ + 1>>log.txt + wait "${server_pid}" + exit_code=$? + if [[ "${exit_code}" -eq 0 ]]; then + break; + fi + done + cleanup_ns + echo ${exit_code} +} + +run_all_tests() { + local failed_tests=() + for proto in "${PROTOS[@]}"; do + for test in "${TESTS[@]}"; do + echo "running test ${proto} ${test}" >&2 + exit_code=$(run_test $proto $test) + if [[ "${exit_code}" -ne 0 ]]; then + failed_tests+=("${proto}_${test}") + fi; + done; + done + if [[ ${#failed_tests[@]} -ne 0 ]]; then + echo "failed tests: ${failed_tests[*]}. \ + Please see log.txt for more logs" + exit 1 + else + echo "All Tests Succeeded!" + fi; +} + +usage() { + echo "Usage: $0 \ + [-i <DEV>] \ + [-t data|ack|flags|tcp|ip|large] \ + [-p <ipv4|ipv6>]" 1>&2; + exit 1; +} + +while getopts "i:t:p:" opt; do + case "${opt}" in + i) + dev="${OPTARG}" + ;; + t) + test="${OPTARG}" + ;; + p) + proto="${OPTARG}" + ;; + *) + usage + ;; + esac +done + +if [ -n "$dev" ]; then + source setup_loopback.sh +else + source setup_veth.sh +fi + +setup +trap cleanup EXIT +if [[ "${test}" == "all" ]]; then + run_all_tests +else + run_test "${proto}" "${test}" +fi; diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh index bf361f30d6ef..ecbf57f264ed 100755 --- a/tools/testing/selftests/net/icmp_redirect.sh +++ b/tools/testing/selftests/net/icmp_redirect.sh @@ -63,10 +63,14 @@ log_test() local rc=$1 local expected=$2 local msg="$3" + local xfail=$4 if [ ${rc} -eq ${expected} ]; then printf "TEST: %-60s [ OK ]\n" "${msg}" nsuccess=$((nsuccess+1)) + elif [ ${rc} -eq ${xfail} ]; then + printf "TEST: %-60s [XFAIL]\n" "${msg}" + nxfail=$((nxfail+1)) else ret=1 nfail=$((nfail+1)) @@ -309,9 +313,10 @@ check_exception() fi log_test $? 0 "IPv4: ${desc}" - if [ "$with_redirect" = "yes" ]; then + # No PMTU info for test "redirect" and "mtu exception plus redirect" + if [ "$with_redirect" = "yes" ] && [ "$desc" != "redirect exception plus mtu" ]; then ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \ - grep -q "${H2_N2_IP6} from :: via ${R2_LLADDR} dev br0.*${mtu}" + grep -v "mtu" | grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0" elif [ -n "${mtu}" ]; then ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \ grep -q "${mtu}" @@ -322,7 +327,7 @@ check_exception() ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \ grep -v "mtu" | grep -q "${R1_LLADDR}" fi - log_test $? 0 "IPv6: ${desc}" + log_test $? 0 "IPv6: ${desc}" 1 } run_ping() @@ -488,6 +493,7 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) ret=0 nsuccess=0 nfail=0 +nxfail=0 while getopts :pv o do @@ -532,5 +538,6 @@ fi printf "\nTests passed: %3d\n" ${nsuccess} printf "Tests failed: %3d\n" ${nfail} +printf "Tests xfailed: %3d\n" ${nxfail} exit $ret diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh new file mode 100755 index 000000000000..a2489ec398fe --- /dev/null +++ b/tools/testing/selftests/net/ioam6.sh @@ -0,0 +1,668 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Author: Justin Iurman <justin.iurman@uliege.be> +# +# This script evaluates the IOAM insertion for IPv6 by checking the IOAM data +# consistency directly inside packets on the receiver side. Tests are divided +# into three categories: OUTPUT (evaluates the IOAM processing by the sender), +# INPUT (evaluates the IOAM processing by the receiver) and GLOBAL (evaluates +# wider use cases that do not fall into the other two categories). Both OUTPUT +# and INPUT tests only use a two-node topology (alpha and beta), while GLOBAL +# tests use the entire three-node topology (alpha, beta, gamma). Each test is +# documented inside its own handler in the code below. +# +# An IOAM domain is configured from Alpha to Gamma but not on the reverse path. +# When either Beta or Gamma is the destination (depending on the test category), +# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-hop. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | Alpha netns | | Gamma netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | db01::2/64 | | | | db02::2/64 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +----------------------------------------------------+ +# | . . | +# | +-------------+ +-------------+ | +# | | veth0 | | veth1 | | +# | | db01::1/64 | ................ | db02::1/64 | | +# | +-------------+ +-------------+ | +# | | +# | Beta netns | +# | | +# +----------------------------------------------------+ +# +# +# +# ============================================================= +# | Alpha - IOAM configuration | +# +===========================================================+ +# | Node ID | 1 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 11111111 | +# +-----------------------------------------------------------+ +# | Ingress ID | 0xffff (default value) | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 0xffffffff (default value) | +# +-----------------------------------------------------------+ +# | Egress ID | 101 | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 101101 | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee0 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf00dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 777 | +# +-----------------------------------------------------------+ +# | Schema Data | something that will be 4n-aligned | +# +-----------------------------------------------------------+ +# +# +# ============================================================= +# | Beta - IOAM configuration | +# +===========================================================+ +# | Node ID | 2 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 22222222 | +# +-----------------------------------------------------------+ +# | Ingress ID | 201 | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 201201 | +# +-----------------------------------------------------------+ +# | Egress ID | 202 | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 202202 | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee1 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf11dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 666 | +# +-----------------------------------------------------------+ +# | Schema Data | Hello there -Obi | +# +-----------------------------------------------------------+ +# +# +# ============================================================= +# | Gamma - IOAM configuration | +# +===========================================================+ +# | Node ID | 3 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 33333333 | +# +-----------------------------------------------------------+ +# | Ingress ID | 301 | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 301301 | +# +-----------------------------------------------------------+ +# | Egress ID | 0xffff (default value) | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 0xffffffff (default value) | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee2 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf22dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 0xffffff (= None) | +# +-----------------------------------------------------------+ +# | Schema Data | | +# +-----------------------------------------------------------+ + + +################################################################################ +# # +# WARNING: Be careful if you modify the block below - it MUST be kept # +# synchronized with configurations inside ioam6_parser.c and always # +# reflect the same. # +# # +################################################################################ + +ALPHA=( + 1 # ID + 11111111 # Wide ID + 0xffff # Ingress ID + 0xffffffff # Ingress Wide ID + 101 # Egress ID + 101101 # Egress Wide ID + 0xdeadbee0 # Namespace Data + 0xcafec0caf00dc0de # Namespace Wide Data + 777 # Schema ID (0xffffff = None) + "something that will be 4n-aligned" # Schema Data +) + +BETA=( + 2 + 22222222 + 201 + 201201 + 202 + 202202 + 0xdeadbee1 + 0xcafec0caf11dc0de + 666 + "Hello there -Obi" +) + +GAMMA=( + 3 + 33333333 + 301 + 301301 + 0xffff + 0xffffffff + 0xdeadbee2 + 0xcafec0caf22dc0de + 0xffffff + "" +) + +TESTS_OUTPUT=" + out_undef_ns + out_no_room + out_bits + out_full_supp_trace +" + +TESTS_INPUT=" + in_undef_ns + in_no_room + in_oflag + in_bits + in_full_supp_trace +" + +TESTS_GLOBAL=" + fwd_full_supp_trace +" + + +################################################################################ +# # +# LIBRARY # +# # +################################################################################ + +check_kernel_compatibility() +{ + ip netns add ioam-tmp-node + ip link add name veth0 netns ioam-tmp-node type veth \ + peer name veth1 netns ioam-tmp-node + + ip -netns ioam-tmp-node link set veth0 up + ip -netns ioam-tmp-node link set veth1 up + + ip -netns ioam-tmp-node ioam namespace add 0 &>/dev/null + ns_ad=$? + + ip -netns ioam-tmp-node ioam namespace show | grep -q "namespace 0" + ns_sh=$? + + if [[ $ns_ad != 0 || $ns_sh != 0 ]] + then + echo "SKIP: kernel version probably too old, missing ioam support" + ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true + exit 1 + fi + + ip -netns ioam-tmp-node route add db02::/64 encap ioam6 trace prealloc \ + type 0x800000 ns 0 size 4 dev veth0 &>/dev/null + tr_ad=$? + + ip -netns ioam-tmp-node -6 route | grep -q "encap ioam6 trace" + tr_sh=$? + + if [[ $tr_ad != 0 || $tr_sh != 0 ]] + then + echo "SKIP: cannot attach an ioam trace to a route, did you compile" \ + "without CONFIG_IPV6_IOAM6_LWTUNNEL?" + ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true + exit 1 + fi + + ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true +} + +cleanup() +{ + ip link del ioam-veth-alpha 2>/dev/null || true + ip link del ioam-veth-gamma 2>/dev/null || true + + ip netns del ioam-node-alpha || true + ip netns del ioam-node-beta || true + ip netns del ioam-node-gamma || true +} + +setup() +{ + ip netns add ioam-node-alpha + ip netns add ioam-node-beta + ip netns add ioam-node-gamma + + ip link add name ioam-veth-alpha netns ioam-node-alpha type veth \ + peer name ioam-veth-betaL netns ioam-node-beta + ip link add name ioam-veth-betaR netns ioam-node-beta type veth \ + peer name ioam-veth-gamma netns ioam-node-gamma + + ip -netns ioam-node-alpha link set ioam-veth-alpha name veth0 + ip -netns ioam-node-beta link set ioam-veth-betaL name veth0 + ip -netns ioam-node-beta link set ioam-veth-betaR name veth1 + ip -netns ioam-node-gamma link set ioam-veth-gamma name veth0 + + ip -netns ioam-node-alpha addr add db01::2/64 dev veth0 + ip -netns ioam-node-alpha link set veth0 up + ip -netns ioam-node-alpha link set lo up + ip -netns ioam-node-alpha route add db02::/64 via db01::1 dev veth0 + ip -netns ioam-node-alpha route del db01::/64 + ip -netns ioam-node-alpha route add db01::/64 dev veth0 + + ip -netns ioam-node-beta addr add db01::1/64 dev veth0 + ip -netns ioam-node-beta addr add db02::1/64 dev veth1 + ip -netns ioam-node-beta link set veth0 up + ip -netns ioam-node-beta link set veth1 up + ip -netns ioam-node-beta link set lo up + + ip -netns ioam-node-gamma addr add db02::2/64 dev veth0 + ip -netns ioam-node-gamma link set veth0 up + ip -netns ioam-node-gamma link set lo up + ip -netns ioam-node-gamma route add db01::/64 via db02::1 dev veth0 + + # - IOAM config - + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]} + ip -netns ioam-node-alpha ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]} + ip -netns ioam-node-alpha ioam schema add ${ALPHA[8]} "${ALPHA[9]}" + ip -netns ioam-node-alpha ioam namespace set 123 schema ${ALPHA[8]} + + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.all.forwarding=1 + ip netns exec ioam-node-beta sysctl -wq net.ipv6.ioam6_id=${BETA[0]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]} + ip -netns ioam-node-beta ioam namespace add 123 data ${BETA[6]} wide ${BETA[7]} + ip -netns ioam-node-beta ioam schema add ${BETA[8]} "${BETA[9]}" + ip -netns ioam-node-beta ioam namespace set 123 schema ${BETA[8]} + + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.ioam6_id=${GAMMA[0]} + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.ioam6_id_wide=${GAMMA[1]} + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id=${GAMMA[2]} + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${GAMMA[3]} + ip -netns ioam-node-gamma ioam namespace add 123 data ${GAMMA[6]} wide ${GAMMA[7]} + + sleep 1 + + ip netns exec ioam-node-alpha ping6 -c 5 -W 1 db02::2 &>/dev/null + if [ $? != 0 ] + then + echo "Setup FAILED" + cleanup &>/dev/null + exit 0 + fi +} + +log_test_passed() +{ + local desc=$1 + printf "TEST: %-60s [ OK ]\n" "${desc}" +} + +log_test_failed() +{ + local desc=$1 + printf "TEST: %-60s [FAIL]\n" "${desc}" +} + +run_test() +{ + local name=$1 + local desc=$2 + local node_src=$3 + local node_dst=$4 + local ip6_src=$5 + local ip6_dst=$6 + local if_dst=$7 + local trace_type=$8 + local ioam_ns=$9 + + ip netns exec $node_dst ./ioam6_parser $if_dst $name $ip6_src $ip6_dst \ + $trace_type $ioam_ns & + local spid=$! + sleep 0.1 + + ip netns exec $node_src ping6 -t 64 -c 1 -W 1 $ip6_dst &>/dev/null + if [ $? != 0 ] + then + log_test_failed "${desc}" + kill -2 $spid &>/dev/null + else + wait $spid + [ $? = 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}" + fi +} + +run() +{ + echo + echo "OUTPUT tests" + printf "%0.s-" {1..74} + echo + + # set OUTPUT settings + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0 + + for t in $TESTS_OUTPUT + do + $t + done + + # clean OUTPUT settings + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 + ip -netns ioam-node-alpha route change db01::/64 dev veth0 + + + echo + echo "INPUT tests" + printf "%0.s-" {1..74} + echo + + # set INPUT settings + ip -netns ioam-node-alpha ioam namespace del 123 + + for t in $TESTS_INPUT + do + $t + done + + # clean INPUT settings + ip -netns ioam-node-alpha ioam namespace add 123 \ + data ${ALPHA[6]} wide ${ALPHA[7]} + ip -netns ioam-node-alpha ioam namespace set 123 schema ${ALPHA[8]} + ip -netns ioam-node-alpha route change db01::/64 dev veth0 + + + echo + echo "GLOBAL tests" + printf "%0.s-" {1..74} + echo + + for t in $TESTS_GLOBAL + do + $t + done +} + +bit2type=( + 0x800000 0x400000 0x200000 0x100000 0x080000 0x040000 0x020000 0x010000 + 0x008000 0x004000 0x002000 0x001000 0x000800 0x000400 0x000200 0x000100 + 0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002 +) +bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 ) + + +################################################################################ +# # +# OUTPUT tests # +# # +# Two nodes (sender/receiver), IOAM disabled on ingress for the receiver. # +################################################################################ + +out_undef_ns() +{ + ############################################################################## + # Make sure that the encap node won't fill the trace if the chosen IOAM # + # namespace is not configured locally. # + ############################################################################## + local desc="Unknown IOAM namespace" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0x800000 ns 0 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0x800000 0 +} + +out_no_room() +{ + ############################################################################## + # Make sure that the encap node won't fill the trace and will set the # + # Overflow flag since there is no room enough for its data. # + ############################################################################## + local desc="Missing trace room" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xc00000 ns 123 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xc00000 123 +} + +out_bits() +{ + ############################################################################## + # Make sure that, for each trace type bit, the encap node will either: # + # (i) fill the trace with its data when it is a supported bit # + # (ii) not fill the trace with its data when it is an unsupported bit # + ############################################################################## + local desc="Trace type with bit <n> only" + + local tmp=${bit2size[22]} + bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) )) + + for i in {0..22} + do + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ + prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \ + dev veth0 &>/dev/null + + local cmd_res=$? + local descr="${desc/<n>/$i}" + + if [[ $i -ge 12 && $i -le 21 ]] + then + if [ $cmd_res != 0 ] + then + npassed=$((npassed+1)) + log_test_passed "$descr" + else + nfailed=$((nfailed+1)) + log_test_failed "$descr" + fi + else + run_test "out_bit$i" "$descr" ioam-node-alpha ioam-node-beta \ + db01::2 db01::1 veth0 ${bit2type[$i]} 123 + fi + done + + bit2size[22]=$tmp +} + +out_full_supp_trace() +{ + ############################################################################## + # Make sure that the encap node will correctly fill a full trace. Be careful,# + # "full trace" here does NOT mean all bits (only supported ones). # + ############################################################################## + local desc="Full supported trace" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xfff002 ns 123 size 100 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xfff002 123 +} + + +################################################################################ +# # +# INPUT tests # +# # +# Two nodes (sender/receiver), the sender MUST NOT fill the trace upon # +# insertion -> the IOAM namespace configured on the sender is removed # +# and is used in the inserted trace to force the sender not to fill it. # +################################################################################ + +in_undef_ns() +{ + ############################################################################## + # Make sure that the receiving node won't fill the trace if the related IOAM # + # namespace is not configured locally. # + ############################################################################## + local desc="Unknown IOAM namespace" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0x800000 ns 0 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0x800000 0 +} + +in_no_room() +{ + ############################################################################## + # Make sure that the receiving node won't fill the trace and will set the # + # Overflow flag if there is no room enough for its data. # + ############################################################################## + local desc="Missing trace room" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xc00000 ns 123 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xc00000 123 +} + +in_bits() +{ + ############################################################################## + # Make sure that, for each trace type bit, the receiving node will either: # + # (i) fill the trace with its data when it is a supported bit # + # (ii) not fill the trace with its data when it is an unsupported bit # + ############################################################################## + local desc="Trace type with bit <n> only" + + local tmp=${bit2size[22]} + bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) )) + + for i in {0..11} {22..22} + do + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ + prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 + + run_test "in_bit$i" "${desc/<n>/$i}" ioam-node-alpha ioam-node-beta \ + db01::2 db01::1 veth0 ${bit2type[$i]} 123 + done + + bit2size[22]=$tmp +} + +in_oflag() +{ + ############################################################################## + # Make sure that the receiving node won't fill the trace since the Overflow # + # flag is set. # + ############################################################################## + local desc="Overflow flag is set" + + # Exception: + # Here, we need the sender to set the Overflow flag. For that, we will add + # back the IOAM namespace that was previously configured on the sender. + ip -netns ioam-node-alpha ioam namespace add 123 + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xc00000 ns 123 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xc00000 123 + + # And we clean the exception for this test to get things back to normal for + # other INPUT tests + ip -netns ioam-node-alpha ioam namespace del 123 +} + +in_full_supp_trace() +{ + ############################################################################## + # Make sure that the receiving node will correctly fill a full trace. Be # + # careful, "full trace" here does NOT mean all bits (only supported ones). # + ############################################################################## + local desc="Full supported trace" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xfff002 ns 123 size 80 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xfff002 123 +} + + +################################################################################ +# # +# GLOBAL tests # +# # +# Three nodes (sender/router/receiver), IOAM fully enabled on every node. # +################################################################################ + +fwd_full_supp_trace() +{ + ############################################################################## + # Make sure that all three nodes correctly filled the full supported trace # + # by checking that the trace data is consistent with the predefined config. # + ############################################################################## + local desc="Forward - Full supported trace" + + ip -netns ioam-node-alpha route change db02::/64 encap ioam6 trace prealloc \ + type 0xfff002 ns 123 size 244 via db01::1 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-gamma db01::2 \ + db02::2 veth0 0xfff002 123 +} + + +################################################################################ +# # +# MAIN # +# # +################################################################################ + +if [ "$(id -u)" -ne 0 ] +then + echo "SKIP: Need root privileges" + exit 1 +fi + +if [ ! -x "$(command -v ip)" ] +then + echo "SKIP: Could not run test without ip tool" + exit 1 +fi + +ip ioam &>/dev/null +if [ $? = 1 ] +then + echo "SKIP: iproute2 too old, missing ioam command" + exit 1 +fi + +check_kernel_compatibility + +cleanup &>/dev/null +setup +run +cleanup &>/dev/null diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c new file mode 100644 index 000000000000..8f6997d35816 --- /dev/null +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -0,0 +1,676 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Author: Justin Iurman (justin.iurman@uliege.be) + * + * IOAM tester for IPv6, see ioam6.sh for details on each test case. + */ +#include <arpa/inet.h> +#include <errno.h> +#include <limits.h> +#include <linux/const.h> +#include <linux/if_ether.h> +#include <linux/ioam6.h> +#include <linux/ipv6.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +struct ioam_config { + __u32 id; + __u64 wide; + __u16 ingr_id; + __u16 egr_id; + __u32 ingr_wide; + __u32 egr_wide; + __u32 ns_data; + __u64 ns_wide; + __u32 sc_id; + __u8 hlim; + char *sc_data; +}; + +/* + * Be careful if you modify structs below - everything MUST be kept synchronized + * with configurations inside ioam6.sh and always reflect the same. + */ + +static struct ioam_config node1 = { + .id = 1, + .wide = 11111111, + .ingr_id = 0xffff, /* default value */ + .egr_id = 101, + .ingr_wide = 0xffffffff, /* default value */ + .egr_wide = 101101, + .ns_data = 0xdeadbee0, + .ns_wide = 0xcafec0caf00dc0de, + .sc_id = 777, + .sc_data = "something that will be 4n-aligned", + .hlim = 64, +}; + +static struct ioam_config node2 = { + .id = 2, + .wide = 22222222, + .ingr_id = 201, + .egr_id = 202, + .ingr_wide = 201201, + .egr_wide = 202202, + .ns_data = 0xdeadbee1, + .ns_wide = 0xcafec0caf11dc0de, + .sc_id = 666, + .sc_data = "Hello there -Obi", + .hlim = 63, +}; + +static struct ioam_config node3 = { + .id = 3, + .wide = 33333333, + .ingr_id = 301, + .egr_id = 0xffff, /* default value */ + .ingr_wide = 301301, + .egr_wide = 0xffffffff, /* default value */ + .ns_data = 0xdeadbee2, + .ns_wide = 0xcafec0caf22dc0de, + .sc_id = 0xffffff, /* default value */ + .sc_data = NULL, + .hlim = 62, +}; + +enum { + /********** + * OUTPUT * + **********/ + TEST_OUT_UNDEF_NS, + TEST_OUT_NO_ROOM, + TEST_OUT_BIT0, + TEST_OUT_BIT1, + TEST_OUT_BIT2, + TEST_OUT_BIT3, + TEST_OUT_BIT4, + TEST_OUT_BIT5, + TEST_OUT_BIT6, + TEST_OUT_BIT7, + TEST_OUT_BIT8, + TEST_OUT_BIT9, + TEST_OUT_BIT10, + TEST_OUT_BIT11, + TEST_OUT_BIT22, + TEST_OUT_FULL_SUPP_TRACE, + + /********* + * INPUT * + *********/ + TEST_IN_UNDEF_NS, + TEST_IN_NO_ROOM, + TEST_IN_OFLAG, + TEST_IN_BIT0, + TEST_IN_BIT1, + TEST_IN_BIT2, + TEST_IN_BIT3, + TEST_IN_BIT4, + TEST_IN_BIT5, + TEST_IN_BIT6, + TEST_IN_BIT7, + TEST_IN_BIT8, + TEST_IN_BIT9, + TEST_IN_BIT10, + TEST_IN_BIT11, + TEST_IN_BIT22, + TEST_IN_FULL_SUPP_TRACE, + + /********** + * GLOBAL * + **********/ + TEST_FWD_FULL_SUPP_TRACE, + + __TEST_MAX, +}; + +static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h, + __u32 trace_type, __u16 ioam_ns) +{ + if (__be16_to_cpu(ioam6h->namespace_id) != ioam_ns || + __be32_to_cpu(ioam6h->type_be32) != (trace_type << 8)) + return 1; + + switch (tid) { + case TEST_OUT_UNDEF_NS: + case TEST_IN_UNDEF_NS: + return ioam6h->overflow || + ioam6h->nodelen != 1 || + ioam6h->remlen != 1; + + case TEST_OUT_NO_ROOM: + case TEST_IN_NO_ROOM: + case TEST_IN_OFLAG: + return !ioam6h->overflow || + ioam6h->nodelen != 2 || + ioam6h->remlen != 1; + + case TEST_OUT_BIT0: + case TEST_IN_BIT0: + case TEST_OUT_BIT1: + case TEST_IN_BIT1: + case TEST_OUT_BIT2: + case TEST_IN_BIT2: + case TEST_OUT_BIT3: + case TEST_IN_BIT3: + case TEST_OUT_BIT4: + case TEST_IN_BIT4: + case TEST_OUT_BIT5: + case TEST_IN_BIT5: + case TEST_OUT_BIT6: + case TEST_IN_BIT6: + case TEST_OUT_BIT7: + case TEST_IN_BIT7: + case TEST_OUT_BIT11: + case TEST_IN_BIT11: + return ioam6h->overflow || + ioam6h->nodelen != 1 || + ioam6h->remlen; + + case TEST_OUT_BIT8: + case TEST_IN_BIT8: + case TEST_OUT_BIT9: + case TEST_IN_BIT9: + case TEST_OUT_BIT10: + case TEST_IN_BIT10: + return ioam6h->overflow || + ioam6h->nodelen != 2 || + ioam6h->remlen; + + case TEST_OUT_BIT22: + case TEST_IN_BIT22: + return ioam6h->overflow || + ioam6h->nodelen || + ioam6h->remlen; + + case TEST_OUT_FULL_SUPP_TRACE: + case TEST_IN_FULL_SUPP_TRACE: + case TEST_FWD_FULL_SUPP_TRACE: + return ioam6h->overflow || + ioam6h->nodelen != 15 || + ioam6h->remlen; + + default: + break; + } + + return 1; +} + +static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h, + const struct ioam_config cnf) +{ + unsigned int len; + __u8 aligned; + __u64 raw64; + __u32 raw32; + + if (ioam6h->type.bit0) { + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff)) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit1) { + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (cnf.ingr_id != (raw32 >> 16) || + cnf.egr_id != (raw32 & 0xffff)) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit2) + *p += sizeof(__u32); + + if (ioam6h->type.bit3) + *p += sizeof(__u32); + + if (ioam6h->type.bit4) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit5) { + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ns_data) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit6) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit7) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit8) { + raw64 = __be64_to_cpu(*((__u64 *)*p)); + if (cnf.hlim != (raw64 >> 56) || + cnf.wide != (raw64 & 0xffffffffffffff)) + return 1; + *p += sizeof(__u64); + } + + if (ioam6h->type.bit9) { + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ingr_wide) + return 1; + *p += sizeof(__u32); + + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.egr_wide) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit10) { + if (__be64_to_cpu(*((__u64 *)*p)) != cnf.ns_wide) + return 1; + *p += sizeof(__u64); + } + + if (ioam6h->type.bit11) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit12) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit13) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit14) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit15) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit16) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit17) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit18) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit19) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit20) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit21) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit22) { + len = cnf.sc_data ? strlen(cnf.sc_data) : 0; + aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0; + + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (aligned != (raw32 >> 24) * 4 || + cnf.sc_id != (raw32 & 0xffffff)) + return 1; + *p += sizeof(__u32); + + if (cnf.sc_data) { + if (strncmp((char *)*p, cnf.sc_data, len)) + return 1; + + *p += len; + aligned -= len; + + while (aligned--) { + if (**p != '\0') + return 1; + *p += sizeof(__u8); + } + } + } + + return 0; +} + +static int check_ioam_header_and_data(int tid, struct ioam6_trace_hdr *ioam6h, + __u32 trace_type, __u16 ioam_ns) +{ + __u8 *p; + + if (check_ioam_header(tid, ioam6h, trace_type, ioam_ns)) + return 1; + + p = ioam6h->data + ioam6h->remlen * 4; + + switch (tid) { + case TEST_OUT_BIT0: + case TEST_OUT_BIT1: + case TEST_OUT_BIT2: + case TEST_OUT_BIT3: + case TEST_OUT_BIT4: + case TEST_OUT_BIT5: + case TEST_OUT_BIT6: + case TEST_OUT_BIT7: + case TEST_OUT_BIT8: + case TEST_OUT_BIT9: + case TEST_OUT_BIT10: + case TEST_OUT_BIT11: + case TEST_OUT_BIT22: + case TEST_OUT_FULL_SUPP_TRACE: + return check_ioam6_data(&p, ioam6h, node1); + + case TEST_IN_BIT0: + case TEST_IN_BIT1: + case TEST_IN_BIT2: + case TEST_IN_BIT3: + case TEST_IN_BIT4: + case TEST_IN_BIT5: + case TEST_IN_BIT6: + case TEST_IN_BIT7: + case TEST_IN_BIT8: + case TEST_IN_BIT9: + case TEST_IN_BIT10: + case TEST_IN_BIT11: + case TEST_IN_BIT22: + case TEST_IN_FULL_SUPP_TRACE: + { + __u32 tmp32 = node2.egr_wide; + __u16 tmp16 = node2.egr_id; + int res; + + node2.egr_id = 0xffff; + node2.egr_wide = 0xffffffff; + + res = check_ioam6_data(&p, ioam6h, node2); + + node2.egr_id = tmp16; + node2.egr_wide = tmp32; + + return res; + } + + case TEST_FWD_FULL_SUPP_TRACE: + if (check_ioam6_data(&p, ioam6h, node3)) + return 1; + if (check_ioam6_data(&p, ioam6h, node2)) + return 1; + return check_ioam6_data(&p, ioam6h, node1); + + default: + break; + } + + return 1; +} + +static int str2id(const char *tname) +{ + if (!strcmp("out_undef_ns", tname)) + return TEST_OUT_UNDEF_NS; + if (!strcmp("out_no_room", tname)) + return TEST_OUT_NO_ROOM; + if (!strcmp("out_bit0", tname)) + return TEST_OUT_BIT0; + if (!strcmp("out_bit1", tname)) + return TEST_OUT_BIT1; + if (!strcmp("out_bit2", tname)) + return TEST_OUT_BIT2; + if (!strcmp("out_bit3", tname)) + return TEST_OUT_BIT3; + if (!strcmp("out_bit4", tname)) + return TEST_OUT_BIT4; + if (!strcmp("out_bit5", tname)) + return TEST_OUT_BIT5; + if (!strcmp("out_bit6", tname)) + return TEST_OUT_BIT6; + if (!strcmp("out_bit7", tname)) + return TEST_OUT_BIT7; + if (!strcmp("out_bit8", tname)) + return TEST_OUT_BIT8; + if (!strcmp("out_bit9", tname)) + return TEST_OUT_BIT9; + if (!strcmp("out_bit10", tname)) + return TEST_OUT_BIT10; + if (!strcmp("out_bit11", tname)) + return TEST_OUT_BIT11; + if (!strcmp("out_bit22", tname)) + return TEST_OUT_BIT22; + if (!strcmp("out_full_supp_trace", tname)) + return TEST_OUT_FULL_SUPP_TRACE; + if (!strcmp("in_undef_ns", tname)) + return TEST_IN_UNDEF_NS; + if (!strcmp("in_no_room", tname)) + return TEST_IN_NO_ROOM; + if (!strcmp("in_oflag", tname)) + return TEST_IN_OFLAG; + if (!strcmp("in_bit0", tname)) + return TEST_IN_BIT0; + if (!strcmp("in_bit1", tname)) + return TEST_IN_BIT1; + if (!strcmp("in_bit2", tname)) + return TEST_IN_BIT2; + if (!strcmp("in_bit3", tname)) + return TEST_IN_BIT3; + if (!strcmp("in_bit4", tname)) + return TEST_IN_BIT4; + if (!strcmp("in_bit5", tname)) + return TEST_IN_BIT5; + if (!strcmp("in_bit6", tname)) + return TEST_IN_BIT6; + if (!strcmp("in_bit7", tname)) + return TEST_IN_BIT7; + if (!strcmp("in_bit8", tname)) + return TEST_IN_BIT8; + if (!strcmp("in_bit9", tname)) + return TEST_IN_BIT9; + if (!strcmp("in_bit10", tname)) + return TEST_IN_BIT10; + if (!strcmp("in_bit11", tname)) + return TEST_IN_BIT11; + if (!strcmp("in_bit22", tname)) + return TEST_IN_BIT22; + if (!strcmp("in_full_supp_trace", tname)) + return TEST_IN_FULL_SUPP_TRACE; + if (!strcmp("fwd_full_supp_trace", tname)) + return TEST_FWD_FULL_SUPP_TRACE; + + return -1; +} + +static int ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) +{ + return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | + (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | + (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | + (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; +} + +static int get_u32(__u32 *val, const char *arg, int base) +{ + unsigned long res; + char *ptr; + + if (!arg || !*arg) + return -1; + res = strtoul(arg, &ptr, base); + + if (!ptr || ptr == arg || *ptr) + return -1; + + if (res == ULONG_MAX && errno == ERANGE) + return -1; + + if (res > 0xFFFFFFFFUL) + return -1; + + *val = res; + return 0; +} + +static int get_u16(__u16 *val, const char *arg, int base) +{ + unsigned long res; + char *ptr; + + if (!arg || !*arg) + return -1; + res = strtoul(arg, &ptr, base); + + if (!ptr || ptr == arg || *ptr) + return -1; + + if (res == ULONG_MAX && errno == ERANGE) + return -1; + + if (res > 0xFFFFUL) + return -1; + + *val = res; + return 0; +} + +static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { + [TEST_OUT_UNDEF_NS] = check_ioam_header, + [TEST_OUT_NO_ROOM] = check_ioam_header, + [TEST_OUT_BIT0] = check_ioam_header_and_data, + [TEST_OUT_BIT1] = check_ioam_header_and_data, + [TEST_OUT_BIT2] = check_ioam_header_and_data, + [TEST_OUT_BIT3] = check_ioam_header_and_data, + [TEST_OUT_BIT4] = check_ioam_header_and_data, + [TEST_OUT_BIT5] = check_ioam_header_and_data, + [TEST_OUT_BIT6] = check_ioam_header_and_data, + [TEST_OUT_BIT7] = check_ioam_header_and_data, + [TEST_OUT_BIT8] = check_ioam_header_and_data, + [TEST_OUT_BIT9] = check_ioam_header_and_data, + [TEST_OUT_BIT10] = check_ioam_header_and_data, + [TEST_OUT_BIT11] = check_ioam_header_and_data, + [TEST_OUT_BIT22] = check_ioam_header_and_data, + [TEST_OUT_FULL_SUPP_TRACE] = check_ioam_header_and_data, + [TEST_IN_UNDEF_NS] = check_ioam_header, + [TEST_IN_NO_ROOM] = check_ioam_header, + [TEST_IN_OFLAG] = check_ioam_header, + [TEST_IN_BIT0] = check_ioam_header_and_data, + [TEST_IN_BIT1] = check_ioam_header_and_data, + [TEST_IN_BIT2] = check_ioam_header_and_data, + [TEST_IN_BIT3] = check_ioam_header_and_data, + [TEST_IN_BIT4] = check_ioam_header_and_data, + [TEST_IN_BIT5] = check_ioam_header_and_data, + [TEST_IN_BIT6] = check_ioam_header_and_data, + [TEST_IN_BIT7] = check_ioam_header_and_data, + [TEST_IN_BIT8] = check_ioam_header_and_data, + [TEST_IN_BIT9] = check_ioam_header_and_data, + [TEST_IN_BIT10] = check_ioam_header_and_data, + [TEST_IN_BIT11] = check_ioam_header_and_data, + [TEST_IN_BIT22] = check_ioam_header_and_data, + [TEST_IN_FULL_SUPP_TRACE] = check_ioam_header_and_data, + [TEST_FWD_FULL_SUPP_TRACE] = check_ioam_header_and_data, +}; + +int main(int argc, char **argv) +{ + int fd, size, hoplen, tid, ret = 1; + struct in6_addr src, dst; + struct ioam6_hdr *opt; + struct ipv6hdr *ip6h; + __u8 buffer[400], *p; + __u16 ioam_ns; + __u32 tr_type; + + if (argc != 7) + goto out; + + tid = str2id(argv[2]); + if (tid < 0 || !func[tid]) + goto out; + + if (inet_pton(AF_INET6, argv[3], &src) != 1 || + inet_pton(AF_INET6, argv[4], &dst) != 1) + goto out; + + if (get_u32(&tr_type, argv[5], 16) || + get_u16(&ioam_ns, argv[6], 0)) + goto out; + + fd = socket(AF_PACKET, SOCK_DGRAM, __cpu_to_be16(ETH_P_IPV6)); + if (!fd) + goto out; + + if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, + argv[1], strlen(argv[1]))) + goto close; + +recv: + size = recv(fd, buffer, sizeof(buffer), 0); + if (size <= 0) + goto close; + + ip6h = (struct ipv6hdr *)buffer; + + if (!ipv6_addr_equal(&ip6h->saddr, &src) || + !ipv6_addr_equal(&ip6h->daddr, &dst)) + goto recv; + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + goto close; + + p = buffer + sizeof(*ip6h); + hoplen = (p[1] + 1) << 3; + p += sizeof(struct ipv6_hopopt_hdr); + + while (hoplen > 0) { + opt = (struct ioam6_hdr *)p; + + if (opt->opt_type == IPV6_TLV_IOAM && + opt->type == IOAM6_TYPE_PREALLOC) { + p += sizeof(*opt); + ret = func[tid](tid, (struct ioam6_trace_hdr *)p, + tr_type, ioam_ns); + break; + } + + p += opt->opt_len + 2; + hoplen -= opt->opt_len + 2; + } +close: + close(fd); +out: + return ret; +} diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index f23438d512c5..3d7dde2c321b 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -484,13 +484,16 @@ enum desc_type { MONITOR_ACQUIRE, EXPIRE_STATE, EXPIRE_POLICY, + SPDINFO_ATTRS, }; const char *desc_name[] = { "create tunnel", "alloc spi", "monitor acquire", "expire state", - "expire policy" + "expire policy", + "spdinfo attributes", + "" }; struct xfrm_desc { enum desc_type type; @@ -1593,6 +1596,155 @@ out_close: return ret; } +static int xfrm_spdinfo_set_thresh(int xfrm_sock, uint32_t *seq, + unsigned thresh4_l, unsigned thresh4_r, + unsigned thresh6_l, unsigned thresh6_r, + bool add_bad_attr) + +{ + struct { + struct nlmsghdr nh; + union { + uint32_t unused; + int error; + }; + char attrbuf[MAX_PAYLOAD]; + } req; + struct xfrmu_spdhthresh thresh; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.unused)); + req.nh.nlmsg_type = XFRM_MSG_NEWSPDINFO; + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_seq = (*seq)++; + + thresh.lbits = thresh4_l; + thresh.rbits = thresh4_r; + if (rtattr_pack(&req.nh, sizeof(req), XFRMA_SPD_IPV4_HTHRESH, &thresh, sizeof(thresh))) + return -1; + + thresh.lbits = thresh6_l; + thresh.rbits = thresh6_r; + if (rtattr_pack(&req.nh, sizeof(req), XFRMA_SPD_IPV6_HTHRESH, &thresh, sizeof(thresh))) + return -1; + + if (add_bad_attr) { + BUILD_BUG_ON(XFRMA_IF_ID <= XFRMA_SPD_MAX + 1); + if (rtattr_pack(&req.nh, sizeof(req), XFRMA_IF_ID, NULL, 0)) { + pr_err("adding attribute failed: no space"); + return -1; + } + } + + if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) { + pr_err("send()"); + return -1; + } + + if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) { + pr_err("recv()"); + return -1; + } else if (req.nh.nlmsg_type != NLMSG_ERROR) { + printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type); + return -1; + } + + if (req.error) { + printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error)); + return -1; + } + + return 0; +} + +static int xfrm_spdinfo_attrs(int xfrm_sock, uint32_t *seq) +{ + struct { + struct nlmsghdr nh; + union { + uint32_t unused; + int error; + }; + char attrbuf[MAX_PAYLOAD]; + } req; + + if (xfrm_spdinfo_set_thresh(xfrm_sock, seq, 32, 31, 120, 16, false)) { + pr_err("Can't set SPD HTHRESH"); + return KSFT_FAIL; + } + + memset(&req, 0, sizeof(req)); + + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.unused)); + req.nh.nlmsg_type = XFRM_MSG_GETSPDINFO; + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_seq = (*seq)++; + if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) { + pr_err("send()"); + return KSFT_FAIL; + } + + if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) { + pr_err("recv()"); + return KSFT_FAIL; + } else if (req.nh.nlmsg_type == XFRM_MSG_NEWSPDINFO) { + size_t len = NLMSG_PAYLOAD(&req.nh, sizeof(req.unused)); + struct rtattr *attr = (void *)req.attrbuf; + int got_thresh = 0; + + for (; RTA_OK(attr, len); attr = RTA_NEXT(attr, len)) { + if (attr->rta_type == XFRMA_SPD_IPV4_HTHRESH) { + struct xfrmu_spdhthresh *t = RTA_DATA(attr); + + got_thresh++; + if (t->lbits != 32 || t->rbits != 31) { + pr_err("thresh differ: %u, %u", + t->lbits, t->rbits); + return KSFT_FAIL; + } + } + if (attr->rta_type == XFRMA_SPD_IPV6_HTHRESH) { + struct xfrmu_spdhthresh *t = RTA_DATA(attr); + + got_thresh++; + if (t->lbits != 120 || t->rbits != 16) { + pr_err("thresh differ: %u, %u", + t->lbits, t->rbits); + return KSFT_FAIL; + } + } + } + if (got_thresh != 2) { + pr_err("only %d thresh returned by XFRM_MSG_GETSPDINFO", got_thresh); + return KSFT_FAIL; + } + } else if (req.nh.nlmsg_type != NLMSG_ERROR) { + printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type); + return KSFT_FAIL; + } else { + printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error)); + return -1; + } + + /* Restore the default */ + if (xfrm_spdinfo_set_thresh(xfrm_sock, seq, 32, 32, 128, 128, false)) { + pr_err("Can't restore SPD HTHRESH"); + return KSFT_FAIL; + } + + /* + * At this moment xfrm uses nlmsg_parse_deprecated(), which + * implies NL_VALIDATE_LIBERAL - ignoring attributes with + * (type > maxtype). nla_parse_depricated_strict() would enforce + * it. Or even stricter nla_parse(). + * Right now it's not expected to fail, but to be ignored. + */ + if (xfrm_spdinfo_set_thresh(xfrm_sock, seq, 32, 32, 128, 128, true)) + return KSFT_PASS; + + return KSFT_PASS; +} + static int child_serv(int xfrm_sock, uint32_t *seq, unsigned int nr, int cmd_fd, void *buf, struct xfrm_desc *desc) { @@ -1717,6 +1869,9 @@ static int child_f(unsigned int nr, int test_desc_fd, int cmd_fd, void *buf) case EXPIRE_POLICY: ret = xfrm_expire_policy(xfrm_sock, &seq, nr, &desc); break; + case SPDINFO_ATTRS: + ret = xfrm_spdinfo_attrs(xfrm_sock, &seq); + break; default: printk("Unknown desc type %d", desc.type); exit(KSFT_FAIL); @@ -1994,8 +2149,10 @@ static int write_proto_plan(int fd, int proto) * sizeof(xfrm_user_polexpire) = 168 | sizeof(xfrm_user_polexpire) = 176 * * Check the affected by the UABI difference structures. + * Also, check translation for xfrm_set_spdinfo: it has it's own attributes + * which needs to be correctly copied, but not translated. */ -const unsigned int compat_plan = 4; +const unsigned int compat_plan = 5; static int write_compat_struct_tests(int test_desc_fd) { struct xfrm_desc desc = {}; @@ -2019,6 +2176,10 @@ static int write_compat_struct_tests(int test_desc_fd) if (__write_desc(test_desc_fd, &desc)) return -1; + desc.type = SPDINFO_ATTRS; + if (__write_desc(test_desc_fd, &desc)) + return -1; + return 0; } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index d88e1fdfb147..89c4753c2760 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -6,6 +6,7 @@ #include <limits.h> #include <fcntl.h> #include <string.h> +#include <stdarg.h> #include <stdbool.h> #include <stdint.h> #include <stdio.h> @@ -25,6 +26,7 @@ #include <netinet/in.h> #include <linux/tcp.h> +#include <linux/time_types.h> extern int optind; @@ -66,6 +68,13 @@ static unsigned int cfg_do_w; static int cfg_wait; static uint32_t cfg_mark; +struct cfg_cmsg_types { + unsigned int cmsg_enabled:1; + unsigned int timestampns:1; +}; + +static struct cfg_cmsg_types cfg_cmsg_types; + static void die_usage(void) { fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]" @@ -80,11 +89,22 @@ static void die_usage(void) fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); + fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n"); fprintf(stderr, "\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n"); exit(1); } +static void xerror(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + static void handle_signal(int nr) { quit = true; @@ -338,6 +358,58 @@ static size_t do_write(const int fd, char *buf, const size_t len) return offset; } +static void process_cmsg(struct msghdr *msgh) +{ + struct __kernel_timespec ts; + bool ts_found = false; + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msgh); cmsg ; cmsg = CMSG_NXTHDR(msgh, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) { + memcpy(&ts, CMSG_DATA(cmsg), sizeof(ts)); + ts_found = true; + continue; + } + } + + if (cfg_cmsg_types.timestampns) { + if (!ts_found) + xerror("TIMESTAMPNS not present\n"); + } +} + +static ssize_t do_recvmsg_cmsg(const int fd, char *buf, const size_t len) +{ + char msg_buf[8192]; + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = msg_buf, + .msg_controllen = sizeof(msg_buf), + }; + int flags = 0; + int ret = recvmsg(fd, &msg, flags); + + if (ret <= 0) + return ret; + + if (msg.msg_controllen && !cfg_cmsg_types.cmsg_enabled) + xerror("got %lu bytes of cmsg data, expected 0\n", + (unsigned long)msg.msg_controllen); + + if (msg.msg_controllen == 0 && cfg_cmsg_types.cmsg_enabled) + xerror("%s\n", "got no cmsg data"); + + if (msg.msg_controllen) + process_cmsg(&msg); + + return ret; +} + static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) { int ret = 0; @@ -357,6 +429,8 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) } else if (cfg_peek == CFG_AFTER_PEEK) { ret = recv(fd, buf, cap, MSG_PEEK); ret = (ret < 0) ? ret : read(fd, buf, cap); + } else if (cfg_cmsg_types.cmsg_enabled) { + ret = do_recvmsg_cmsg(fd, buf, cap); } else { ret = read(fd, buf, cap); } @@ -786,6 +860,48 @@ static void init_rng(void) srand(foo); } +static void xsetsockopt(int fd, int level, int optname, const void *optval, socklen_t optlen) +{ + int err; + + err = setsockopt(fd, level, optname, optval, optlen); + if (err) { + perror("setsockopt"); + exit(1); + } +} + +static void apply_cmsg_types(int fd, const struct cfg_cmsg_types *cmsg) +{ + static const unsigned int on = 1; + + if (cmsg->timestampns) + xsetsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, &on, sizeof(on)); +} + +static void parse_cmsg_types(const char *type) +{ + char *next = strchr(type, ','); + unsigned int len = 0; + + cfg_cmsg_types.cmsg_enabled = 1; + + if (next) { + parse_cmsg_types(next + 1); + len = next - type; + } else { + len = strlen(type); + } + + if (strncmp(type, "TIMESTAMPNS", len) == 0) { + cfg_cmsg_types.timestampns = 1; + return; + } + + fprintf(stderr, "Unrecognized cmsg option %s\n", type); + exit(1); +} + int main_loop(void) { int fd; @@ -801,6 +917,8 @@ int main_loop(void) set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); + if (cfg_cmsg_types.cmsg_enabled) + apply_cmsg_types(fd, &cfg_cmsg_types); return copyfd_io(0, fd, 1); } @@ -887,7 +1005,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:P:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:P:c:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -943,6 +1061,9 @@ static void parse_opts(int argc, char **argv) case 'P': cfg_peek = parse_peek(optarg); break; + case 'c': + parse_cmsg_types(optarg); + break; } } @@ -976,6 +1097,8 @@ int main(int argc, char *argv[]) set_sndbuf(fd, cfg_sndbuf); if (cfg_mark) set_mark(fd, cfg_mark); + if (cfg_cmsg_types.cmsg_enabled) + apply_cmsg_types(fd, &cfg_cmsg_types); return main_loop_s(fd); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 2b495dc8d78e..559173a8e387 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -3,7 +3,7 @@ time_start=$(date +%s) -optstring="S:R:d:e:l:r:h4cm:f:t" +optstring="S:R:d:e:l:r:h4cm:f:tC" ret=0 sin="" sout="" @@ -22,6 +22,7 @@ sndbuf=0 rcvbuf=0 options_log=true do_tcp=0 +checksum=false filesize=0 if [ $tc_loss -eq 100 ];then @@ -47,6 +48,7 @@ usage() { echo -e "\t-R: set rcvbuf value (default: use kernel default)" echo -e "\t-m: test mode (poll, sendfile; default: poll)" echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)" + echo -e "\t-C: enable the MPTCP data checksum" } while getopts "$optstring" option;do @@ -104,6 +106,9 @@ while getopts "$optstring" option;do "t") do_tcp=$((do_tcp+1)) ;; + "C") + checksum=true + ;; "?") usage $0 exit 1 @@ -197,6 +202,12 @@ ip -net "$ns4" link set ns4eth3 up ip -net "$ns4" route add default via 10.0.3.2 ip -net "$ns4" route add default via dead:beef:3::2 +if $checksum; then + for i in "$ns1" "$ns2" "$ns3" "$ns4";do + ip netns exec $i sysctl -q net.mptcp.checksum_enabled=1 + done +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -669,6 +680,25 @@ run_tests_peekmode() run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}" } +display_time() +{ + time_end=$(date +%s) + time_run=$((time_end-time_start)) + + echo "Time: ${time_run} seconds" +} + +stop_if_error() +{ + local msg="$1" + + if [ ${ret} -ne 0 ]; then + echo "FAIL: ${msg}" 1>&2 + display_time + exit ${ret} + fi +} + make_file "$cin" "client" make_file "$sin" "server" @@ -676,6 +706,8 @@ check_mptcp_disabled check_mptcp_ulp_setsockopt +stop_if_error "The kernel configuration is not valid for MPTCP" + echo "INFO: validating network environment with pings" for sender in "$ns1" "$ns2" "$ns3" "$ns4";do do_ping "$ns1" $sender 10.0.1.1 @@ -695,6 +727,8 @@ for sender in "$ns1" "$ns2" "$ns3" "$ns4";do do_ping "$ns4" $sender dead:beef:3::1 done +stop_if_error "Could not even run ping tests" + [ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss delay ${tc_delay}ms echo -n "INFO: Using loss of $tc_loss " test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms " @@ -722,18 +756,13 @@ echo "on ns3eth4" tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${reorder_delay}ms $tc_reorder -for sender in $ns1 $ns2 $ns3 $ns4;do - run_tests_lo "$ns1" "$sender" 10.0.1.1 1 - if [ $ret -ne 0 ] ;then - echo "FAIL: Could not even run loopback test" 1>&2 - exit $ret - fi - run_tests_lo "$ns1" $sender dead:beef:1::1 1 - if [ $ret -ne 0 ] ;then - echo "FAIL: Could not even run loopback v6 test" 2>&1 - exit $ret - fi +run_tests_lo "$ns1" "$ns1" 10.0.1.1 1 +stop_if_error "Could not even run loopback test" +run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 +stop_if_error "Could not even run loopback v6 test" + +for sender in $ns1 $ns2 $ns3 $ns4;do # ns1<->ns2 is not subject to reordering/tc delays. Use it to test # mptcp syncookie support. if [ $sender = $ns1 ]; then @@ -742,6 +771,9 @@ for sender in $ns1 $ns2 $ns3 $ns4;do ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1 fi + run_tests "$ns1" $sender 10.0.1.1 + run_tests "$ns1" $sender dead:beef:1::1 + run_tests "$ns2" $sender 10.0.1.2 run_tests "$ns2" $sender dead:beef:1::2 run_tests "$ns2" $sender 10.0.2.1 @@ -754,14 +786,13 @@ for sender in $ns1 $ns2 $ns3 $ns4;do run_tests "$ns4" $sender 10.0.3.1 run_tests "$ns4" $sender dead:beef:3::1 + + stop_if_error "Tests with $sender as a sender have failed" done run_tests_peekmode "saveWithPeek" run_tests_peekmode "saveAfterPeek" +stop_if_error "Tests with peek mode have failed" -time_end=$(date +%s) -time_run=$((time_end-time_start)) - -echo "Time: ${time_run} seconds" - +display_time exit $ret diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fd99485cf2a4..255793c5ac4f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3,8 +3,10 @@ ret=0 sin="" +sinfail="" sout="" cin="" +cinfail="" cinsent="" cout="" ksft_skip=4 @@ -12,6 +14,7 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" capture=0 +checksum=0 do_all_tests=1 TEST_COUNT=0 @@ -49,6 +52,9 @@ init() ip netns exec $netns sysctl -q net.mptcp.enabled=1 ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + if [ $checksum -eq 1 ]; then + ip netns exec $netns sysctl -q net.mptcp.checksum_enabled=1 + fi done # ns1 ns2 @@ -72,6 +78,14 @@ init() done } +init_shapers() +{ + for i in `seq 1 4`; do + tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1 + tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1 + done +} + cleanup_partial() { rm -f "$capout" @@ -84,8 +98,8 @@ cleanup_partial() cleanup() { - rm -f "$cin" "$cout" - rm -f "$sin" "$sout" "$cinsent" + rm -f "$cin" "$cout" "$sinfail" + rm -f "$sin" "$sout" "$cinsent" "$cinfail" cleanup_partial } @@ -124,6 +138,28 @@ reset_with_add_addr_timeout() -j DROP } +reset_with_checksum() +{ + local ns1_enable=$1 + local ns2_enable=$2 + + reset + + ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable + ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable +} + +reset_with_allow_join_id0() +{ + local ns1_enable=$1 + local ns2_enable=$2 + + reset + + ip netns exec $ns1 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns1_enable + ip netns exec $ns2 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns2_enable +} + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -185,11 +221,15 @@ link_failure() { ns="$1" - l=$((RANDOM%4)) - l=$((l+1)) + if [ -z "$FAILING_LINKS" ]; then + l=$((RANDOM%4)) + FAILING_LINKS=$((l+1)) + fi - veth="ns1eth$l" - ip -net "$ns" link set "$veth" down + for l in $FAILING_LINKS; do + veth="ns1eth$l" + ip -net "$ns" link set "$veth" down + done } # $1: IP address @@ -254,10 +294,17 @@ do_transfer() local_addr="0.0.0.0" fi - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ - ${local_addr} < "$sin" > "$sout" & + if [ "$test_link_fail" -eq 2 ];then + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${cl_proto} \ + ${local_addr} < "$sinfail" > "$sout" & + else + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & + fi spid=$! sleep 1 @@ -268,7 +315,7 @@ do_transfer() $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ $connect_addr < "$cin" > "$cout" & else - ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \ + ( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \ tee "$cinsent" | \ timeout ${timeout_test} \ ip netns exec ${connector_ns} \ @@ -297,17 +344,18 @@ do_transfer() let rm_nr_ns1=-addr_nr_ns1 if [ $rm_nr_ns1 -lt 8 ]; then counter=1 + pos=1 dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`) if [ ${#dump[@]} -gt 0 ]; then - id=${dump[1]} sleep 1 while [ $counter -le $rm_nr_ns1 ] do + id=${dump[$pos]} ip netns exec ${listener_ns} ./pm_nl_ctl del $id sleep 1 let counter+=1 - let id+=1 + let pos+=5 done fi elif [ $rm_nr_ns1 -eq 8 ]; then @@ -319,6 +367,12 @@ do_transfer() fi fi + flags="subflow" + if [[ "${addr_nr_ns2}" = "fullmesh_"* ]]; then + flags="${flags},fullmesh" + addr_nr_ns2=${addr_nr_ns2:9} + fi + if [ $addr_nr_ns2 -gt 0 ]; then let add_nr_ns2=addr_nr_ns2 counter=3 @@ -330,7 +384,7 @@ do_transfer() else addr="10.0.$counter.2" fi - ip netns exec $ns2 ./pm_nl_ctl add $addr flags subflow + ip netns exec $ns2 ./pm_nl_ctl add $addr flags $flags let counter+=1 let add_nr_ns2-=1 done @@ -339,17 +393,18 @@ do_transfer() let rm_nr_ns2=-addr_nr_ns2 if [ $rm_nr_ns2 -lt 8 ]; then counter=1 + pos=1 dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`) if [ ${#dump[@]} -gt 0 ]; then - id=${dump[1]} sleep 1 while [ $counter -le $rm_nr_ns2 ] do + id=${dump[$pos]} ip netns exec ${connector_ns} ./pm_nl_ctl del $id sleep 1 let counter+=1 - let id+=1 + let pos+=5 done fi elif [ $rm_nr_ns2 -eq 8 ]; then @@ -408,7 +463,11 @@ do_transfer() return 1 fi - check_transfer $sin $cout "file received by client" + if [ "$test_link_fail" -eq 2 ];then + check_transfer $sinfail $cout "file received by client" + else + check_transfer $sin $cout "file received by client" + fi retc=$? if [ "$test_link_fail" -eq 0 ];then check_transfer $cin $sout "file received by server" @@ -451,28 +510,108 @@ run_tests() lret=0 oldin="" - if [ "$test_linkfail" -eq 1 ];then - size=$((RANDOM%1024)) + # create the input file for the failure test when + # the first failure test run + if [ "$test_linkfail" -ne 0 -a -z "$cinfail" ]; then + # the client file must be considerably larger + # of the maximum expected cwin value, or the + # link utilization will be not predicable + size=$((RANDOM%2)) size=$((size+1)) - size=$((size*128)) + size=$((size*8192)) + size=$((size + ( $RANDOM % 8192) )) - oldin=$(mktemp) - cp "$cin" "$oldin" - make_file "$cin" "client" $size + cinfail=$(mktemp) + make_file "$cinfail" "client" $size + fi + + if [ "$test_linkfail" -eq 2 -a -z "$sinfail" ]; then + size=$((RANDOM%16)) + size=$((size+1)) + size=$((size*2048)) + + sinfail=$(mktemp) + make_file "$sinfail" "server" $size fi do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \ ${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup} lret=$? +} + +chk_csum_nr() +{ + local msg=${1:-""} + local count + local dump_stats + + if [ ! -z "$msg" ]; then + printf "%02u" "$TEST_COUNT" + else + echo -n " " + fi + printf " %-36s %s" "$msg" "sum" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != 0 ]; then + echo "[fail] got $count data checksum error[s] expected 0" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + echo -n " - csum " + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != 0 ]; then + echo "[fail] got $count data checksum error[s] expected 0" + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp + fi +} + +chk_fail_nr() +{ + local mp_fail_nr_tx=$1 + local mp_fail_nr_rx=$2 + local count + local dump_stats + + printf "%-39s %s" " " "ftx" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$mp_fail_nr_tx" ]; then + echo "[fail] got $count MP_FAIL[s] TX expected $mp_fail_nr_tx" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi - if [ "$test_linkfail" -eq 1 ];then - cp "$oldin" "$cin" - rm -f "$oldin" + echo -n " - frx " + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$mp_fail_nr_rx" ]; then + echo "[fail] got $count MP_FAIL[s] RX expected $mp_fail_nr_rx" + ret=1 + dump_stats=1 + else + echo "[ ok ]" fi - if [ $lret -ne 0 ]; then - ret=$lret - return + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp fi } @@ -523,6 +662,50 @@ chk_join_nr() echo Client ns stats ip netns exec $ns2 nstat -as | grep MPTcp fi + if [ $checksum -eq 1 ]; then + chk_csum_nr + chk_fail_nr 0 0 + fi +} + +# a negative value for 'stale_max' means no upper bound: +# for bidirectional transfer, if one peer sleep for a while +# - as these tests do - we can have a quite high number of +# stale/recover conversions, proportional to +# sleep duration/ MPTCP-level RTX interval. +chk_stale_nr() +{ + local ns=$1 + local stale_min=$2 + local stale_max=$3 + local stale_delta=$4 + local dump_stats + local stale_nr + local recover_nr + + printf "%-39s %-18s" " " "stale" + stale_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowStale | awk '{print $2}'` + [ -z "$stale_nr" ] && stale_nr=0 + recover_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowRecover | awk '{print $2}'` + [ -z "$recover_nr" ] && recover_nr=0 + + if [ $stale_nr -lt $stale_min ] || + [ $stale_max -gt 0 -a $stale_nr -gt $stale_max ] || + [ $((stale_nr - $recover_nr)) -ne $stale_delta ]; then + echo "[fail] got $stale_nr stale[s] $recover_nr recover[s], " \ + " expected stale in range [$stale_min..$stale_max]," \ + " stale-recover delta $stale_delta " + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + + if [ "${dump_stats}" = 1 ]; then + echo $ns stats + ip netns exec $ns ip -s link show + ip netns exec $ns nstat -as | grep MPTcp + fi } chk_add_nr() @@ -733,6 +916,27 @@ chk_prio_nr() fi } +chk_link_usage() +{ + local ns=$1 + local link=$2 + local out=$3 + local expected_rate=$4 + local tx_link=`ip netns exec $ns cat /sys/class/net/$link/statistics/tx_bytes` + local tx_total=`ls -l $out | awk '{print $5}'` + local tx_rate=$((tx_link * 100 / $tx_total)) + local tolerance=5 + + printf "%-39s %-18s" " " "link usage" + if [ $tx_rate -lt $((expected_rate - $tolerance)) -o \ + $tx_rate -gt $((expected_rate + $tolerance)) ]; then + echo "[fail] got $tx_rate% usage, expected $expected_rate%" + ret=1 + else + echo "[ ok ]" + fi +} + subflows_tests() { reset @@ -850,20 +1054,101 @@ signal_address_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "signal invalid addresses" 1 1 1 chk_add_nr 3 3 + + # signal addresses race test + reset + ip netns exec $ns1 ./pm_nl_ctl limits 4 4 + ip netns exec $ns2 ./pm_nl_ctl limits 4 4 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.1.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal + run_tests $ns1 $ns2 10.0.1.1 + chk_add_nr 4 4 } link_failure_tests() { # accept and use add_addr with additional subflows and link loss reset + + # without any b/w limit each veth could spool the packets and get + # them acked at xmit time, so that the corresponding subflow will + # have almost always no outstanding pkts, the scheduler will pick + # always the first subflow and we will have hard time testing + # active backup and link switch-over. + # Let's set some arbitrary (low) virtual link limits. + init_shapers ip netns exec $ns1 ./pm_nl_ctl limits 0 3 - ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 3 - ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow - ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow run_tests $ns1 $ns2 10.0.1.1 1 chk_join_nr "multiple flows, signal, link failure" 3 3 3 chk_add_nr 1 1 + chk_stale_nr $ns2 1 5 1 + + # accept and use add_addr with additional subflows and link loss + # for bidirectional transfer + reset + init_shapers + ip netns exec $ns1 ./pm_nl_ctl limits 0 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 3 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow + run_tests $ns1 $ns2 10.0.1.1 2 + chk_join_nr "multi flows, signal, bidi, link fail" 3 3 3 + chk_add_nr 1 1 + chk_stale_nr $ns2 1 -1 1 + + # 2 subflows plus 1 backup subflow with a lossy link, backup + # will never be used + reset + init_shapers + ip netns exec $ns1 ./pm_nl_ctl limits 0 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 2 + export FAILING_LINKS="1" + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup + run_tests $ns1 $ns2 10.0.1.1 1 + chk_join_nr "backup subflow unused, link failure" 2 2 2 + chk_add_nr 1 1 + chk_link_usage $ns2 ns2eth3 $cinsent 0 + + # 2 lossy links after half transfer, backup will get half of + # the traffic + reset + init_shapers + ip netns exec $ns1 ./pm_nl_ctl limits 0 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 2 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup + export FAILING_LINKS="1 2" + run_tests $ns1 $ns2 10.0.1.1 1 + chk_join_nr "backup flow used, multi links fail" 2 2 2 + chk_add_nr 1 1 + chk_stale_nr $ns2 2 4 2 + chk_link_usage $ns2 ns2eth3 $cinsent 50 + + # use a backup subflow with the first subflow on a lossy link + # for bidirectional transfer + reset + init_shapers + ip netns exec $ns1 ./pm_nl_ctl limits 0 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 3 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup + run_tests $ns1 $ns2 10.0.1.1 2 + chk_join_nr "backup flow used, bidi, link failure" 2 2 2 + chk_add_nr 1 1 + chk_stale_nr $ns2 1 -1 2 + chk_link_usage $ns2 ns2eth3 $cinsent 50 } add_addr_timeout_tests() @@ -1341,7 +1626,7 @@ syncookies_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr "subflows limited by server w cookies" 2 2 1 + chk_join_nr "subflows limited by server w cookies" 2 1 1 # test signal address with cookies reset_with_cookies @@ -1374,6 +1659,143 @@ syncookies_tests() chk_add_nr 1 1 } +checksum_tests() +{ + # checksum test 0 0 + reset_with_checksum 0 0 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 0 0" + + # checksum test 1 1 + reset_with_checksum 1 1 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 1 1" + + # checksum test 0 1 + reset_with_checksum 0 1 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 0 1" + + # checksum test 1 0 + reset_with_checksum 1 0 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 1 0" +} + +deny_join_id0_tests() +{ + # subflow allow join id0 ns1 + reset_with_allow_join_id0 1 0 + ip netns exec $ns1 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "single subflow allow join id0 ns1" 1 1 1 + + # subflow allow join id0 ns2 + reset_with_allow_join_id0 0 1 + ip netns exec $ns1 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "single subflow allow join id0 ns2" 0 0 0 + + # signal address allow join id0 ns1 + # ADD_ADDRs are not affected by allow_join_id0 value. + reset_with_allow_join_id0 1 0 + ip netns exec $ns1 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal address allow join id0 ns1" 1 1 1 + chk_add_nr 1 1 + + # signal address allow join id0 ns2 + # ADD_ADDRs are not affected by allow_join_id0 value. + reset_with_allow_join_id0 0 1 + ip netns exec $ns1 ./pm_nl_ctl limits 1 1 + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal address allow join id0 ns2" 1 1 1 + chk_add_nr 1 1 + + # subflow and address allow join id0 ns1 + reset_with_allow_join_id0 1 0 + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "subflow and address allow join id0 1" 2 2 2 + + # subflow and address allow join id0 ns2 + reset_with_allow_join_id0 0 1 + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "subflow and address allow join id0 2" 1 1 1 +} + +fullmesh_tests() +{ + # fullmesh 1 + # 2 fullmesh addrs in ns2, added before the connection, + # 1 non-fullmesh addr in ns1, added during the connection. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 4 + ip netns exec $ns2 ./pm_nl_ctl limits 1 4 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow,fullmesh + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow,fullmesh + run_tests $ns1 $ns2 10.0.1.1 0 1 0 slow + chk_join_nr "fullmesh test 2x1" 4 4 4 + chk_add_nr 1 1 + + # fullmesh 2 + # 1 non-fullmesh addr in ns1, added before the connection, + # 1 fullmesh addr in ns2, added during the connection. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 1 3 + ip netns exec $ns2 ./pm_nl_ctl limits 1 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_1 slow + chk_join_nr "fullmesh test 1x1" 3 3 3 + chk_add_nr 1 1 + + # fullmesh 3 + # 1 non-fullmesh addr in ns1, added before the connection, + # 2 fullmesh addrs in ns2, added during the connection. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 2 5 + ip netns exec $ns2 ./pm_nl_ctl limits 1 5 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow + chk_join_nr "fullmesh test 1x2" 5 5 5 + chk_add_nr 1 1 + + # fullmesh 4 + # 1 non-fullmesh addr in ns1, added before the connection, + # 2 fullmesh addrs in ns2, added during the connection, + # limit max_subflows to 4. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 2 4 + ip netns exec $ns2 ./pm_nl_ctl limits 1 4 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow + chk_join_nr "fullmesh test 1x2, limited" 4 4 4 + chk_add_nr 1 1 +} + all_tests() { subflows_tests @@ -1387,6 +1809,9 @@ all_tests() backup_tests add_addr_ports_tests syncookies_tests + checksum_tests + deny_join_id0_tests + fullmesh_tests } usage() @@ -1403,7 +1828,11 @@ usage() echo " -b backup_tests" echo " -p add_addr_ports_tests" echo " -k syncookies_tests" + echo " -S checksum_tests" + echo " -d deny_join_id0_tests" + echo " -m fullmesh_tests" echo " -c capture pcap files" + echo " -C enable data checksum" echo " -h help" } @@ -1418,13 +1847,16 @@ make_file "$sin" "server" 1 trap cleanup EXIT for arg in "$@"; do - # check for "capture" arg before launching tests + # check for "capture/checksum" args before launching tests if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then capture=1 fi + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"C"[0-9a-zA-Z]*$ ]]; then + checksum=1 + fi - # exception for the capture option, the rest means: a part of the tests - if [ "${arg}" != "-c" ]; then + # exception for the capture/checksum options, the rest means: a part of the tests + if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ]; then do_all_tests=0 fi done @@ -1434,7 +1866,7 @@ if [ $do_all_tests -eq 1 ]; then exit $ret fi -while getopts 'fsltra64bpkch' opt; do +while getopts 'fsltra64bpkdmchCS' opt; do case $opt in f) subflows_tests @@ -1469,8 +1901,19 @@ while getopts 'fsltra64bpkch' opt; do k) syncookies_tests ;; + S) + checksum_tests + ;; + d) + deny_join_id0_tests + ;; + m) + fullmesh_tests + ;; c) ;; + C) + ;; h | *) usage ;; diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 2fa13946ac04..1579e471a5e7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -178,7 +178,7 @@ do_transfer() timeout ${timeout_test} \ ip netns exec ${listener_ns} \ - $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} \ + $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c TIMESTAMPNS \ ${local_addr} < "$sin" > "$sout" & spid=$! @@ -186,7 +186,7 @@ do_transfer() timeout ${timeout_test} \ ip netns exec ${connector_ns} \ - $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} \ + $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c TIMESTAMPNS \ $connect_addr < "$cin" > "$cout" & cpid=$! diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 115decfdc1ef..354784512748 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -25,7 +25,7 @@ static void syntax(char *argv[]) { fprintf(stderr, "%s add|get|set|del|flush|dump|accept [<args>]\n", argv[0]); - fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); + fprintf(stderr, "\tadd [flags signal|subflow|backup|fullmesh] [id <nr>] [dev <name>] <ip>\n"); fprintf(stderr, "\tdel <id> [<ip>]\n"); fprintf(stderr, "\tget <id>\n"); fprintf(stderr, "\tset <ip> [flags backup|nobackup]\n"); @@ -236,11 +236,18 @@ int add_addr(int fd, int pm_family, int argc, char *argv[]) flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; else if (!strcmp(tok, "backup")) flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else if (!strcmp(tok, "fullmesh")) + flags |= MPTCP_PM_ADDR_FLAG_FULLMESH; else error(1, errno, "unknown flag %s", argv[arg]); } + if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + error(1, errno, "error flag fullmesh"); + } + rta = (void *)(data + off); rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS; rta->rta_len = RTA_LENGTH(4); @@ -422,6 +429,13 @@ static void print_addr(struct rtattr *attrs, int len) printf(","); } + if (flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + printf("fullmesh"); + flags &= ~MPTCP_PM_ADDR_FLAG_FULLMESH; + if (flags) + printf(","); + } + /* bump unknown flags, if any */ if (flags) printf("0x%x", flags); diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 3aeef3bcb101..910d8126af8f 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -22,8 +22,8 @@ usage() { cleanup() { - rm -f "$cin" "$cout" - rm -f "$sin" "$sout" + rm -f "$cout" "$sout" + rm -f "$large" "$small" rm -f "$capout" local netns @@ -60,6 +60,8 @@ setup() for i in "$ns1" "$ns2" "$ns3";do ip netns add $i || exit $ksft_skip ip -net $i link set lo up + ip netns exec $i sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $i sysctl -q net.ipv4.conf.default.rp_filter=0 done ip link add ns1eth1 netns "$ns1" type veth peer name ns2eth1 netns "$ns2" @@ -80,7 +82,6 @@ setup() ip netns exec "$ns1" ./pm_nl_ctl limits 1 1 ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow - ip netns exec "$ns1" sysctl -q net.ipv4.conf.all.rp_filter=0 ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index 6365c7fd1262..b599003eb5ba 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -11,9 +11,11 @@ #include <sys/socket.h> #include <sys/wait.h> #include <linux/tcp.h> +#include <linux/udp.h> #include <arpa/inet.h> #include <net/if.h> #include <netinet/in.h> +#include <netinet/ip.h> #include <netdb.h> #include <fcntl.h> #include <libgen.h> @@ -26,6 +28,11 @@ #include <unistd.h> #include <time.h> #include <errno.h> +#include <getopt.h> + +#include <linux/xfrm.h> +#include <linux/ipsec.h> +#include <linux/pfkeyv2.h> #ifndef IPV6_UNICAST_IF #define IPV6_UNICAST_IF 76 @@ -95,6 +102,8 @@ struct sock_args { struct sockaddr_in6 v6; } md5_prefix; unsigned int prefix_len; + /* 0: default, -1: force off, +1: force on */ + int bind_key_ifindex; /* expected addresses and device index for connection */ const char *expected_dev; @@ -114,6 +123,9 @@ struct sock_args { struct in_addr in; struct in6_addr in6; } expected_raddr; + + /* ESP in UDP encap test */ + int use_xfrm; }; static int server_mode; @@ -262,11 +274,14 @@ static int tcp_md5sig(int sd, void *addr, socklen_t alen, struct sock_args *args } memcpy(&md5sig.tcpm_addr, addr, alen); - if (args->ifindex) { + if ((args->ifindex && args->bind_key_ifindex >= 0) || args->bind_key_ifindex >= 1) { opt = TCP_MD5SIG_EXT; md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_IFINDEX; md5sig.tcpm_ifindex = args->ifindex; + log_msg("TCP_MD5SIG_FLAG_IFINDEX set tcpm_ifindex=%d\n", md5sig.tcpm_ifindex); + } else { + log_msg("TCP_MD5SIG_FLAG_IFINDEX off\n", md5sig.tcpm_ifindex); } rc = setsockopt(sd, IPPROTO_TCP, opt, &md5sig, sizeof(md5sig)); @@ -1346,6 +1361,41 @@ static int bind_socket(int sd, struct sock_args *args) return 0; } +static int config_xfrm_policy(int sd, struct sock_args *args) +{ + struct xfrm_userpolicy_info policy = {}; + int type = UDP_ENCAP_ESPINUDP; + int xfrm_af = IP_XFRM_POLICY; + int level = SOL_IP; + + if (args->type != SOCK_DGRAM) { + log_error("Invalid socket type. Only DGRAM could be used for XFRM\n"); + return 1; + } + + policy.action = XFRM_POLICY_ALLOW; + policy.sel.family = args->version; + if (args->version == AF_INET6) { + xfrm_af = IPV6_XFRM_POLICY; + level = SOL_IPV6; + } + + policy.dir = XFRM_POLICY_OUT; + if (setsockopt(sd, level, xfrm_af, &policy, sizeof(policy)) < 0) + return 1; + + policy.dir = XFRM_POLICY_IN; + if (setsockopt(sd, level, xfrm_af, &policy, sizeof(policy)) < 0) + return 1; + + if (setsockopt(sd, IPPROTO_UDP, UDP_ENCAP, &type, sizeof(type)) < 0) { + log_err_errno("Failed to set xfrm encap"); + return 1; + } + + return 0; +} + static int lsock_init(struct sock_args *args) { long flags; @@ -1389,6 +1439,11 @@ static int lsock_init(struct sock_args *args) if (fcntl(sd, F_SETFD, FD_CLOEXEC) < 0) log_err_errno("Failed to set close-on-exec flag"); + if (args->use_xfrm && config_xfrm_policy(sd, args)) { + log_err_errno("Failed to set xfrm policy"); + goto err; + } + out: return sd; @@ -1772,7 +1827,15 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) return client_status; } -#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6L:0:1:2:3:Fbq" +#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6xL:0:1:2:3:Fbq" +#define OPT_FORCE_BIND_KEY_IFINDEX 1001 +#define OPT_NO_BIND_KEY_IFINDEX 1002 + +static struct option long_opts[] = { + {"force-bind-key-ifindex", 0, 0, OPT_FORCE_BIND_KEY_IFINDEX}, + {"no-bind-key-ifindex", 0, 0, OPT_NO_BIND_KEY_IFINDEX}, + {0, 0, 0, 0} +}; static void print_usage(char *prog) { @@ -1795,6 +1858,7 @@ static void print_usage(char *prog) " -D|R datagram (D) / raw (R) socket (default stream)\n" " -l addr local address to bind to in server mode\n" " -c addr local address to bind to in client mode\n" + " -x configure XFRM policy on socket\n" "\n" " -d dev bind socket to given device name\n" " -I dev bind socket to given device name - server mode\n" @@ -1808,6 +1872,10 @@ static void print_usage(char *prog) " -M password use MD5 sum protection\n" " -X password MD5 password for client mode\n" " -m prefix/len prefix and length to use for MD5 key\n" + " --no-bind-key-ifindex: Force TCP_MD5SIG_FLAG_IFINDEX off\n" + " --force-bind-key-ifindex: Force TCP_MD5SIG_FLAG_IFINDEX on\n" + " (default: only if -I is passed)\n" + "\n" " -g grp multicast group (e.g., 239.1.1.1)\n" " -i interactive mode (default is echo and terminate)\n" "\n" @@ -1843,7 +1911,7 @@ int main(int argc, char *argv[]) * process input args */ - while ((rc = getopt(argc, argv, GETOPT_STR)) != -1) { + while ((rc = getopt_long(argc, argv, GETOPT_STR, long_opts, NULL)) != -1) { switch (rc) { case 'B': both_mode = 1; @@ -1916,6 +1984,12 @@ int main(int argc, char *argv[]) case 'M': args.password = optarg; break; + case OPT_FORCE_BIND_KEY_IFINDEX: + args.bind_key_ifindex = 1; + break; + case OPT_NO_BIND_KEY_IFINDEX: + args.bind_key_ifindex = -1; + break; case 'X': args.client_pw = optarg; break; @@ -1966,6 +2040,9 @@ int main(int argc, char *argv[]) case 'q': quiet = 1; break; + case 'x': + args.use_xfrm = 1; + break; default: print_usage(argv[0]); return 1; diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 64cd2e23c568..543ad7513a8e 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -118,6 +118,16 @@ # below for IPv6 doesn't apply here, because, on IPv4, administrative MTU # changes alone won't affect PMTU # +# - pmtu_vti4_udp_exception +# Same as pmtu_vti4_exception, but using ESP-in-UDP +# +# - pmtu_vti4_udp_routed_exception +# Set up vti tunnel on top of veth connected through routing namespace and +# add xfrm states and policies with ESP-in-UDP encapsulation. Check that +# route exception is not created if link layer MTU is not exceeded, then +# lower MTU on second part of routed environment and check that exception +# is created with the expected PMTU. +# # - pmtu_vti6_exception # Set up vti6 tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. Check that route exception is @@ -125,6 +135,13 @@ # decrease and increase MTU of tunnel, checking that route exception PMTU # changes accordingly # +# - pmtu_vti6_udp_exception +# Same as pmtu_vti6_exception, but using ESP-in-UDP +# +# - pmtu_vti6_udp_routed_exception +# Same as pmtu_vti6_udp_routed_exception but with routing between vti +# endpoints +# # - pmtu_vti4_default_mtu # Set up vti4 tunnel on top of veth, in two namespaces with matching # endpoints. Check that MTU assigned to vti interface is the MTU of the @@ -224,6 +241,10 @@ tests=" pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1 pmtu_vti6_exception vti6: PMTU exceptions 0 pmtu_vti4_exception vti4: PMTU exceptions 0 + pmtu_vti6_udp_exception vti6: PMTU exceptions (ESP-in-UDP) 0 + pmtu_vti4_udp_exception vti4: PMTU exceptions (ESP-in-UDP) 0 + pmtu_vti6_udp_routed_exception vti6: PMTU exceptions, routed (ESP-in-UDP) 0 + pmtu_vti4_udp_routed_exception vti4: PMTU exceptions, routed (ESP-in-UDP) 0 pmtu_vti4_default_mtu vti4: default MTU assignment 0 pmtu_vti6_default_mtu vti6: default MTU assignment 0 pmtu_vti4_link_add_mtu vti4: MTU setting on link creation 0 @@ -246,7 +267,6 @@ ns_b="ip netns exec ${NS_B}" ns_c="ip netns exec ${NS_C}" ns_r1="ip netns exec ${NS_R1}" ns_r2="ip netns exec ${NS_R2}" - # Addressing and routing for tests with routers: four network segments, with # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an # identifier ID, which is 1 for hosts (A and B), 2 for routers (R1 and R2). @@ -279,7 +299,6 @@ routes=" A ${prefix6}:${b_r2}::1 ${prefix6}:${a_r2}::2 B default ${prefix6}:${b_r1}::2 " - USE_NH="no" # ns family nh id destination gateway nexthops=" @@ -326,6 +345,7 @@ dummy6_mask="64" err_buf= tcpdump_pids= +nettest_pids= err() { err_buf="${err_buf}${1} @@ -548,6 +568,14 @@ setup_vti6() { setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask} } +setup_vti4routed() { + setup_vti 4 ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 ${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask} +} + +setup_vti6routed() { + setup_vti 6 ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask} +} + setup_vxlan_or_geneve() { type="${1}" a_addr="${2}" @@ -619,18 +647,36 @@ setup_xfrm() { proto=${1} veth_a_addr="${2}" veth_b_addr="${3}" + encap=${4} - run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel || return 1 - run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel + run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel ${encap} || return 1 + run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel ${encap} run_cmd ${ns_a} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel run_cmd ${ns_a} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel - run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel - run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel + run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel ${encap} + run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel ${encap} run_cmd ${ns_b} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel run_cmd ${ns_b} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel } +setup_nettest_xfrm() { + which nettest >/dev/null + if [ $? -ne 0 ]; then + echo "'nettest' command not found; skipping tests" + return 1 + fi + + [ ${1} -eq 6 ] && proto="-6" || proto="" + port=${2} + + run_cmd ${ns_a} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + nettest_pids="${nettest_pids} $!" + + run_cmd ${ns_b} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + nettest_pids="${nettest_pids} $!" +} + setup_xfrm4() { setup_xfrm 4 ${veth4_a_addr} ${veth4_b_addr} } @@ -639,6 +685,26 @@ setup_xfrm6() { setup_xfrm 6 ${veth6_a_addr} ${veth6_b_addr} } +setup_xfrm4udp() { + setup_xfrm 4 ${veth4_a_addr} ${veth4_b_addr} "encap espinudp 4500 4500 0.0.0.0" + setup_nettest_xfrm 4 4500 +} + +setup_xfrm6udp() { + setup_xfrm 6 ${veth6_a_addr} ${veth6_b_addr} "encap espinudp 4500 4500 0.0.0.0" + setup_nettest_xfrm 6 4500 +} + +setup_xfrm4udprouted() { + setup_xfrm 4 ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "encap espinudp 4500 4500 0.0.0.0" + setup_nettest_xfrm 4 4500 +} + +setup_xfrm6udprouted() { + setup_xfrm 6 ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 "encap espinudp 4500 4500 0.0.0.0" + setup_nettest_xfrm 6 4500 +} + setup_routing_old() { for i in ${routes}; do [ "${ns}" = "" ] && ns="${i}" && continue @@ -823,6 +889,11 @@ cleanup() { done tcpdump_pids= + for pid in ${nettest_pids}; do + kill ${pid} + done + nettest_pids= + for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do ip netns del ${n} 2> /dev/null done @@ -1432,6 +1503,135 @@ test_pmtu_vti6_exception() { return ${fail} } +test_pmtu_vti4_udp_exception() { + setup namespaces veth vti4 xfrm4udp || return $ksft_skip + trace "${ns_a}" veth_a "${ns_b}" veth_b \ + "${ns_a}" vti4_a "${ns_b}" vti4_b + + veth_mtu=1500 + vti_mtu=$((veth_mtu - 20)) + + # UDP SPI SN IV ICV pad length next header + esp_payload_rfc4106=$((vti_mtu - 8 - 4 - 4 - 8 - 16 - 1 - 1)) + ping_payload=$((esp_payload_rfc4106 - 28)) + + mtu "${ns_a}" veth_a ${veth_mtu} + mtu "${ns_b}" veth_b ${veth_mtu} + mtu "${ns_a}" vti4_a ${vti_mtu} + mtu "${ns_b}" vti4_b ${vti_mtu} + + # Send DF packet without exceeding link layer MTU, check that no + # exception is created + run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr} + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" + check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 + + # Now exceed link layer MTU by one byte, check that exception is created + # with the right PMTU value + run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload + 1)) ${tunnel4_b_addr} + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" + check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))" +} + +test_pmtu_vti6_udp_exception() { + setup namespaces veth vti6 xfrm6udp || return $ksft_skip + trace "${ns_a}" veth_a "${ns_b}" veth_b \ + "${ns_a}" vti6_a "${ns_b}" vti6_b + fail=0 + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_a 4000 + mtu "${ns_b}" veth_b 4000 + mtu "${ns_a}" vti6_a 5000 + mtu "${ns_b}" vti6_b 5000 + run_cmd ${ns_a} ${ping6} -q -i 0.1 -w 1 -s 60000 ${tunnel6_b_addr} + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" + check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU" || return 1 + + # Decrease tunnel MTU, check for PMTU decrease in route exception + mtu "${ns_a}" vti6_a 3000 + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" + check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU" || fail=1 + + # Increase tunnel MTU, check for PMTU increase in route exception + mtu "${ns_a}" vti6_a 9000 + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" + check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU" || fail=1 + + return ${fail} +} + +test_pmtu_vti4_udp_routed_exception() { + setup namespaces routing vti4routed xfrm4udprouted || return $ksft_skip + trace "${ns_a}" veth_A-R1 "${ns_b}" veth_B-R1 \ + "${ns_a}" vti4_a "${ns_b}" vti4_b + + veth_mtu=1500 + vti_mtu=$((veth_mtu - 20)) + + # UDP SPI SN IV ICV pad length next header + esp_payload_rfc4106=$((vti_mtu - 8 - 4 - 4 - 8 - 16 - 1 - 1)) + ping_payload=$((esp_payload_rfc4106 - 28)) + + mtu "${ns_a}" veth_A-R1 ${veth_mtu} + mtu "${ns_r1}" veth_R1-A ${veth_mtu} + mtu "${ns_b}" veth_B-R1 ${veth_mtu} + mtu "${ns_r1}" veth_R1-B ${veth_mtu} + + mtu "${ns_a}" vti4_a ${vti_mtu} + mtu "${ns_b}" vti4_b ${vti_mtu} + + # Send DF packet without exceeding link layer MTU, check that no + # exception is created + run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr} + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" + check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 + + # Now decrease link layer MTU by 8 bytes on R1, check that exception is created + # with the right PMTU value + mtu "${ns_r1}" veth_R1-B $((veth_mtu - 8)) + run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload)) ${tunnel4_b_addr} + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" + check_pmtu_value "$((esp_payload_rfc4106 - 8))" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106)))" +} + +test_pmtu_vti6_udp_routed_exception() { + setup namespaces routing vti6routed xfrm6udprouted || return $ksft_skip + trace "${ns_a}" veth_A-R1 "${ns_b}" veth_B-R1 \ + "${ns_a}" vti6_a "${ns_b}" vti6_b + + veth_mtu=1500 + vti_mtu=$((veth_mtu - 40)) + + # UDP SPI SN IV ICV pad length next header + esp_payload_rfc4106=$((vti_mtu - 8 - 4 - 4 - 8 - 16 - 1 - 1)) + ping_payload=$((esp_payload_rfc4106 - 48)) + + mtu "${ns_a}" veth_A-R1 ${veth_mtu} + mtu "${ns_r1}" veth_R1-A ${veth_mtu} + mtu "${ns_b}" veth_B-R1 ${veth_mtu} + mtu "${ns_r1}" veth_R1-B ${veth_mtu} + + # mtu "${ns_a}" vti6_a ${vti_mtu} + # mtu "${ns_b}" vti6_b ${vti_mtu} + + run_cmd ${ns_a} ${ping6} -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel6_b_addr} + + # Check that exception was not created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" + check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 + + # Now decrease link layer MTU by 8 bytes on R1, check that exception is created + # with the right PMTU value + mtu "${ns_r1}" veth_R1-B $((veth_mtu - 8)) + run_cmd ${ns_a} ${ping6} -q -M want -i 0.1 -w 1 -s $((ping_payload)) ${tunnel6_b_addr} + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" + check_pmtu_value "$((esp_payload_rfc4106 - 8))" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106)))" + +} + test_pmtu_vti4_default_mtu() { setup namespaces veth vti4 || return $ksft_skip diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index db4521335722..3653d6468c67 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -111,8 +111,8 @@ static int sock_fanout_open(uint16_t typeflags, uint16_t group_id) static void sock_fanout_set_cbpf(int fd) { struct sock_filter bpf_filter[] = { - BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 80), /* ldb [80] */ - BPF_STMT(BPF_RET+BPF_A, 0), /* ret A */ + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 80), /* ldb [80] */ + BPF_STMT(BPF_RET | BPF_A, 0), /* ret A */ }; struct sock_fprog bpf_prog; diff --git a/tools/testing/selftests/net/psock_snd.sh b/tools/testing/selftests/net/psock_snd.sh index 170be65e0816..1cbfeb5052ec 100755 --- a/tools/testing/selftests/net/psock_snd.sh +++ b/tools/testing/selftests/net/psock_snd.sh @@ -86,9 +86,6 @@ echo "raw truncate hlen - 1 (expected to fail: EINVAL)" echo "raw gso min size" ./in_netns.sh ./psock_snd -v -c -g -l "${mss_exceeds}" -echo "raw gso min size - 1 (expected to fail)" -(! ./in_netns.sh ./psock_snd -v -c -g -l "${mss}") - echo "raw gso max size" ./in_netns.sh ./psock_snd -v -c -g -l "${max_mss}" diff --git a/tools/testing/selftests/net/run_afpackettests b/tools/testing/selftests/net/run_afpackettests index 8b42e8b04e0f..a59cb6a3c4f5 100755 --- a/tools/testing/selftests/net/run_afpackettests +++ b/tools/testing/selftests/net/run_afpackettests @@ -1,9 +1,12 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + if [ $(id -u) != 0 ]; then echo $msg must be run as root >&2 - exit 0 + exit $ksft_skip fi ret=0 diff --git a/tools/testing/selftests/net/setup_loopback.sh b/tools/testing/selftests/net/setup_loopback.sh new file mode 100755 index 000000000000..e57bbfbc5208 --- /dev/null +++ b/tools/testing/selftests/net/setup_loopback.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +readonly FLUSH_PATH="/sys/class/net/${dev}/gro_flush_timeout" +readonly IRQ_PATH="/sys/class/net/${dev}/napi_defer_hard_irqs" +readonly FLUSH_TIMEOUT="$(< ${FLUSH_PATH})" +readonly HARD_IRQS="$(< ${IRQ_PATH})" + +netdev_check_for_carrier() { + local -r dev="$1" + + for i in {1..5}; do + carrier="$(cat /sys/class/net/${dev}/carrier)" + if [[ "${carrier}" -ne 1 ]] ; then + echo "carrier not ready yet..." >&2 + sleep 1 + else + echo "carrier ready" >&2 + break + fi + done + echo "${carrier}" +} + +# Assumes that there is no existing ipvlan device on the physical device +setup_loopback_environment() { + local dev="$1" + + # Fail hard if cannot turn on loopback mode for current NIC + ethtool -K "${dev}" loopback on || exit 1 + sleep 1 + + # Check for the carrier + carrier=$(netdev_check_for_carrier ${dev}) + if [[ "${carrier}" -ne 1 ]] ; then + echo "setup_loopback_environment failed" + exit 1 + fi +} + +setup_macvlan_ns(){ + local -r link_dev="$1" + local -r ns_name="$2" + local -r ns_dev="$3" + local -r ns_mac="$4" + local -r addr="$5" + + ip link add link "${link_dev}" dev "${ns_dev}" \ + address "${ns_mac}" type macvlan + exit_code=$? + if [[ "${exit_code}" -ne 0 ]]; then + echo "setup_macvlan_ns failed" + exit $exit_code + fi + + [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" + ip link set dev "${ns_dev}" netns "${ns_name}" + ip -netns "${ns_name}" link set dev "${ns_dev}" up + if [[ -n "${addr}" ]]; then + ip -netns "${ns_name}" addr add dev "${ns_dev}" "${addr}" + fi + + sleep 1 +} + +cleanup_macvlan_ns(){ + while (( $# >= 2 )); do + ns_name="$1" + ns_dev="$2" + ip -netns "${ns_name}" link del dev "${ns_dev}" + ip netns del "${ns_name}" + shift 2 + done +} + +cleanup_loopback(){ + local -r dev="$1" + + ethtool -K "${dev}" loopback off + sleep 1 + + # Check for the carrier + carrier=$(netdev_check_for_carrier ${dev}) + if [[ "${carrier}" -ne 1 ]] ; then + echo "setup_loopback_environment failed" + exit 1 + fi +} + +setup_interrupt() { + # Use timer on host to trigger the network stack + # Also disable device interrupt to not depend on NIC interrupt + # Reduce test flakiness caused by unexpected interrupts + echo 100000 >"${FLUSH_PATH}" + echo 50 >"${IRQ_PATH}" +} + +setup_ns() { + # Set up server_ns namespace and client_ns namespace + setup_macvlan_ns "${dev}" server_ns server "${SERVER_MAC}" + setup_macvlan_ns "${dev}" client_ns client "${CLIENT_MAC}" +} + +cleanup_ns() { + cleanup_macvlan_ns server_ns server client_ns client +} + +setup() { + setup_loopback_environment "${dev}" + setup_interrupt +} + +cleanup() { + cleanup_loopback "${dev}" + + echo "${FLUSH_TIMEOUT}" >"${FLUSH_PATH}" + echo "${HARD_IRQS}" >"${IRQ_PATH}" +} diff --git a/tools/testing/selftests/net/setup_veth.sh b/tools/testing/selftests/net/setup_veth.sh new file mode 100644 index 000000000000..1003ddf7b3b2 --- /dev/null +++ b/tools/testing/selftests/net/setup_veth.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +setup_veth_ns() { + local -r link_dev="$1" + local -r ns_name="$2" + local -r ns_dev="$3" + local -r ns_mac="$4" + + [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" + echo 100000 > "/sys/class/net/${ns_dev}/gro_flush_timeout" + ip link set dev "${ns_dev}" netns "${ns_name}" mtu 65535 + ip -netns "${ns_name}" link set dev "${ns_dev}" up + + ip netns exec "${ns_name}" ethtool -K "${ns_dev}" gro on tso off +} + +setup_ns() { + # Set up server_ns namespace and client_ns namespace + ip link add name server type veth peer name client + + setup_veth_ns "${dev}" server_ns server "${SERVER_MAC}" + setup_veth_ns "${dev}" client_ns client "${CLIENT_MAC}" +} + +cleanup_ns() { + local ns_name + + for ns_name in client_ns server_ns; do + [[ -e /var/run/netns/"${ns_name}" ]] && ip netns del "${ns_name}" + done +} + +setup() { + # no global init setup step needed + : +} + +cleanup() { + cleanup_ns +} diff --git a/tools/testing/selftests/net/so_netns_cookie.c b/tools/testing/selftests/net/so_netns_cookie.c new file mode 100644 index 000000000000..b39e87e967cd --- /dev/null +++ b/tools/testing/selftests/net/so_netns_cookie.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sched.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/socket.h> + +#ifndef SO_NETNS_COOKIE +#define SO_NETNS_COOKIE 71 +#endif + +#define pr_err(fmt, ...) \ + ({ \ + fprintf(stderr, "%s:%d:" fmt ": %m\n", \ + __func__, __LINE__, ##__VA_ARGS__); \ + 1; \ + }) + +int main(int argc, char *argvp[]) +{ + uint64_t cookie1, cookie2; + socklen_t vallen; + int sock1, sock2; + + sock1 = socket(AF_INET, SOCK_STREAM, 0); + if (sock1 < 0) + return pr_err("Unable to create TCP socket"); + + vallen = sizeof(cookie1); + if (getsockopt(sock1, SOL_SOCKET, SO_NETNS_COOKIE, &cookie1, &vallen) != 0) + return pr_err("getsockopt(SOL_SOCKET, SO_NETNS_COOKIE)"); + + if (!cookie1) + return pr_err("SO_NETNS_COOKIE returned zero cookie"); + + if (unshare(CLONE_NEWNET)) + return pr_err("unshare"); + + sock2 = socket(AF_INET, SOCK_STREAM, 0); + if (sock2 < 0) + return pr_err("Unable to create TCP socket"); + + vallen = sizeof(cookie2); + if (getsockopt(sock2, SOL_SOCKET, SO_NETNS_COOKIE, &cookie2, &vallen) != 0) + return pr_err("getsockopt(SOL_SOCKET, SO_NETNS_COOKIE)"); + + if (!cookie2) + return pr_err("SO_NETNS_COOKIE returned zero cookie"); + + if (cookie1 == cookie2) + return pr_err("SO_NETNS_COOKIE returned identical cookies for distinct ns"); + + close(sock1); + close(sock2); + return 0; +} diff --git a/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh new file mode 100755 index 000000000000..aebaab8ce44c --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh @@ -0,0 +1,576 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Andrea Mayer <andrea.mayer@uniroma2.it> +# author: Paolo Lungaroni <paolo.lungaroni@uniroma2.it> + +# This test is designed for evaluating the new SRv6 End.DT46 Behavior used for +# implementing IPv4/IPv6 L3 VPN use cases. +# +# The current SRv6 code in the Linux kernel only implements SRv6 End.DT4 and +# End.DT6 Behaviors which can be used respectively to support IPv4-in-IPv6 and +# IPv6-in-IPv6 VPNs. With End.DT4 and End.DT6 it is not possible to create a +# single SRv6 VPN tunnel to carry both IPv4 and IPv6 traffic. +# The SRv6 End.DT46 Behavior implementation is meant to support the +# decapsulation of IPv4 and IPv6 traffic coming from a single SRv6 tunnel. +# Therefore, the SRv6 End.DT46 Behavior in the Linux kernel greatly simplifies +# the setup and operations of SRv6 VPNs. +# +# Hereafter a network diagram is shown, where two different tenants (named 100 +# and 200) offer IPv4/IPv6 L3 VPN services allowing hosts to communicate with +# each other across an IPv6 network. +# +# Only hosts belonging to the same tenant (and to the same VPN) can communicate +# with each other. Instead, the communication among hosts of different tenants +# is forbidden. +# In other words, hosts hs-t100-1 and hs-t100-2 are connected through the +# IPv4/IPv6 L3 VPN of tenant 100 while hs-t200-3 and hs-t200-4 are connected +# using the IPv4/IPv6 L3 VPN of tenant 200. Cross connection between tenant 100 +# and tenant 200 is forbidden and thus, for example, hs-t100-1 cannot reach +# hs-t200-3 and vice versa. +# +# Routers rt-1 and rt-2 implement IPv4/IPv6 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DT46 Behavior and c) VRF. +# +# To explain how an IPv4/IPv6 L3 VPN based on SRv6 works, let us briefly +# consider an example where, within the same domain of tenant 100, the host +# hs-t100-1 pings the host hs-t100-2. +# +# First of all, L2 reachability of the host hs-t100-2 is taken into account by +# the router rt-1 which acts as a arp/ndp proxy. +# +# When the host hs-t100-1 sends an IPv6 or IPv4 packet destined to hs-t100-2, +# the router rt-1 receives the packet on the internal veth-t100 interface. Such +# interface is enslaved to the VRF vrf-100 whose associated table contains the +# SRv6 Encap route for encapsulating any IPv6 or IPv4 packet in a IPv6 plus the +# Segment Routing Header (SRH) packet. This packet is sent through the (IPv6) +# core network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DT46 Behavior removes the outer IPv6+SRH headers and +# performs the lookup on the vrf-100 table using the destination address of +# the decapsulated IPv6 or IPv4 packet. Afterwards, the packet is sent to the +# host hs-t100-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the roles of rt-1 +# and rt-2 are swapped. +# +# Of course, the IPv4/IPv6 L3 VPN for tenant 200 works exactly as the IPv4/IPv6 +# L3 VPN for tenant 100. In this case, only hosts hs-t200-3 and hs-t200-4 are +# able to connect with each other. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-t100-1 netns | | hs-t100-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::1/64 | | | | cafe::2/64 | | +# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | cafe::254/64 | | | | cafe::254/64 | | +# | | 10.0.0.254/24 | +----------+ | | +----------+ | 10.0.0.254/24 | | +# | +-------+-------+ | localsid | | | | localsid | +-------+-------- | +# | | | table | | | | table | | | +# | +----+----+ +----------+ | | +----------+ +----+----+ | +# | | vrf-100 | | | | vrf-100 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | veth0 | | | | veth0 | | +# | | fd00::1/64 |.|...|.| fd00::2/64 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | vrf-200 | | | | vrf-200 | | +# | +----+----+ | | +----+----+ | +# | | | | | | +# | +-------+-------+ | | +-------+-------- | +# | | veth-t200 | | | | veth-t200 | | +# | | cafe::254/64 | | | | cafe::254/64 | | +# | | 10.0.0.254/24 | | | | 10.0.0.254/24 | | +# | +---------------+ rt-1 netns | | rt-2 netns +---------------- | +# | . | | . | +# +-----------------------------------+ +-----------------------------------+ +# . . +# . . +# . . +# . . +# +-------------------+ +-------------------+ +# | . | | . | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::3/64 | | | | cafe::4/64 | | +# | | 10.0.0.3/24 | | | | 10.0.0.4/24 | | +# | +-------------+ | | +-------------+ | +# | | | | +# | hs-t200-3 netns | | hs-t200-4 netns | +# | | | | +# +-------------------+ +-------------------+ +# +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table (table 90) +# +--------------------------------------------------+ +# |SID |Action | +# +--------------------------------------------------+ +# |fc00:21:100::6046|apply SRv6 End.DT46 vrftable 100| +# +--------------------------------------------------+ +# |fc00:21:200::6046|apply SRv6 End.DT46 vrftable 200| +# +--------------------------------------------------+ +# +# rt-1: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::2 |apply seg6 encap segs fc00:12:100::6046| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth-t100 | +# +---------------------------------------------------+ +# |10.0.0.2 |apply seg6 encap segs fc00:12:100::6046| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth-t100 | +# +---------------------------------------------------+ +# +# rt-1: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::4 |apply seg6 encap segs fc00:12:200::6046| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth-t200 | +# +---------------------------------------------------+ +# |10.0.0.4 |apply seg6 encap segs fc00:12:200::6046| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth-t200 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table (table 90) +# +--------------------------------------------------+ +# |SID |Action | +# +--------------------------------------------------+ +# |fc00:12:100::6046|apply SRv6 End.DT46 vrftable 100| +# +--------------------------------------------------+ +# |fc00:12:200::6046|apply SRv6 End.DT46 vrftable 200| +# +--------------------------------------------------+ +# +# rt-2: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::1 |apply seg6 encap segs fc00:21:100::6046| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth-t100 | +# +---------------------------------------------------+ +# |10.0.0.1 |apply seg6 encap segs fc00:21:100::6046| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth-t100 | +# +---------------------------------------------------+ +# +# rt-2: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::3 |apply seg6 encap segs fc00:21:200::6046| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth-t200 | +# +---------------------------------------------------+ +# |10.0.0.3 |apply seg6 encap segs fc00:21:200::6046| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth-t200 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly LOCALSID_TABLE_ID=90 +readonly IPv6_RT_NETWORK=fd00 +readonly IPv6_HS_NETWORK=cafe +readonly IPv4_HS_NETWORK=10.0.0 +readonly VPN_LOCATOR_SERVICE=fc00 +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-t${tid}-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip netns exec ${hsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${hsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad + ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0 + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + # configure the VRF for the tenant X on the router which is directly + # connected to the source host. + ip -netns ${rtname} link add vrf-${tid} type vrf table ${tid} + ip -netns ${rtname} link set vrf-${tid} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + # enslave the veth-tX interface to the vrf-X in the access router + ip -netns ${rtname} link set ${rtveth} master vrf-${tid} + ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::254/64 dev ${rtveth} nodad + ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.254/24 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1 + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1 + + # disable the rp_filter otherwise the kernel gets confused about how + # to route decap ipv4 packets. + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.rp_filter=0 + + ip netns exec ${rtname} sh -c "echo 1 > /proc/sys/net/vrf/strict_mode" +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local rtveth=veth-t${tid} + local vpn_sid=${VPN_LOCATOR_SERVICE}:${hssrc}${hsdst}:${tid}::6046 + + ip -netns ${rtsrc_name} -6 neigh add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth} + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 vrf vrf-${tid} \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 vrf vrf-${tid} \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 vrf vrf-${tid} \ + via fd00::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 table ${LOCALSID_TABLE_ID} \ + encap seg6local action End.DT46 vrftable ${tid} dev vrf-${tid} + + # all sids for VPNs start with a common locator which is fc00::/16. + # Routes for handling the SRv6 End.DT46 behavior instances are grouped + # together in the 'localsid' table. + # + # NOTE: added only once + if [ -z "$(ip -netns ${rtdst_name} -6 rule show | \ + grep "to ${VPN_LOCATOR_SERVICE}::/16 lookup ${LOCALSID_TABLE_ID}")" ]; then + ip -netns ${rtdst_name} -6 rule add \ + to ${VPN_LOCATOR_SERVICE}::/16 \ + lookup ${LOCALSID_TABLE_ID} prio 999 + fi + + # set default routes to unreachable for both ipv4 and ipv6 + ip -netns ${rtsrc_name} -6 route add unreachable default metric 4278198272 \ + vrf vrf-${tid} + + ip -netns ${rtsrc_name} -4 route add unreachable default metric 4278198272 \ + vrf vrf-${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 #args: host router tenant + setup_hs 2 2 100 + + # setup two hosts for the tenant 200 + # - host hs-3 is directly connected to the router rt-1; + # - host hs-4 is directly connected to the router rt-2. + setup_hs 3 1 200 + setup_hs 4 2 200 + + # setup the IPv4/IPv6 L3 VPN which connects the host hs-t100-1 and host + # hs-t100-2 within the same tenant 100. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 + + # setup the IPv4/IPv6 L3 VPN which connects the host hs-t200-3 and host + # hs-t200-4 within the same tenant 200. + setup_vpn_config 3 1 4 2 200 + setup_vpn_config 4 2 3 1 200 +} + +check_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + ip netns exec rt-${rtsrc} ping -c 1 -W 1 ${IPv6_RT_NETWORK}::${rtdst} \ + >/dev/null 2>&1 +} + +check_and_log_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + check_rt_connectivity ${rtsrc} ${rtdst} + log_test $? 0 "Routers connectivity: rt-${rtsrc} -> rt-${rtdst}" +} + +check_hs_ipv6_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-t${tid}-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1 +} + +check_hs_ipv4_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-t${tid}-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_ipv6_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "IPv6 Hosts connectivity: hs-t${tid}-${hssrc} -> hs-t${tid}-${hsdst} (tenant ${tid})" + + check_hs_ipv4_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "IPv4 Hosts connectivity: hs-t${tid}-${hssrc} -> hs-t${tid}-${hsdst} (tenant ${tid})" + +} + +check_and_log_hs_isolation() +{ + local hssrc=$1 + local tidsrc=$2 + local hsdst=$3 + local tiddst=$4 + + check_hs_ipv6_connectivity ${hssrc} ${hsdst} ${tidsrc} + # NOTE: ping should fail + log_test $? 1 "IPv6 Hosts isolation: hs-t${tidsrc}-${hssrc} -X-> hs-t${tiddst}-${hsdst}" + + check_hs_ipv4_connectivity ${hssrc} ${hsdst} ${tidsrc} + # NOTE: ping should fail + log_test $? 1 "IPv4 Hosts isolation: hs-t${tidsrc}-${hssrc} -X-> hs-t${tiddst}-${hsdst}" + +} + + +check_and_log_hs2gw_connectivity() +{ + local hssrc=$1 + local tid=$2 + + check_hs_ipv6_connectivity ${hssrc} 254 ${tid} + log_test $? 0 "IPv6 Hosts connectivity: hs-t${tid}-${hssrc} -> gw (tenant ${tid})" + + check_hs_ipv4_connectivity ${hssrc} 254 ${tid} + log_test $? 0 "IPv4 Hosts connectivity: hs-t${tid}-${hssrc} -> gw (tenant ${tid})" + +} + +router_tests() +{ + log_section "IPv6 routers connectivity test" + + check_and_log_rt_connectivity 1 2 + check_and_log_rt_connectivity 2 1 +} + +host2gateway_tests() +{ + log_section "IPv4/IPv6 connectivity test among hosts and gateway" + + check_and_log_hs2gw_connectivity 1 100 + check_and_log_hs2gw_connectivity 2 100 + + check_and_log_hs2gw_connectivity 3 200 + check_and_log_hs2gw_connectivity 4 200 +} + +host_vpn_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 + + check_and_log_hs_connectivity 3 4 200 + check_and_log_hs_connectivity 4 3 200 +} + +host_vpn_isolation_tests() +{ + local i + local j + local k + local tmp + local l1="1 2" + local l2="3 4" + local t1=100 + local t2=200 + + log_section "SRv6 VPN isolation test among hosts in different tentants" + + for k in 0 1; do + for i in ${l1}; do + for j in ${l2}; do + check_and_log_hs_isolation ${i} ${t1} ${j} ${t2} + done + done + + # let us test the reverse path + tmp="${l1}"; l1="${l2}"; l2="${tmp}" + tmp=${t1}; t1=${t2}; t2=${tmp} + done +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +modprobe vrf &>/dev/null +if [ ! -e /proc/sys/net/vrf/strict_mode ]; then + echo "SKIP: vrf sysctl does not exist" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +router_tests +host2gateway_tests +host_vpn_tests +host_vpn_isolation_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} diff --git a/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh index ad7a9fc59934..1003119773e5 100755 --- a/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh +++ b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh @@ -163,6 +163,9 @@ # +---------------------------------------------------+ # +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + readonly LOCALSID_TABLE_ID=90 readonly IPv6_RT_NETWORK=fd00 readonly IPv4_HS_NETWORK=10.0.0 @@ -464,18 +467,18 @@ host_vpn_isolation_tests() if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" - exit 0 + exit $ksft_skip fi if [ ! -x "$(command -v ip)" ]; then echo "SKIP: Could not run test without ip tool" - exit 0 + exit $ksft_skip fi modprobe vrf &>/dev/null if [ ! -e /proc/sys/net/vrf/strict_mode ]; then echo "SKIP: vrf sysctl does not exist" - exit 0 + exit $ksft_skip fi cleanup &>/dev/null diff --git a/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh index 68708f5e26a0..b9b06ef80d88 100755 --- a/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh +++ b/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh @@ -164,6 +164,9 @@ # +---------------------------------------------------+ # +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + readonly LOCALSID_TABLE_ID=90 readonly IPv6_RT_NETWORK=fd00 readonly IPv6_HS_NETWORK=cafe @@ -472,18 +475,18 @@ host_vpn_isolation_tests() if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" - exit 0 + exit $ksft_skip fi if [ ! -x "$(command -v ip)" ]; then echo "SKIP: Could not run test without ip tool" - exit 0 + exit $ksft_skip fi modprobe vrf &>/dev/null if [ ! -e /proc/sys/net/vrf/strict_mode ]; then echo "SKIP: vrf sysctl does not exist" - exit 0 + exit $ksft_skip fi cleanup &>/dev/null diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c index 21091be70688..aee631c5284e 100644 --- a/tools/testing/selftests/net/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c @@ -47,7 +47,7 @@ static void usage(const char *error) { if (error) printf("invalid option: %s\n", error); - printf("timestamping interface option*\n\n" + printf("timestamping <interface> [bind_phc_index] [option]*\n\n" "Options:\n" " IP_MULTICAST_LOOP - looping outgoing multicasts\n" " SO_TIMESTAMP - normal software time stamping, ms resolution\n" @@ -58,6 +58,7 @@ static void usage(const char *error) " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n" " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n" " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n" + " SOF_TIMESTAMPING_BIND_PHC - request to bind a PHC of PTP vclock\n" " SIOCGSTAMP - check last socket time stamp\n" " SIOCGSTAMPNS - more accurate socket time stamp\n" " PTPV2 - use PTPv2 messages\n"); @@ -311,7 +312,6 @@ static void recvpacket(int sock, int recvmsg_flags, int main(int argc, char **argv) { - int so_timestamping_flags = 0; int so_timestamp = 0; int so_timestampns = 0; int siocgstamp = 0; @@ -325,6 +325,8 @@ int main(int argc, char **argv) struct ifreq device; struct ifreq hwtstamp; struct hwtstamp_config hwconfig, hwconfig_requested; + struct so_timestamping so_timestamping_get = { 0, -1 }; + struct so_timestamping so_timestamping = { 0, -1 }; struct sockaddr_in addr; struct ip_mreq imr; struct in_addr iaddr; @@ -342,7 +344,12 @@ int main(int argc, char **argv) exit(1); } - for (i = 2; i < argc; i++) { + if (argc >= 3 && sscanf(argv[2], "%d", &so_timestamping.bind_phc) == 1) + val = 3; + else + val = 2; + + for (i = val; i < argc; i++) { if (!strcasecmp(argv[i], "SO_TIMESTAMP")) so_timestamp = 1; else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS")) @@ -356,17 +363,19 @@ int main(int argc, char **argv) else if (!strcasecmp(argv[i], "PTPV2")) ptpv2 = 1; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_TX_HARDWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_TX_SOFTWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_RX_HARDWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_RX_SOFTWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_SOFTWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE")) - so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE; + so_timestamping.flags |= SOF_TIMESTAMPING_RAW_HARDWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_BIND_PHC")) + so_timestamping.flags |= SOF_TIMESTAMPING_BIND_PHC; else usage(argv[i]); } @@ -385,10 +394,10 @@ int main(int argc, char **argv) hwtstamp.ifr_data = (void *)&hwconfig; memset(&hwconfig, 0, sizeof(hwconfig)); hwconfig.tx_type = - (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ? + (so_timestamping.flags & SOF_TIMESTAMPING_TX_HARDWARE) ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; hwconfig.rx_filter = - (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ? + (so_timestamping.flags & SOF_TIMESTAMPING_RX_HARDWARE) ? ptpv2 ? HWTSTAMP_FILTER_PTP_V2_L4_SYNC : HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE; hwconfig_requested = hwconfig; @@ -413,6 +422,9 @@ int main(int argc, char **argv) sizeof(struct sockaddr_in)) < 0) bail("bind"); + if (setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, interface, if_len)) + bail("bind device"); + /* set multicast group for outgoing packets */ inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */ addr.sin_addr = iaddr; @@ -444,10 +456,9 @@ int main(int argc, char **argv) &enabled, sizeof(enabled)) < 0) bail("setsockopt SO_TIMESTAMPNS"); - if (so_timestamping_flags && - setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, - &so_timestamping_flags, - sizeof(so_timestamping_flags)) < 0) + if (so_timestamping.flags && + setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping, + sizeof(so_timestamping)) < 0) bail("setsockopt SO_TIMESTAMPING"); /* request IP_PKTINFO for debugging purposes */ @@ -468,14 +479,18 @@ int main(int argc, char **argv) else printf("SO_TIMESTAMPNS %d\n", val); - if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) { + len = sizeof(so_timestamping_get); + if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping_get, + &len) < 0) { printf("%s: %s\n", "getsockopt SO_TIMESTAMPING", strerror(errno)); } else { - printf("SO_TIMESTAMPING %d\n", val); - if (val != so_timestamping_flags) - printf(" not the expected value %d\n", - so_timestamping_flags); + printf("SO_TIMESTAMPING flags %d, bind phc %d\n", + so_timestamping_get.flags, so_timestamping_get.bind_phc); + if (so_timestamping_get.flags != so_timestamping.flags || + so_timestamping_get.bind_phc != so_timestamping.bind_phc) + printf(" not expected, flags %d, bind phc %d\n", + so_timestamping.flags, so_timestamping.bind_phc); } /* send packets forever every five seconds */ diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 426d07875a48..97fceb9be9ed 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -25,6 +25,47 @@ #define TLS_PAYLOAD_MAX_LEN 16384 #define SOL_TLS 282 +struct tls_crypto_info_keys { + union { + struct tls12_crypto_info_aes_gcm_128 aes128; + struct tls12_crypto_info_chacha20_poly1305 chacha20; + }; + size_t len; +}; + +static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type, + struct tls_crypto_info_keys *tls12) +{ + memset(tls12, 0, sizeof(*tls12)); + + switch (cipher_type) { + case TLS_CIPHER_CHACHA20_POLY1305: + tls12->len = sizeof(struct tls12_crypto_info_chacha20_poly1305); + tls12->chacha20.info.version = tls_version; + tls12->chacha20.info.cipher_type = cipher_type; + break; + case TLS_CIPHER_AES_GCM_128: + tls12->len = sizeof(struct tls12_crypto_info_aes_gcm_128); + tls12->aes128.info.version = tls_version; + tls12->aes128.info.cipher_type = cipher_type; + break; + default: + break; + } +} + +static void memrnd(void *s, size_t n) +{ + int *dword = s; + char *byte; + + for (; n >= 4; n -= 4) + *dword++ = rand(); + byte = (void *)dword; + while (n--) + *byte++ = rand(); +} + FIXTURE(tls_basic) { int fd, cfd; @@ -133,33 +174,16 @@ FIXTURE_VARIANT_ADD(tls, 13_chacha) FIXTURE_SETUP(tls) { - union { - struct tls12_crypto_info_aes_gcm_128 aes128; - struct tls12_crypto_info_chacha20_poly1305 chacha20; - } tls12; + struct tls_crypto_info_keys tls12; struct sockaddr_in addr; socklen_t len; int sfd, ret; - size_t tls12_sz; self->notls = false; len = sizeof(addr); - memset(&tls12, 0, sizeof(tls12)); - switch (variant->cipher_type) { - case TLS_CIPHER_CHACHA20_POLY1305: - tls12_sz = sizeof(struct tls12_crypto_info_chacha20_poly1305); - tls12.chacha20.info.version = variant->tls_version; - tls12.chacha20.info.cipher_type = variant->cipher_type; - break; - case TLS_CIPHER_AES_GCM_128: - tls12_sz = sizeof(struct tls12_crypto_info_aes_gcm_128); - tls12.aes128.info.version = variant->tls_version; - tls12.aes128.info.cipher_type = variant->cipher_type; - break; - default: - tls12_sz = 0; - } + tls_crypto_info_init(variant->tls_version, variant->cipher_type, + &tls12); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); @@ -187,7 +211,7 @@ FIXTURE_SETUP(tls) if (!self->notls) { ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, - tls12_sz); + tls12.len); ASSERT_EQ(ret, 0); } @@ -200,7 +224,7 @@ FIXTURE_SETUP(tls) ASSERT_EQ(ret, 0); ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, - tls12_sz); + tls12.len); ASSERT_EQ(ret, 0); } @@ -308,6 +332,8 @@ TEST_F(tls, recv_max) char recv_mem[TLS_PAYLOAD_MAX_LEN]; char buf[TLS_PAYLOAD_MAX_LEN]; + memrnd(buf, sizeof(buf)); + EXPECT_GE(send(self->fd, buf, send_len, 0), 0); EXPECT_NE(recv(self->cfd, recv_mem, send_len, 0), -1); EXPECT_EQ(memcmp(buf, recv_mem, send_len), 0); @@ -418,8 +444,9 @@ TEST_F(tls, sendmsg_large) EXPECT_EQ(sendmsg(self->cfd, &msg, 0), send_len); } - while (recvs++ < sends) + while (recvs++ < sends) { EXPECT_NE(recv(self->fd, mem, send_len, 0), -1); + } free(mem); } @@ -588,6 +615,8 @@ TEST_F(tls, recvmsg_single_max) struct iovec vec; struct msghdr hdr; + memrnd(send_mem, sizeof(send_mem)); + EXPECT_EQ(send(self->fd, send_mem, send_len, 0), send_len); vec.iov_base = (char *)recv_mem; vec.iov_len = TLS_PAYLOAD_MAX_LEN; @@ -610,6 +639,8 @@ TEST_F(tls, recvmsg_multiple) struct msghdr hdr; int i; + memrnd(buf, sizeof(buf)); + EXPECT_EQ(send(self->fd, buf, send_len, 0), send_len); for (i = 0; i < msg_iovlen; i++) { iov_base[i] = (char *)malloc(iov_len); @@ -634,6 +665,8 @@ TEST_F(tls, single_send_multiple_recv) char send_mem[TLS_PAYLOAD_MAX_LEN * 2]; char recv_mem[TLS_PAYLOAD_MAX_LEN * 2]; + memrnd(send_mem, sizeof(send_mem)); + EXPECT_GE(send(self->fd, send_mem, total_len, 0), 0); memset(recv_mem, 0, total_len); @@ -834,18 +867,17 @@ TEST_F(tls, bidir) int ret; if (!self->notls) { - struct tls12_crypto_info_aes_gcm_128 tls12; + struct tls_crypto_info_keys tls12; - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = variant->tls_version; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; + tls_crypto_info_init(variant->tls_version, variant->cipher_type, + &tls12); ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); + tls12.len); ASSERT_EQ(ret, 0); ret = setsockopt(self->cfd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); + tls12.len); ASSERT_EQ(ret, 0); } diff --git a/tools/testing/selftests/net/toeplitz.c b/tools/testing/selftests/net/toeplitz.c new file mode 100644 index 000000000000..710ac956bdb3 --- /dev/null +++ b/tools/testing/selftests/net/toeplitz.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Toeplitz test + * + * 1. Read packets and their rx_hash using PF_PACKET/TPACKET_V3 + * 2. Compute the rx_hash in software based on the packet contents + * 3. Compare the two + * + * Optionally, either '-C $rx_irq_cpu_list' or '-r $rps_bitmap' may be given. + * + * If '-C $rx_irq_cpu_list' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. Compute the rxqueue that RSS would select based on this rx_hash + * 6. Using the $rx_irq_cpu_list map, identify the arriving cpu based on rxq irq + * 7. Compare the cpus from 4 and 6 + * + * Else if '-r $rps_bitmap' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. Compute the cpu that RPS should select based on rx_hash and $rps_bitmap + * 6. Compare the cpus from 4 and 5 + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/filter.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <net/if.h> +#include <netdb.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <poll.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/sysinfo.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#define TOEPLITZ_KEY_MIN_LEN 40 +#define TOEPLITZ_KEY_MAX_LEN 60 + +#define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ +#define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) +#define TOEPLITZ_STR_MAX_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MAX_LEN) + +#define FOUR_TUPLE_MAX_LEN ((sizeof(struct in6_addr) * 2) + (sizeof(uint16_t) * 2)) + +#define RSS_MAX_CPUS (1 << 16) /* real constraint is PACKET_FANOUT_MAX */ + +#define RPS_MAX_CPUS 16UL /* must be a power of 2 */ + +/* configuration options (cmdline arguments) */ +static uint16_t cfg_dport = 8000; +static int cfg_family = AF_INET6; +static char *cfg_ifname = "eth0"; +static int cfg_num_queues; +static int cfg_num_rps_cpus; +static bool cfg_sink; +static int cfg_type = SOCK_STREAM; +static int cfg_timeout_msec = 1000; +static bool cfg_verbose; + +/* global vars */ +static int num_cpus; +static int ring_block_nr; +static int ring_block_sz; + +/* stats */ +static int frames_received; +static int frames_nohash; +static int frames_error; + +#define log_verbose(args...) do { if (cfg_verbose) fprintf(stderr, args); } while (0) + +/* tpacket ring */ +struct ring_state { + int fd; + char *mmap; + int idx; + int cpu; +}; + +static unsigned int rx_irq_cpus[RSS_MAX_CPUS]; /* map from rxq to cpu */ +static int rps_silo_to_cpu[RPS_MAX_CPUS]; +static unsigned char toeplitz_key[TOEPLITZ_KEY_MAX_LEN]; +static struct ring_state rings[RSS_MAX_CPUS]; + +static inline uint32_t toeplitz(const unsigned char *four_tuple, + const unsigned char *key) +{ + int i, bit, ret = 0; + uint32_t key32; + + key32 = ntohl(*((uint32_t *)key)); + key += 4; + + for (i = 0; i < FOUR_TUPLE_MAX_LEN; i++) { + for (bit = 7; bit >= 0; bit--) { + if (four_tuple[i] & (1 << bit)) + ret ^= key32; + + key32 <<= 1; + key32 |= !!(key[0] & (1 << bit)); + } + key++; + } + + return ret; +} + +/* Compare computed cpu with arrival cpu from packet_fanout_cpu */ +static void verify_rss(uint32_t rx_hash, int cpu) +{ + int queue = rx_hash % cfg_num_queues; + + log_verbose(" rxq %d (cpu %d)", queue, rx_irq_cpus[queue]); + if (rx_irq_cpus[queue] != cpu) { + log_verbose(". error: rss cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void verify_rps(uint64_t rx_hash, int cpu) +{ + int silo = (rx_hash * cfg_num_rps_cpus) >> 32; + + log_verbose(" silo %d (cpu %d)", silo, rps_silo_to_cpu[silo]); + if (rps_silo_to_cpu[silo] != cpu) { + log_verbose(". error: rps cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void log_rxhash(int cpu, uint32_t rx_hash, + const char *addrs, int addr_len) +{ + char saddr[INET6_ADDRSTRLEN], daddr[INET6_ADDRSTRLEN]; + uint16_t *ports; + + if (!inet_ntop(cfg_family, addrs, saddr, sizeof(saddr)) || + !inet_ntop(cfg_family, addrs + addr_len, daddr, sizeof(daddr))) + error(1, 0, "address parse error"); + + ports = (void *)addrs + (addr_len * 2); + log_verbose("cpu %d: rx_hash 0x%08x [saddr %s daddr %s sport %02hu dport %02hu]", + cpu, rx_hash, saddr, daddr, + ntohs(ports[0]), ntohs(ports[1])); +} + +/* Compare computed rxhash with rxhash received from tpacket_v3 */ +static void verify_rxhash(const char *pkt, uint32_t rx_hash, int cpu) +{ + unsigned char four_tuple[FOUR_TUPLE_MAX_LEN] = {0}; + uint32_t rx_hash_sw; + const char *addrs; + int addr_len; + + if (cfg_family == AF_INET) { + addr_len = sizeof(struct in_addr); + addrs = pkt + offsetof(struct iphdr, saddr); + } else { + addr_len = sizeof(struct in6_addr); + addrs = pkt + offsetof(struct ip6_hdr, ip6_src); + } + + memcpy(four_tuple, addrs, (addr_len * 2) + (sizeof(uint16_t) * 2)); + rx_hash_sw = toeplitz(four_tuple, toeplitz_key); + + if (cfg_verbose) + log_rxhash(cpu, rx_hash, addrs, addr_len); + + if (rx_hash != rx_hash_sw) { + log_verbose(" != expected 0x%x\n", rx_hash_sw); + frames_error++; + return; + } + + log_verbose(" OK"); + if (cfg_num_queues) + verify_rss(rx_hash, cpu); + else if (cfg_num_rps_cpus) + verify_rps(rx_hash, cpu); + log_verbose("\n"); +} + +static char *recv_frame(const struct ring_state *ring, char *frame) +{ + struct tpacket3_hdr *hdr = (void *)frame; + + if (hdr->hv1.tp_rxhash) + verify_rxhash(frame + hdr->tp_net, hdr->hv1.tp_rxhash, + ring->cpu); + else + frames_nohash++; + + return frame + hdr->tp_next_offset; +} + +/* A single TPACKET_V3 block can hold multiple frames */ +static void recv_block(struct ring_state *ring) +{ + struct tpacket_block_desc *block; + char *frame; + int i; + + block = (void *)(ring->mmap + ring->idx * ring_block_sz); + if (!(block->hdr.bh1.block_status & TP_STATUS_USER)) + return; + + frame = (char *)block; + frame += block->hdr.bh1.offset_to_first_pkt; + + for (i = 0; i < block->hdr.bh1.num_pkts; i++) { + frame = recv_frame(ring, frame); + frames_received++; + } + + block->hdr.bh1.block_status = TP_STATUS_KERNEL; + ring->idx = (ring->idx + 1) % ring_block_nr; +} + +/* simple test: sleep once unconditionally and then process all rings */ +static void process_rings(void) +{ + int i; + + usleep(1000 * cfg_timeout_msec); + + for (i = 0; i < num_cpus; i++) + recv_block(&rings[i]); + + fprintf(stderr, "count: pass=%u nohash=%u fail=%u\n", + frames_received - frames_nohash - frames_error, + frames_nohash, frames_error); +} + +static char *setup_ring(int fd) +{ + struct tpacket_req3 req3 = {0}; + void *ring; + + req3.tp_retire_blk_tov = cfg_timeout_msec; + req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; + + req3.tp_frame_size = 2048; + req3.tp_frame_nr = 1 << 10; + req3.tp_block_nr = 2; + + req3.tp_block_size = req3.tp_frame_size * req3.tp_frame_nr; + req3.tp_block_size /= req3.tp_block_nr; + + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3))) + error(1, errno, "setsockopt PACKET_RX_RING"); + + ring_block_sz = req3.tp_block_size; + ring_block_nr = req3.tp_block_nr; + + ring = mmap(0, req3.tp_block_size * req3.tp_block_nr, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, fd, 0); + if (ring == MAP_FAILED) + error(1, 0, "mmap failed"); + + return ring; +} + +static void __set_filter(int fd, int off_proto, uint8_t proto, int off_dport) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_proto), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, proto, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_dport, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +/* filter on transport protocol and destination port */ +static void set_filter(int fd) +{ + const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ + uint8_t proto; + + proto = cfg_type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + if (cfg_family == AF_INET) + __set_filter(fd, offsetof(struct iphdr, protocol), proto, + sizeof(struct iphdr) + off_dport); + else + __set_filter(fd, offsetof(struct ip6_hdr, ip6_nxt), proto, + sizeof(struct ip6_hdr) + off_dport); +} + +/* drop everything: used temporarily during setup */ +static void set_filter_null(int fd) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +static int create_ring(char **ring) +{ + struct fanout_args args = { + .id = 1, + .type_flags = PACKET_FANOUT_CPU, + .max_num_members = RSS_MAX_CPUS + }; + struct sockaddr_ll ll = { 0 }; + int fd, val; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket creation failed"); + + val = TPACKET_V3; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt PACKET_VERSION"); + *ring = setup_ring(fd); + + /* block packets until all rings are added to the fanout group: + * else packets can arrive during setup and get misclassified + */ + set_filter_null(fd); + + ll.sll_family = AF_PACKET; + ll.sll_ifindex = if_nametoindex(cfg_ifname); + ll.sll_protocol = cfg_family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + if (bind(fd, (void *)&ll, sizeof(ll))) + error(1, errno, "bind"); + + /* must come after bind: verifies all programs in group match */ + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &args, sizeof(args))) { + /* on failure, retry using old API if that is sufficient: + * it has a hard limit of 256 sockets, so only try if + * (a) only testing rxhash, not RSS or (b) <= 256 cpus. + * in this API, the third argument is left implicit. + */ + if (cfg_num_queues || num_cpus > 256 || + setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + &args, sizeof(uint32_t))) + error(1, errno, "setsockopt PACKET_FANOUT cpu"); + } + + return fd; +} + +/* setup inet(6) socket to blackhole the test traffic, if arg '-s' */ +static int setup_sink(void) +{ + int fd, val; + + fd = socket(cfg_family, cfg_type, 0); + if (fd == -1) + error(1, errno, "socket %d.%d", cfg_family, cfg_type); + + val = 1 << 20; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val))) + error(1, errno, "setsockopt rcvbuf"); + + return fd; +} + +static void setup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + rings[i].cpu = i; + rings[i].fd = create_ring(&rings[i].mmap); + } + + /* accept packets once all rings in the fanout group are up */ + for (i = 0; i < num_cpus; i++) + set_filter(rings[i].fd); +} + +static void cleanup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + if (munmap(rings[i].mmap, ring_block_nr * ring_block_sz)) + error(1, errno, "munmap"); + if (close(rings[i].fd)) + error(1, errno, "close"); + } +} + +static void parse_cpulist(const char *arg) +{ + do { + rx_irq_cpus[cfg_num_queues++] = strtol(arg, NULL, 10); + + arg = strchr(arg, ','); + if (!arg) + break; + arg++; // skip ',' + } while (1); +} + +static void show_cpulist(void) +{ + int i; + + for (i = 0; i < cfg_num_queues; i++) + fprintf(stderr, "rxq %d: cpu %d\n", i, rx_irq_cpus[i]); +} + +static void show_silos(void) +{ + int i; + + for (i = 0; i < cfg_num_rps_cpus; i++) + fprintf(stderr, "silo %d: cpu %d\n", i, rps_silo_to_cpu[i]); +} + +static void parse_toeplitz_key(const char *str, int slen, unsigned char *key) +{ + int i, ret, off; + + if (slen < TOEPLITZ_STR_MIN_LEN || + slen > TOEPLITZ_STR_MAX_LEN + 1) + error(1, 0, "invalid toeplitz key"); + + for (i = 0, off = 0; off < slen; i++, off += 3) { + ret = sscanf(str + off, "%hhx", &key[i]); + if (ret != 1) + error(1, 0, "key parse error at %d off %d len %d", + i, off, slen); + } +} + +static void parse_rps_bitmap(const char *arg) +{ + unsigned long bitmap; + int i; + + bitmap = strtoul(arg, NULL, 0); + + if (bitmap & ~(RPS_MAX_CPUS - 1)) + error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu", + bitmap, RPS_MAX_CPUS - 1); + + for (i = 0; i < RPS_MAX_CPUS; i++) + if (bitmap & 1UL << i) + rps_silo_to_cpu[cfg_num_rps_cpus++] = i; +} + +static void parse_opts(int argc, char **argv) +{ + static struct option long_options[] = { + {"dport", required_argument, 0, 'd'}, + {"cpus", required_argument, 0, 'C'}, + {"key", required_argument, 0, 'k'}, + {"iface", required_argument, 0, 'i'}, + {"ipv4", no_argument, 0, '4'}, + {"ipv6", no_argument, 0, '6'}, + {"sink", no_argument, 0, 's'}, + {"tcp", no_argument, 0, 't'}, + {"timeout", required_argument, 0, 'T'}, + {"udp", no_argument, 0, 'u'}, + {"verbose", no_argument, 0, 'v'}, + {"rps", required_argument, 0, 'r'}, + {0, 0, 0, 0} + }; + bool have_toeplitz = false; + int index, c; + + while ((c = getopt_long(argc, argv, "46C:d:i:k:r:stT:u:v", long_options, &index)) != -1) { + switch (c) { + case '4': + cfg_family = AF_INET; + break; + case '6': + cfg_family = AF_INET6; + break; + case 'C': + parse_cpulist(optarg); + break; + case 'd': + cfg_dport = strtol(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'k': + parse_toeplitz_key(optarg, strlen(optarg), + toeplitz_key); + have_toeplitz = true; + break; + case 'r': + parse_rps_bitmap(optarg); + break; + case 's': + cfg_sink = true; + break; + case 't': + cfg_type = SOCK_STREAM; + break; + case 'T': + cfg_timeout_msec = strtol(optarg, NULL, 0); + break; + case 'u': + cfg_type = SOCK_DGRAM; + break; + case 'v': + cfg_verbose = true; + break; + + default: + error(1, 0, "unknown option %c", optopt); + break; + } + } + + if (!have_toeplitz) + error(1, 0, "Must supply rss key ('-k')"); + + num_cpus = get_nprocs(); + if (num_cpus > RSS_MAX_CPUS) + error(1, 0, "increase RSS_MAX_CPUS"); + + if (cfg_num_queues && cfg_num_rps_cpus) + error(1, 0, + "Can't supply both RSS cpus ('-C') and RPS map ('-r')"); + if (cfg_verbose) { + show_cpulist(); + show_silos(); + } +} + +int main(int argc, char **argv) +{ + const int min_tests = 10; + int fd_sink = -1; + + parse_opts(argc, argv); + + if (cfg_sink) + fd_sink = setup_sink(); + + setup_rings(); + process_rings(); + cleanup_rings(); + + if (cfg_sink && close(fd_sink)) + error(1, errno, "close sink"); + + if (frames_received - frames_nohash < min_tests) + error(1, 0, "too few frames for verification"); + + return frames_error; +} diff --git a/tools/testing/selftests/net/toeplitz.sh b/tools/testing/selftests/net/toeplitz.sh new file mode 100755 index 000000000000..0a49907cd4fe --- /dev/null +++ b/tools/testing/selftests/net/toeplitz.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# extended toeplitz test: test rxhash plus, optionally, either (1) rss mapping +# from rxhash to rx queue ('-rss') or (2) rps mapping from rxhash to cpu +# ('-rps <rps_map>') +# +# irq-pattern-prefix can be derived from /sys/kernel/irq/*/action, +# which is a driver-specific encoding. +# +# invoke as ./toeplitz.sh (-i <iface>) -u|-t -4|-6 \ +# [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)] + +source setup_loopback.sh +readonly SERVER_IP4="192.168.1.200/24" +readonly SERVER_IP6="fda8::1/64" +readonly SERVER_MAC="aa:00:00:00:00:02" + +readonly CLIENT_IP4="192.168.1.100/24" +readonly CLIENT_IP6="fda8::2/64" +readonly CLIENT_MAC="aa:00:00:00:00:01" + +PORT=8000 +KEY="$(</proc/sys/net/core/netdev_rss_key)" +TEST_RSS=false +RPS_MAP="" +PROTO_FLAG="" +IP_FLAG="" +DEV="eth0" + +# Return the number of rxqs among which RSS is configured to spread packets. +# This is determined by reading the RSS indirection table using ethtool. +get_rss_cfg_num_rxqs() { + echo $(ethtool -x "${DEV}" | + egrep [[:space:]]+[0-9]+:[[:space:]]+ | + cut -d: -f2- | + awk '{$1=$1};1' | + tr ' ' '\n' | + sort -u | + wc -l) +} + +# Return a list of the receive irq handler cpus. +# The list is ordered by the irqs, so first rxq-0 cpu, then rxq-1 cpu, etc. +# Reads /sys/kernel/irq/ in order, so algorithm depends on +# irq_{rxq-0} < irq_{rxq-1}, etc. +get_rx_irq_cpus() { + CPUS="" + # sort so that irq 2 is read before irq 10 + SORTED_IRQS=$(for i in /sys/kernel/irq/*; do echo $i; done | sort -V) + # Consider only as many queues as RSS actually uses. We assume that + # if RSS_CFG_NUM_RXQS=N, then RSS uses rxqs 0-(N-1). + RSS_CFG_NUM_RXQS=$(get_rss_cfg_num_rxqs) + RXQ_COUNT=0 + + for i in ${SORTED_IRQS} + do + [[ "${RXQ_COUNT}" -lt "${RSS_CFG_NUM_RXQS}" ]] || break + # lookup relevant IRQs by action name + [[ -e "$i/actions" ]] || continue + cat "$i/actions" | grep -q "${IRQ_PATTERN}" || continue + irqname=$(<"$i/actions") + + # does the IRQ get called + irqcount=$(cat "$i/per_cpu_count" | tr -d '0,') + [[ -n "${irqcount}" ]] || continue + + # lookup CPU + irq=$(basename "$i") + cpu=$(cat "/proc/irq/$irq/smp_affinity_list") + + if [[ -z "${CPUS}" ]]; then + CPUS="${cpu}" + else + CPUS="${CPUS},${cpu}" + fi + RXQ_COUNT=$((RXQ_COUNT+1)) + done + + echo "${CPUS}" +} + +get_disable_rfs_cmd() { + echo "echo 0 > /proc/sys/net/core/rps_sock_flow_entries;" +} + +get_set_rps_bitmaps_cmd() { + CMD="" + for i in /sys/class/net/${DEV}/queues/rx-*/rps_cpus + do + CMD="${CMD} echo $1 > ${i};" + done + + echo "${CMD}" +} + +get_disable_rps_cmd() { + echo "$(get_set_rps_bitmaps_cmd 0)" +} + +die() { + echo "$1" + exit 1 +} + +check_nic_rxhash_enabled() { + local -r pattern="receive-hashing:\ on" + + ethtool -k "${DEV}" | grep -q "${pattern}" || die "rxhash must be enabled" +} + +parse_opts() { + local prog=$0 + shift 1 + + while [[ "$1" =~ "-" ]]; do + if [[ "$1" = "-irq_prefix" ]]; then + shift + IRQ_PATTERN="^$1-[0-9]*$" + elif [[ "$1" = "-u" || "$1" = "-t" ]]; then + PROTO_FLAG="$1" + elif [[ "$1" = "-4" ]]; then + IP_FLAG="$1" + SERVER_IP="${SERVER_IP4}" + CLIENT_IP="${CLIENT_IP4}" + elif [[ "$1" = "-6" ]]; then + IP_FLAG="$1" + SERVER_IP="${SERVER_IP6}" + CLIENT_IP="${CLIENT_IP6}" + elif [[ "$1" = "-rss" ]]; then + TEST_RSS=true + elif [[ "$1" = "-rps" ]]; then + shift + RPS_MAP="$1" + elif [[ "$1" = "-i" ]]; then + shift + DEV="$1" + else + die "Usage: ${prog} (-i <iface>) -u|-t -4|-6 \ + [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)]" + fi + shift + done +} + +setup() { + setup_loopback_environment "${DEV}" + + # Set up server_ns namespace and client_ns namespace + setup_macvlan_ns "${DEV}" server_ns server \ + "${SERVER_MAC}" "${SERVER_IP}" + setup_macvlan_ns "${DEV}" client_ns client \ + "${CLIENT_MAC}" "${CLIENT_IP}" +} + +cleanup() { + cleanup_macvlan_ns server_ns server client_ns client + cleanup_loopback "${DEV}" +} + +parse_opts $0 $@ + +setup +trap cleanup EXIT + +check_nic_rxhash_enabled + +# Actual test starts here +if [[ "${TEST_RSS}" = true ]]; then + # RPS/RFS must be disabled because they move packets between cpus, + # which breaks the PACKET_FANOUT_CPU identification of RSS decisions. + eval "$(get_disable_rfs_cmd) $(get_disable_rps_cmd)" \ + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ + -C "$(get_rx_irq_cpus)" -s -v & +elif [[ ! -z "${RPS_MAP}" ]]; then + eval "$(get_disable_rfs_cmd) $(get_set_rps_bitmaps_cmd ${RPS_MAP})" \ + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ + -r "0x${RPS_MAP}" -s -v & +else + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 -s -v & +fi + +server_pid=$! + +ip netns exec client_ns ./toeplitz_client.sh "${PROTO_FLAG}" \ + "${IP_FLAG}" "${SERVER_IP%%/*}" "${PORT}" & + +client_pid=$! + +wait "${server_pid}" +exit_code=$? +kill -9 "${client_pid}" +if [[ "${exit_code}" -eq 0 ]]; then + echo "Test Succeeded!" +fi +exit "${exit_code}" diff --git a/tools/testing/selftests/net/toeplitz_client.sh b/tools/testing/selftests/net/toeplitz_client.sh new file mode 100755 index 000000000000..2fef34f4aba1 --- /dev/null +++ b/tools/testing/selftests/net/toeplitz_client.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# A simple program for generating traffic for the toeplitz test. +# +# This program sends packets periodically for, conservatively, 20 seconds. The +# intent is for the calling program to kill this program once it is no longer +# needed, rather than waiting for the 20 second expiration. + +send_traffic() { + expiration=$((SECONDS+20)) + while [[ "${SECONDS}" -lt "${expiration}" ]] + do + if [[ "${PROTO}" == "-u" ]]; then + echo "msg $i" | nc "${IPVER}" -u -w 0 "${ADDR}" "${PORT}" + else + echo "msg $i" | nc "${IPVER}" -w 0 "${ADDR}" "${PORT}" + fi + sleep 0.001 + done +} + +PROTO=$1 +IPVER=$2 +ADDR=$3 +PORT=$4 + +send_traffic diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh index dbf0421986df..2d10ccac898a 100755 --- a/tools/testing/selftests/net/unicast_extensions.sh +++ b/tools/testing/selftests/net/unicast_extensions.sh @@ -28,12 +28,15 @@ # These tests provide an easy way to flip the expected result of any # of these behaviors for testing kernel patches that change them. +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + # nettest can be run from PATH or from same directory as this selftest if ! which nettest >/dev/null; then PATH=$PWD:$PATH if ! which nettest >/dev/null; then echo "'nettest' command not found; skipping tests" - exit 0 + exit $ksft_skip fi fi @@ -189,6 +192,15 @@ segmenttest 255.255.255.1 255.255.255.254 24 "assign and ping inside 255.255.255 route_test 240.5.6.7 240.5.6.1 255.1.2.1 255.1.2.3 24 "route between 240.5.6/24 and 255.1.2/24 (is allowed)" route_test 0.200.6.7 0.200.38.1 245.99.101.1 245.99.200.111 16 "route between 0.200/16 and 245.99/16 (is allowed)" # +# Test support for lowest address ending in .0 +segmenttest 5.10.15.20 5.10.15.0 24 "assign and ping lowest address (/24)" +# +# Test support for lowest address not ending in .0 +segmenttest 192.168.101.192 192.168.101.193 26 "assign and ping lowest address (/26)" +# +# Routing using lowest address as a gateway/endpoint +route_test 192.168.42.1 192.168.42.0 9.8.7.6 9.8.7.0 24 "routing using lowest address" +# # ============================================== # ==== TESTS THAT CURRENTLY EXPECT FAILURE ===== # ============================================== @@ -202,14 +214,6 @@ segmenttest 255.255.255.1 255.255.255.255 16 "assigning 255.255.255.255 (is forb # Currently Linux does not allow this, so this should fail too segmenttest 127.99.4.5 127.99.4.6 16 "assign and ping inside 127/8 (is forbidden)" # -# Test support for lowest address -# Currently Linux does not allow this, so this should fail too -segmenttest 5.10.15.20 5.10.15.0 24 "assign and ping lowest address (is forbidden)" -# -# Routing using lowest address as a gateway/endpoint -# Currently Linux does not allow this, so this should fail too -route_test 192.168.42.1 192.168.42.0 9.8.7.6 9.8.7.0 24 "routing using lowest address (is forbidden)" -# # Test support for unicast use of class D # Currently Linux does not allow this, so this should fail too segmenttest 225.1.2.3 225.1.2.200 24 "assign and ping class D address (is forbidden)" diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 11d7cdb898c0..19eac3e44c06 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -13,7 +13,7 @@ readonly NS_DST=$BASE$DST readonly BM_NET_V4=192.168.1. readonly BM_NET_V6=2001:db8:: -readonly NPROCS=`nproc` +readonly CPUS=`nproc` ret=0 cleanup() { @@ -75,6 +75,29 @@ chk_tso_flag() { __chk_flag "$1" $2 $3 tcp-segmentation-offload } +chk_channels() { + local msg="$1" + local target=$2 + local rx=$3 + local tx=$4 + + local dev=veth$target + + local cur_rx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep RX: | tail -n 1 | awk '{print $2}' ` + local cur_tx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep TX: | tail -n 1 | awk '{print $2}'` + local cur_combined=`ip netns exec $BASE$target ethtool -l $dev |\ + grep Combined: | tail -n 1 | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$cur_rx" = "$rx" -a "$cur_tx" = "$tx" -a "$cur_combined" = "n/a" ]; then + echo " ok " + else + echo " fail rx:$rx:$cur_rx tx:$tx:$cur_tx combined:n/a:$cur_combined" + fi +} + chk_gro() { local msg="$1" local expected=$2 @@ -107,11 +130,100 @@ chk_gro() { fi } +__change_channels() +{ + local cur_cpu + local end=$1 + local cur + local i + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + for i in `seq 1 $CPUS`; do + ip netns exec $NS_SRC ethtool -L veth$SRC rx $i tx $i + ip netns exec $NS_DST ethtool -L veth$DST rx $i tx $i + done + + for i in `seq 1 $((CPUS - 1))`; do + cur_cpu=$((CPUS - $i)) + ip netns exec $NS_SRC ethtool -L veth$SRC rx $cur_cpu tx $cur_cpu + ip netns exec $NS_DST ethtool -L veth$DST rx $cur_cpu tx $cur_cpu + done + done +} + +__send_data() { + local end=$1 + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 1000 -M 300 -D $BM_NET_V4$DST + done +} + +do_stress() { + local end + printf -v end '%(%s)T' + end=$((end + $STRESS)) + + ip netns exec $NS_SRC ethtool -L veth$SRC rx 3 tx 3 + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + + ip netns exec $NS_DST ./udpgso_bench_rx & + local rx_pid=$! + + echo "Running stress test for $STRESS seconds..." + __change_channels $end & + local ch_pid=$! + __send_data $end & + local data_pid_1=$! + __send_data $end & + local data_pid_2=$! + __send_data $end & + local data_pid_3=$! + __send_data $end & + local data_pid_4=$! + + wait $ch_pid $data_pid_1 $data_pid_2 $data_pid_3 $data_pid_4 + kill -9 $rx_pid + echo "done" + + # restore previous setting + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 tx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 tx 1 +} + +usage() { + echo "Usage: $0 [-h] [-s <seconds>]" + echo -e "\t-h: show this help" + echo -e "\t-s: run optional stress tests for the given amount of seconds" +} + +STRESS=0 +while getopts "hs:" option; do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "s") + STRESS=$OPTARG + ;; + esac +done + if [ ! -f ../bpf/xdp_dummy.o ]; then echo "Missing xdp_dummy helper. Build bpf selftest first" exit 1 fi +[ $CPUS -lt 2 ] && echo "Only one CPU available, some tests will be skipped" +[ $STRESS -gt 0 -a $CPUS -lt 3 ] && echo " stress test will be skipped, too" + create_ns chk_gro_flag "default - gro flag" $SRC off chk_gro_flag " - peer gro flag" $DST off @@ -134,6 +246,8 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns +chk_channels "default channels" $DST 1 1 + ip -n $NS_DST link set dev veth$DST down ip netns exec $NS_DST ethtool -K veth$DST gro on chk_gro_flag "with gro enabled on link down - gro flag" $DST on @@ -147,6 +261,56 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns + +CUR_TX=1 +CUR_RX=1 +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "setting tx channels" $DST 1 2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + chk_channels "setting both rx and tx channels" $DST 3 3 + CUR_RX=3 + CUR_TX=3 +fi + +ip netns exec $NS_DST ethtool -L veth$DST combined 2 2>/dev/null +chk_channels "bad setting: combined channels" $DST $CUR_RX $CUR_TX + +ip netns exec $NS_DST ethtool -L veth$DST tx $((CPUS + 1)) 2>/dev/null +chk_channels "setting invalid channels nr" $DST $CUR_RX $CUR_TX + +if [ $CPUS -gt 1 ]; then + # this also tests queues nr reduction + ip netns exec $NS_DST ethtool -L veth$DST rx 1 tx 2 2>/dev/null + ip netns exec $NS_SRC ethtool -L veth$SRC rx 1 tx 2 2>/dev/null + printf "%-60s" "bad setting: XDP with RX nr less than TX" + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + + # the following tests will run with multiple channels active + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null + printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set" + ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + CUR_RX=2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + printf "%-60s" "bad setting: increasing peer TX nr above RX with XDP set" + ip netns exec $NS_SRC ethtool -L veth$SRC tx 3 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + chk_channels "setting invalid channels nr" $DST 2 2 +fi + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null chk_gro_flag "with xdp attached - gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off @@ -167,10 +331,27 @@ chk_gro_flag " - after gro on xdp off, gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off chk_tso_flag " - tso flag" $SRC on chk_tso_flag " - peer tso flag" $DST on + +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 1 + chk_channels "decreasing tx channels with device down" $DST 2 1 +fi + ip -n $NS_DST link set dev veth$DST up ip -n $NS_SRC link set dev veth$SRC up chk_gro " - aggregation" 1 +if [ $CPUS -gt 1 ]; then + [ $STRESS -gt 0 -a $CPUS -gt 2 ] && do_stress + + ip -n $NS_DST link set dev veth$DST down + ip -n $NS_SRC link set dev veth$SRC down + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "increasing tx channels with device down" $DST 2 2 + ip -n $NS_DST link set dev veth$DST up + ip -n $NS_SRC link set dev veth$SRC up +fi + ip netns exec $NS_DST ethtool -K veth$DST gro off ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off chk_gro "aggregation again with default and TSO off" 10 diff --git a/tools/testing/selftests/net/vrf_strict_mode_test.sh b/tools/testing/selftests/net/vrf_strict_mode_test.sh index 18b982d611de..865d53c1781c 100755 --- a/tools/testing/selftests/net/vrf_strict_mode_test.sh +++ b/tools/testing/selftests/net/vrf_strict_mode_test.sh @@ -3,6 +3,9 @@ # This test is designed for testing the new VRF strict_mode functionality. +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ret=0 # identifies the "init" network namespace which is often called root network @@ -371,18 +374,18 @@ vrf_strict_mode_check_support() if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" - exit 0 + exit $ksft_skip fi if [ ! -x "$(command -v ip)" ]; then echo "SKIP: Could not run test without ip tool" - exit 0 + exit $ksft_skip fi modprobe vrf &>/dev/null if [ ! -e /proc/sys/net/vrf/strict_mode ]; then echo "SKIP: vrf sysctl does not exist" - exit 0 + exit $ksft_skip fi cleanup &> /dev/null diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index cd6430b39982..8748199ac109 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ - ipip-conntrack-mtu.sh + ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh LDLIBS = -lmnl TEST_GEN_FILES = nf-queue diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh new file mode 100755 index 000000000000..e7d7bf13cff5 --- /dev/null +++ b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that UNREPLIED tcp conntrack will eventually timeout. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +waittime=20 +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup() { + ip netns pids $ns1 | xargs kill 2>/dev/null + ip netns pids $ns2 | xargs kill 2>/dev/null + + ip netns del $ns1 + ip netns del $ns2 +} + +ipv4() { + echo -n 192.168.$1.2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") + if [ $? -ne 0 ]; then + echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 + ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +# Create test namespaces +ip netns add $ns1 || exit 1 + +trap cleanup EXIT + +ip netns add $ns2 || exit 1 + +# Connect the namespace to the host using a veth pair +ip -net $ns1 link add name veth1 type veth peer name veth2 +ip -net $ns1 link set netns $ns2 dev veth2 + +ip -net $ns1 link set up dev lo +ip -net $ns2 link set up dev lo +ip -net $ns1 link set up dev veth1 +ip -net $ns2 link set up dev veth2 + +ip -net $ns2 addr add 10.11.11.2/24 dev veth2 +ip -net $ns2 route add default via 10.11.11.1 + +ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 + +# add a rule inside NS so we enable conntrack +ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT + +ip -net $ns1 addr add 10.11.11.1/24 dev veth1 +ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 + +# Check connectivity works +ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 + +ip netns exec $ns2 nc -l -p 8080 < /dev/null & + +# however, conntrack entries are there + +ip netns exec $ns2 nft -f - <<EOF +table inet filter { + counter connreq { } + counter redir { } + chain input { + type filter hook input priority 0; policy accept; + ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept + ct state new ct status dnat tcp dport 8080 counter name "redir" accept + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nft rules" + exit 1 +fi + +ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10 + +echo "INFO: connect $ns1 -> $ns2 to the virtual ip" +ip netns exec $ns1 bash -c 'while true ; do + nc -p 60000 10.99.99.99 80 + sleep 1 + done' & + +sleep 1 + +ip netns exec $ns2 nft -f - <<EOF +table inet nat { + chain prerouting { + type nat hook prerouting priority 0; policy accept; + ip daddr 10.99.99.99 tcp dport 80 redirect to :8080 + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nat redirect" + exit 1 +fi + +count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) +if [ $count -eq 0 ]; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + +echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" +for i in $(seq 1 $waittime); do + echo -n "." + + sleep 1 + + count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) + if [ $count -gt 0 ]; then + echo + echo "PASS: redirection took effect after $i seconds" + break + fi + + m=$((i%20)) + if [ $m -eq 0 ]; then + echo " waited for $i seconds" + fi +done + +expect="packets 1 bytes 60" +check_counter "$ns2" "redir" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: redirection counter has expected values" +else + echo "ERROR: no tcp connection was redirected" +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 427d94816f2d..d4ffebb989f8 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -199,7 +199,6 @@ fi # test basic connectivity if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: ns1 cannot reach ns2" 1>&2 - bash exit 1 fi diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index d7e07f4c3d7f..da1c1e4b6c86 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -741,6 +741,149 @@ EOF return $lret } +# test port shadowing. +# create two listening services, one on router (ns0), one +# on client (ns2), which is masqueraded from ns1 point of view. +# ns2 sends udp packet coming from service port to ns1, on a highport. +# Later, if n1 uses same highport to connect to ns0:service, packet +# might be port-forwarded to ns2 instead. + +# second argument tells if we expect the 'fake-entry' to take effect +# (CLIENT) or not (ROUTER). +test_port_shadow() +{ + local test=$1 + local expect=$2 + local daddrc="10.0.1.99" + local daddrs="10.0.1.1" + local result="" + local logmsg="" + + echo ROUTER | ip netns exec "$ns0" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 & + nc_r=$! + + echo CLIENT | ip netns exec "$ns2" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 & + nc_c=$! + + # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. + echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null + + # ns1 tries to connect to ns0:1405. With default settings this should connect + # to client, it matches the conntrack entry created above. + + result=$(echo "" | ip netns exec "$ns1" nc -w 1 -p 41404 -u "$daddrs" 1405) + + if [ "$result" = "$expect" ] ;then + echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" + else + echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" + ret=1 + fi + + kill $nc_r $nc_c 2>/dev/null + + # flush udp entries for next test round, if any + ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 +} + +# This prevents port shadow of router service via packet filter, +# packets claiming to originate from service port from internal +# network are dropped. +test_port_shadow_filter() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family filter { + chain forward { + type filter hook forward priority 0; policy accept; + meta iif veth1 udp sport 1405 drop + } +} +EOF + test_port_shadow "port-filter" "ROUTER" + + ip netns exec "$ns0" nft delete table $family filter +} + +# This prevents port shadow of router service via notrack. +test_port_shadow_notrack() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family raw { + chain prerouting { + type filter hook prerouting priority -300; policy accept; + meta iif veth0 udp dport 1405 notrack + udp dport 1405 notrack + } + chain output { + type filter hook output priority -300; policy accept; + udp sport 1405 notrack + } +} +EOF + test_port_shadow "port-notrack" "ROUTER" + + ip netns exec "$ns0" nft delete table $family raw +} + +# This prevents port shadow of router service via sport remap. +test_port_shadow_pat() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family pat { + chain postrouting { + type nat hook postrouting priority -1; policy accept; + meta iif veth1 udp sport <= 1405 masquerade to : 1406-65535 random + } +} +EOF + test_port_shadow "pat" "ROUTER" + + ip netns exec "$ns0" nft delete table $family pat +} + +test_port_shadowing() +{ + local family="ip" + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family nat { + chain postrouting { + type nat hook postrouting priority 0; policy accept; + meta oif veth0 masquerade + } +} +EOF + if [ $? -ne 0 ]; then + echo "SKIP: Could not add add $family masquerade hook" + return $ksft_skip + fi + + # test default behaviour. Packet from ns1 to ns0 is redirected to ns2. + test_port_shadow "default" "CLIENT" + + # test packet filter based mitigation: prevent forwarding of + # packets claiming to come from the service port. + test_port_shadow_filter "$family" + + # test conntrack based mitigation: connections going or coming + # from router:service bypass connection tracking. + test_port_shadow_notrack "$family" + + # test nat based mitigation: fowarded packets coming from service port + # are masqueraded with random highport. + test_port_shadow_pat "$family" + + ip netns exec "$ns0" nft delete table $family nat +} # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 for i in 0 1 2; do @@ -861,6 +1004,8 @@ reset_counters $test_inet_nat && test_redirect inet $test_inet_nat && test_redirect6 inet +test_port_shadowing + if [ $ret -ne 0 ];then echo -n "FAIL: " nft --version diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/netfilter/nft_nat_zones.sh new file mode 100755 index 000000000000..b9ab37380f33 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_nat_zones.sh @@ -0,0 +1,309 @@ +#!/bin/bash +# +# Test connection tracking zone and NAT source port reallocation support. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Don't increase too much, 2000 clients should work +# just fine but script can then take several minutes with +# KASAN/debug builds. +maxclients=100 + +have_iperf=1 +ret=0 + +# client1---. +# veth1-. +# | +# NAT Gateway --veth0--> Server +# | | +# veth2-' | +# client2---' | +# .... | +# clientX----vethX---' + +# All clients share identical IP address. +# NAT Gateway uses policy routing and conntrack zones to isolate client +# namespaces. Each client connects to Server, each with colliding tuples: +# clientsaddr:10000 -> serveraddr:dport +# NAT Gateway is supposed to do port reallocation for each of the +# connections. + +sfx=$(mktemp -u "XXXXXXXX") +gw="ns-gw-$sfx" +cl1="ns-cl1-$sfx" +cl2="ns-cl2-$sfx" +srv="ns-srv-$sfx" + +v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) +v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) +v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) +v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) +v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) +v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) + +cleanup() +{ + ip netns del $gw + ip netns del $srv + for i in $(seq 1 $maxclients); do + ip netns del ns-cl$i-$sfx 2>/dev/null + done + + sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iperf3 -v >/dev/null 2>&1 +if [ $? -ne 0 ];then + have_iperf=0 +fi + +ip netns add "$gw" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi +ip -net "$gw" link set lo up + +trap cleanup EXIT + +ip netns add "$srv" +if [ $? -ne 0 ];then + echo "SKIP: Could not create server netns $srv" + exit $ksft_skip +fi + +ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" +ip -net "$gw" link set veth0 up +ip -net "$srv" link set lo up +ip -net "$srv" link set eth0 up + +sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + + ip netns add "$cl" + if [ $? -ne 0 ];then + echo "SKIP: Could not create client netns $cl" + exit $ksft_skip + fi + ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip + fi +done + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + echo netns exec "$cl" ip link set lo up + echo netns exec "$cl" ip link set eth0 up + echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 + echo netns exec "$gw" ip link set veth$i up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + + # clients have same IP addresses. + echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 + echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 + + # NB: same addresses on client-facing interfaces. + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i + echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + + # gw: policy routing + echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) +done | ip -batch /dev/stdin + +ip -net "$gw" addr add 10.3.0.1/24 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 + +ip -net "$srv" addr add 10.3.0.99/24 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 + +ip netns exec $gw nft -f /dev/stdin<<EOF +table inet raw { + map iiftomark { + type ifname : mark + } + + map iiftozone { + typeof iifname : ct zone + } + + set inicmp { + flags dynamic + type ipv4_addr . ifname . ipv4_addr + } + set inflows { + flags dynamic + type ipv4_addr . inet_service . ifname . ipv4_addr . inet_service + } + + set inflows6 { + flags dynamic + type ipv6_addr . inet_service . ifname . ipv6_addr . inet_service + } + + chain prerouting { + type filter hook prerouting priority -64000; policy accept; + ct original zone set meta iifname map @iiftozone + meta mark set meta iifname map @iiftomark + + tcp flags & (syn|ack) == ack add @inflows { ip saddr . tcp sport . meta iifname . ip daddr . tcp dport counter } + add @inflows6 { ip6 saddr . tcp sport . meta iifname . ip6 daddr . tcp dport counter } + ip protocol icmp add @inicmp { ip saddr . meta iifname . ip daddr counter } + } + + chain nat_postrouting { + type nat hook postrouting priority 0; policy accept; + ct mark set meta mark meta oifname veth0 masquerade + } + + chain mangle_prerouting { + type filter hook prerouting priority -100; policy accept; + ct direction reply meta mark set ct mark + } +} +EOF + +( echo add element inet raw iiftomark \{ + for i in $(seq 1 $((maxclients-1))); do + echo \"veth$i\" : $i, + done + echo \"veth$maxclients\" : $maxclients \} + echo add element inet raw iiftozone \{ + for i in $(seq 1 $((maxclients-1))); do + echo \"veth$i\" : $i, + done + echo \"veth$maxclients\" : $maxclients \} +) | ip netns exec $gw nft -f /dev/stdin + +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null + +# useful for debugging: allows to use 'ping' from clients to gateway. +ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null + +for i in $(seq 1 $maxclients); do + cl="ns-cl$i-$sfx" + ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & + if [ $? -ne 0 ]; then + echo FAIL: Ping failure from $cl 1>&2 + ret=1 + break + fi +done + +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth$i" 1>&2 + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + break + fi +done + +ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" + ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 +fi + +if [ $ret -eq 0 ]; then + echo "PASS: ping test from all $maxclients namespaces" +fi + +if [ $have_iperf -eq 0 ];then + echo "SKIP: iperf3 not installed" + if [ $ret -ne 0 ];then + exit $ret + fi + exit $ksft_skip +fi + +ip netns exec $srv iperf3 -s > /dev/null 2>&1 & +iperfpid=$! +sleep 1 + +for i in $(seq 1 $maxclients); do + if [ $ret -ne 0 ]; then + break + fi + cl="ns-cl$i-$sfx" + ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null + if [ $? -ne 0 ]; then + echo FAIL: Failure to connect for $cl 1>&2 + ip netns exec $gw conntrack -S 1>&2 + ret=1 + fi +done +if [ $ret -eq 0 ];then + echo "PASS: iperf3 connections for all $maxclients net namespaces" +fi + +kill $iperfpid +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 + break + fi +done +if [ $ret -eq 0 ];then + echo "PASS: Found client connection for all $maxclients net namespaces" +fi + +ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: cannot find return entry on veth0" 1>&2 +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh new file mode 100755 index 000000000000..ac646376eb01 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_zones_many.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +# Test insertion speed for packets with identical addresses/ports +# that are all placed in distinct conntrack zones. + +sfx=$(mktemp -u "XXXXXXXX") +ns="ns-$sfx" + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +zones=20000 +have_ct_tool=0 +ret=0 + +cleanup() +{ + ip netns del $ns +} + +ip netns add $ns +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi + +trap cleanup EXIT + +conntrack -V > /dev/null 2>&1 +if [ $? -eq 0 ];then + have_ct_tool=1 +fi + +ip -net "$ns" link set lo up + +test_zones() { + local max_zones=$1 + +ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 +ip netns exec $ns nft -f /dev/stdin<<EOF +flush ruleset +table inet raw { + map rndzone { + typeof numgen inc mod $max_zones : ct zone + } + + chain output { + type filter hook output priority -64000; policy accept; + udp dport 12345 ct zone set numgen inc mod 65536 map @rndzone + } +} +EOF + ( + echo "add element inet raw rndzone {" + for i in $(seq 1 $max_zones);do + echo -n "$i : $i" + if [ $i -lt $max_zones ]; then + echo "," + else + echo "}" + fi + done + ) | ip netns exec $ns nft -f /dev/stdin + + local i=0 + local j=0 + local outerstart=$(date +%s%3N) + local stop=$outerstart + + while [ $i -lt $max_zones ]; do + local start=$(date +%s%3N) + i=$((i + 10000)) + j=$((j + 1)) + dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null + if [ $? -ne 0 ] ;then + ret=1 + break + fi + + stop=$(date +%s%3N) + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)" + done + + if [ $have_ct_tool -eq 1 ]; then + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries from packet path in $duration ms total" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" + ret=1 + fi + fi + + if [ $ret -ne 0 ];then + echo "FAIL: insert $max_zones entries from packet path" 1>&2 + fi +} + +test_conntrack_tool() { + local max_zones=$1 + + ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + + local outerstart=$(date +%s%3N) + local start=$(date +%s%3N) + local stop=$start + local i=0 + while [ $i -lt $max_zones ]; do + i=$((i + 1)) + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 + if [ $? -ne 0 ];then + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null + echo "FAIL: conntrack -I returned an error" + ret=1 + break + fi + + if [ $((i%10000)) -eq 0 ];then + stop=$(date +%s%3N) + + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total)" + start=$stop + fi + done + + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries via ctnetlink in $duration ms" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" + ret=1 + fi +} + +test_zones $zones + +if [ $have_ct_tool -eq 1 ];then + test_conntrack_tool $zones +else + echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" + if [ $ret -eq 0 ];then + exit $ksft_skip + fi +fi + +exit $ret diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index 381d874cce99..1bddbe934204 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -22,7 +22,11 @@ * XXX: This is wrong on {mips, parisc, powerpc, sparc}. */ #undef O_LARGEFILE +#ifdef __aarch64__ +#define O_LARGEFILE 0x20000 +#else #define O_LARGEFILE 0x8000 +#endif struct open_how_ext { struct open_how inner; @@ -155,7 +159,7 @@ struct flag_test { int err; }; -#define NUM_OPENAT2_FLAG_TESTS 24 +#define NUM_OPENAT2_FLAG_TESTS 25 void test_openat2_flags(void) { @@ -229,6 +233,11 @@ void test_openat2_flags(void) { .name = "invalid how.resolve and O_PATH", .how.flags = O_PATH, .how.resolve = 0x1337, .err = -EINVAL }, + + /* currently unknown upper 32 bit rejected. */ + { .name = "currently unknown bit (1 << 63)", + .how.flags = O_RDONLY | (1ULL << 63), + .how.resolve = 0, .err = -EINVAL }, }; BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_FLAG_TESTS); diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c index 579f0215c6e7..9836838a529f 100644 --- a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c +++ b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c @@ -14,6 +14,7 @@ #include <time.h> #include <sys/types.h> #include <sys/time.h> +#include <sys/syscall.h> #include <signal.h> static volatile int soak_done; @@ -121,7 +122,7 @@ static void do_null_syscall(unsigned long nr) unsigned long i; for (i = 0; i < nr; i++) - getppid(); + syscall(__NR_gettid); } #define TIME(A, STR) \ diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 640fad6cc2c7..0785c2e99d40 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,8 @@ -CFLAGS = -O3 -m64 -I./include +CFLAGS = -O3 -m64 -I./include -I../include TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk -$(TEST_GEN_FILES): gzip_vas.c +$(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c index b099753b50e4..095195a25687 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c +++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c @@ -60,6 +60,7 @@ #include <assert.h> #include <errno.h> #include <signal.h> +#include "utils.h" #include "nxu.h" #include "nx.h" @@ -70,6 +71,8 @@ FILE *nx_gzip_log; #define FNAME_MAX 1024 #define FEXT ".nx.gz" +#define SYSFS_MAX_REQ_BUF_PATH "devices/vio/ibm,compression-v1/nx_gzip_caps/req_max_processed_len" + /* * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure. */ @@ -244,6 +247,7 @@ int compress_file(int argc, char **argv, void *handle) struct nx_gzip_crb_cpb_t *cmdp; uint32_t pagelen = 65536; int fault_tries = NX_MAX_FAULTS; + char buf[32]; cmdp = (void *)(uintptr_t) aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t), @@ -263,8 +267,17 @@ int compress_file(int argc, char **argv, void *handle) assert(NULL != (outbuf = (char *)malloc(outlen))); nxu_touch_pages(outbuf, outlen, pagelen, 1); - /* Compress piecemeal in smallish chunks */ - chunk = 1<<22; + /* + * On PowerVM, the hypervisor defines the maximum request buffer + * size is defined and this value is available via sysfs. + */ + if (!read_sysfs_file(SYSFS_MAX_REQ_BUF_PATH, buf, sizeof(buf))) { + chunk = atoi(buf); + } else { + /* sysfs entry is not available on PowerNV */ + /* Compress piecemeal in smallish chunks */ + chunk = 1<<22; + } /* Write the gzip header to the stream */ num_hdr_bytes = gzip_header_blank(outbuf); diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index c5ecb4634094..010160690227 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -24,7 +24,7 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ fork_cleanup_test ebb_on_child_test \ ebb_on_willing_child_test back_to_back_ebbs_test \ lost_exception_test no_handler_test \ - cycles_with_mmcr2_test + cycles_with_mmcr2_test regs_access_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.h b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h index b5bc2b616075..2c803b5b48d6 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.h +++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h @@ -55,8 +55,6 @@ void ebb_global_disable(void); bool ebb_is_supported(void); void ebb_freeze_pmcs(void); void ebb_unfreeze_pmcs(void); -void event_ebb_init(struct event *e); -void event_leader_ebb_init(struct event *e); int count_pmc(int pmc, uint32_t sample_period); void dump_ebb_state(void); void dump_summary_ebb_state(void); diff --git a/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c b/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c index fc5bf4870d8e..01e827c31169 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c @@ -50,8 +50,6 @@ static int no_handler_test(void) event_close(&event); - dump_ebb_state(); - /* The real test is that we never took an EBB at 0x0 */ return 0; diff --git a/tools/testing/selftests/powerpc/pmu/ebb/regs_access_pmccext_test.c b/tools/testing/selftests/powerpc/pmu/ebb/regs_access_pmccext_test.c new file mode 100644 index 000000000000..1eda8e9932e8 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/ebb/regs_access_pmccext_test.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021, Athira Rajeev, IBM Corp. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <setjmp.h> +#include <signal.h> + +#include "ebb.h" + + +/* + * Test that closing the EBB event clears MMCR0_PMCC and + * sets MMCR0_PMCCEXT preventing further read access to the + * group B PMU registers. + */ + +static int regs_access_pmccext(void) +{ + struct event event; + + SKIP_IF(!ebb_is_supported()); + + event_init_named(&event, 0x1001e, "cycles"); + event_leader_ebb_init(&event); + + FAIL_IF(event_open(&event)); + + ebb_enable_pmc_counting(1); + setup_ebb_handler(standard_ebb_callee); + ebb_global_enable(); + FAIL_IF(ebb_event_enable(&event)); + + mtspr(SPRN_PMC1, pmc_sample_period(sample_period)); + + while (ebb_state.stats.ebb_count < 1) + FAIL_IF(core_busy_loop()); + + ebb_global_disable(); + event_close(&event); + + FAIL_IF(ebb_state.stats.ebb_count == 0); + + /* + * For ISA v3.1, verify the test takes a SIGILL when reading + * PMU regs after the event is closed. With the control bit + * in MMCR0 (PMCCEXT) restricting access to group B PMU regs, + * sigill is expected. + */ + if (have_hwcap2(PPC_FEATURE2_ARCH_3_1)) + FAIL_IF(catch_sigill(dump_ebb_state)); + else + dump_ebb_state(); + + return 0; +} + +int main(void) +{ + return test_harness(regs_access_pmccext, "regs_access_pmccext"); +} diff --git a/tools/testing/selftests/powerpc/primitives/asm/extable.h b/tools/testing/selftests/powerpc/primitives/asm/extable.h new file mode 120000 index 000000000000..6385f059a951 --- /dev/null +++ b/tools/testing/selftests/powerpc/primitives/asm/extable.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/extable.h
\ No newline at end of file diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c index 82f7bdc2e5e6..67ca297c5cca 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c @@ -57,7 +57,7 @@ trans: : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a), [flt_2] "b" (&b), [cptr1] "b" (&cptr[1]) - : "memory", "r7", "r8", "r9", "r10", + : "memory", "r0", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", @@ -113,6 +113,7 @@ int ptrace_tm_gpr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); pid = fork(); if (pid < 0) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c index ad65be6e8e85..6f2bce1b6c5d 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c @@ -65,7 +65,7 @@ trans: : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [gpr_4]"i"(GPR_4), [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a), [flt_4] "b" (&d) - : "memory", "r5", "r6", "r7", + : "memory", "r0", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" @@ -119,6 +119,7 @@ int ptrace_tm_spd_gpr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); pid = fork(); if (pid < 0) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c index 2ecfa1158e2b..e112a34fbe59 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c @@ -129,6 +129,7 @@ int ptrace_tm_spd_tar(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); pid = fork(); if (pid == 0) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c index 6f7fb51f0809..40133d49fe39 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c @@ -129,6 +129,7 @@ int ptrace_tm_spd_vsx(void) int ret, status, i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); for (i = 0; i < 128; i++) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c index 068bfed2e606..880ba6a29a48 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c @@ -114,6 +114,7 @@ int ptrace_tm_spr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(struct shared), 0777|IPC_CREAT); shm_id1 = shmget(IPC_PRIVATE, sizeof(int), 0777|IPC_CREAT); pid = fork(); diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c index 46ef378a15ec..d0db6df0f0ea 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c @@ -117,6 +117,7 @@ int ptrace_tm_tar(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); pid = fork(); if (pid == 0) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c index 70ca01234f79..4f05ce4fd282 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c @@ -113,6 +113,7 @@ int ptrace_tm_vsx(void) int ret, status, i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); for (i = 0; i < 128; i++) { diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index 844d18cd5f93..7488315fd847 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0+ TEST_GEN_PROGS := rfi_flush entry_flush uaccess_flush spectre_v2 +TEST_PROGS := mitigation-patching.sh + top_srcdir = ../../../../.. CFLAGS += -I../../../../../usr/include diff --git a/tools/testing/selftests/powerpc/security/mitigation-patching.sh b/tools/testing/selftests/powerpc/security/mitigation-patching.sh new file mode 100755 index 000000000000..00197acb7ff1 --- /dev/null +++ b/tools/testing/selftests/powerpc/security/mitigation-patching.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +set -euo pipefail + +TIMEOUT=10 + +function do_one +{ + local mitigation="$1" + local orig + local start + local now + + orig=$(cat "$mitigation") + + start=$EPOCHSECONDS + now=$start + + while [[ $((now-start)) -lt "$TIMEOUT" ]] + do + echo 0 > "$mitigation" + echo 1 > "$mitigation" + + now=$EPOCHSECONDS + done + + echo "$orig" > "$mitigation" +} + +rc=0 +cd /sys/kernel/debug/powerpc || rc=1 +if [[ "$rc" -ne 0 ]]; then + echo "Error: couldn't cd to /sys/kernel/debug/powerpc" >&2 + exit 1 +fi + +tainted=$(cat /proc/sys/kernel/tainted) +if [[ "$tainted" -ne 0 ]]; then + echo "Error: kernel already tainted!" >&2 + exit 1 +fi + +mitigations="barrier_nospec stf_barrier count_cache_flush rfi_flush entry_flush uaccess_flush" + +for m in $mitigations +do + do_one "$m" & +done + +echo "Spawned threads enabling/disabling mitigations ..." + +if stress-ng > /dev/null 2>&1; then + stress="stress-ng" +elif stress > /dev/null 2>&1; then + stress="stress" +else + stress="" +fi + +if [[ -n "$stress" ]]; then + "$stress" -m "$(nproc)" -t "$TIMEOUT" & + echo "Spawned VM stressors ..." +fi + +echo "Waiting for timeout ..." +wait + +tainted=$(cat /proc/sys/kernel/tainted) +if [[ "$tainted" -ne 0 ]]; then + echo "Error: kernel became tainted!" >&2 + exit 1 +fi + +echo "OK" +exit 0 diff --git a/tools/testing/selftests/powerpc/signal/signal_tm.c b/tools/testing/selftests/powerpc/signal/signal_tm.c index 5bf2224ef7f2..c9cf66a3daa2 100644 --- a/tools/testing/selftests/powerpc/signal/signal_tm.c +++ b/tools/testing/selftests/powerpc/signal/signal_tm.c @@ -56,6 +56,7 @@ static int test_signal_tm() } SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); for (i = 0; i < MAX_ATTEMPT; i++) { /* diff --git a/tools/testing/selftests/powerpc/tm/tm-exec.c b/tools/testing/selftests/powerpc/tm/tm-exec.c index 260cfdb97d23..c59919d6710d 100644 --- a/tools/testing/selftests/powerpc/tm/tm-exec.c +++ b/tools/testing/selftests/powerpc/tm/tm-exec.c @@ -27,6 +27,7 @@ static char *path; static int test_exec(void) { SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); asm __volatile__( "tbegin.;" diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c index 6efa5a685a77..c27b935f0e9f 100644 --- a/tools/testing/selftests/powerpc/tm/tm-fork.c +++ b/tools/testing/selftests/powerpc/tm/tm-fork.c @@ -21,6 +21,7 @@ int test_fork(void) { SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); asm __volatile__( "tbegin.;" diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c index 29e5f26af7b9..a7bbf034b5bb 100644 --- a/tools/testing/selftests/powerpc/tm/tm-poison.c +++ b/tools/testing/selftests/powerpc/tm/tm-poison.c @@ -20,7 +20,6 @@ #include <sched.h> #include <sys/types.h> #include <signal.h> -#include <inttypes.h> #include "tm.h" @@ -34,6 +33,7 @@ int tm_poison_test(void) bool fail_vr = false; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); cpu = pick_online_cpu(); FAIL_IF(cpu < 0); diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index 4cdb83964bb3..85c940ae6ff8 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -40,6 +40,7 @@ int test_body(void) uint64_t rv, dscr1 = 1, dscr2, texasr; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); printf("Check DSCR TM context switch: "); fflush(stdout); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c index 254f912ad611..657d755b2905 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c @@ -79,6 +79,7 @@ static int tm_signal_context_chk_fpu() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c index 0cc680f61828..400fa70ca71e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c @@ -81,6 +81,7 @@ static int tm_signal_context_chk_gpr() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c index b6d52730a0d8..d628fd302b28 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c @@ -104,6 +104,7 @@ static int tm_signal_context_chk() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c index 8e25e2072ecd..9bd869245bad 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c @@ -153,6 +153,7 @@ static int tm_signal_context_chk() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c index 5908bc6abe60..0b84c9208d62 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c @@ -226,6 +226,7 @@ int tm_signal_pagefault(void) stack_t ss; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!have_userfaultfd()); setup_uf_mem(); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c index 07c388147b75..06b801906f27 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c @@ -32,6 +32,7 @@ int tm_signal_sigreturn_nt(void) struct sigaction trap_sa; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); trap_sa.sa_flags = SA_SIGINFO; trap_sa.sa_sigaction = trap_signal_handler; diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c index cdcf8c5bbbc7..68807aac8dd3 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c @@ -35,6 +35,7 @@ int tm_signal_stack() int pid; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); pid = fork(); if (pid < 0) diff --git a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c index 9a6017a1d769..ffe4e5515f33 100644 --- a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c +++ b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c @@ -55,6 +55,7 @@ int tm_sigreturn(void) uint64_t ret = 0; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); memset(&sa, 0, sizeof(sa)); diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S b/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S index bd1ca25febe4..aed632d29fff 100644 --- a/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S +++ b/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include <ppc-asm.h> +#include <basic_asm.h> #include <asm/unistd.h> .text @@ -26,3 +26,38 @@ FUNC_START(getppid_tm_suspended) 1: li r3, -1 blr + + +.macro scv level + .long (0x44000001 | (\level) << 5) +.endm + +FUNC_START(getppid_scv_tm_active) + PUSH_BASIC_STACK(0) + tbegin. + beq 1f + li r0, __NR_getppid + scv 0 + tend. + POP_BASIC_STACK(0) + blr +1: + li r3, -1 + POP_BASIC_STACK(0) + blr + +FUNC_START(getppid_scv_tm_suspended) + PUSH_BASIC_STACK(0) + tbegin. + beq 1f + li r0, __NR_getppid + tsuspend. + scv 0 + tresume. + tend. + POP_BASIC_STACK(0) + blr +1: + li r3, -1 + POP_BASIC_STACK(0) + blr diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c index becb8207b432..b763354c2eb4 100644 --- a/tools/testing/selftests/powerpc/tm/tm-syscall.c +++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c @@ -19,24 +19,36 @@ #include "utils.h" #include "tm.h" +#ifndef PPC_FEATURE2_SCV +#define PPC_FEATURE2_SCV 0x00100000 /* scv syscall */ +#endif + extern int getppid_tm_active(void); extern int getppid_tm_suspended(void); +extern int getppid_scv_tm_active(void); +extern int getppid_scv_tm_suspended(void); unsigned retries = 0; #define TEST_DURATION 10 /* seconds */ -#define TM_RETRIES 100 -pid_t getppid_tm(bool suspend) +pid_t getppid_tm(bool scv, bool suspend) { int i; pid_t pid; for (i = 0; i < TM_RETRIES; i++) { - if (suspend) - pid = getppid_tm_suspended(); - else - pid = getppid_tm_active(); + if (suspend) { + if (scv) + pid = getppid_scv_tm_suspended(); + else + pid = getppid_tm_suspended(); + } else { + if (scv) + pid = getppid_scv_tm_active(); + else + pid = getppid_tm_active(); + } if (pid >= 0) return pid; @@ -67,6 +79,7 @@ int tm_syscall(void) struct timeval end, now; SKIP_IF(!have_htm_nosc()); + SKIP_IF(htm_is_synthetic()); setbuf(stdout, NULL); @@ -82,15 +95,24 @@ int tm_syscall(void) * Test a syscall within a suspended transaction and verify * that it succeeds. */ - FAIL_IF(getppid_tm(true) == -1); /* Should succeed. */ + FAIL_IF(getppid_tm(false, true) == -1); /* Should succeed. */ /* * Test a syscall within an active transaction and verify that * it fails with the correct failure code. */ - FAIL_IF(getppid_tm(false) != -1); /* Should fail... */ + FAIL_IF(getppid_tm(false, false) != -1); /* Should fail... */ FAIL_IF(!failure_is_persistent()); /* ...persistently... */ FAIL_IF(!failure_is_syscall()); /* ...with code syscall. */ + + /* Now do it all again with scv if it is available. */ + if (have_hwcap2(PPC_FEATURE2_SCV)) { + FAIL_IF(getppid_tm(true, true) == -1); /* Should succeed. */ + FAIL_IF(getppid_tm(true, false) != -1); /* Should fail... */ + FAIL_IF(!failure_is_persistent()); /* ...persistently... */ + FAIL_IF(!failure_is_syscall()); /* ...with code syscall. */ + } + gettimeofday(&now, 0); } diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c index 03be8c47292b..f2a9137f3c1e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-tar.c +++ b/tools/testing/selftests/powerpc/tm/tm-tar.c @@ -26,6 +26,7 @@ int test_tar(void) int i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); for (i = 0; i < num_loops; i++) diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c index 794d574db784..dd5ddffa28b7 100644 --- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c +++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c @@ -96,6 +96,7 @@ int test_tmspr() unsigned long i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); /* To cause some context switching */ thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN); diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c b/tools/testing/selftests/powerpc/tm/tm-trap.c index 11521077f915..97cb74768e30 100644 --- a/tools/testing/selftests/powerpc/tm/tm-trap.c +++ b/tools/testing/selftests/powerpc/tm/tm-trap.c @@ -255,6 +255,7 @@ int tm_trap_test(void) struct sigaction trap_sa; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); trap_sa.sa_flags = SA_SIGINFO; trap_sa.sa_sigaction = trap_signal_handler; diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c index a1348a5f721a..6bf1b65b020d 100644 --- a/tools/testing/selftests/powerpc/tm/tm-unavailable.c +++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c @@ -344,6 +344,7 @@ int tm_unavailable_test(void) cpu_set_t cpuset; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); cpu = pick_online_cpu(); FAIL_IF(cpu < 0); diff --git a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c index e2a0c07e8362..34364ed2b6b7 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c @@ -17,7 +17,6 @@ #include <pthread.h> #include <sys/mman.h> #include <unistd.h> -#include <pthread.h> #include "tm.h" #include "utils.h" @@ -92,6 +91,7 @@ int tm_vmx_unavail_test() pthread_t *thread; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); passed = 1; diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c index c1e788a6df47..1640e7ead69b 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c @@ -46,6 +46,7 @@ int test_vmxcopy() uint64_t aborted = 0; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); fd = mkstemp(tmpfile); diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index c5a1e5c163fc..c03c6e778876 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -10,6 +10,9 @@ #include <asm/tm.h> #include "utils.h" +#include "reg.h" + +#define TM_RETRIES 100 static inline bool have_htm(void) { @@ -31,6 +34,39 @@ static inline bool have_htm_nosc(void) #endif } +/* + * Transactional Memory was removed in ISA 3.1. A synthetic TM implementation + * is provided on P10 for threads running in P8/P9 compatibility mode. The + * synthetic implementation immediately fails after tbegin. This failure sets + * Bit 7 (Failure Persistent) and Bit 15 (Implementation-specific). + */ +static inline bool htm_is_synthetic(void) +{ + int i; + + /* + * Per the ISA, the Failure Persistent bit may be incorrect. Try a few + * times in case we got an Implementation-specific failure on a non ISA + * v3.1 system. On these systems the Implementation-specific failure + * should not be persistent. + */ + for (i = 0; i < TM_RETRIES; i++) { + asm volatile( + "tbegin.;" + "beq 1f;" + "tend.;" + "1:" + : + : + : "memory"); + + if ((__builtin_get_texasr() & (TEXASR_FP | TEXASR_IC)) != + (TEXASR_FP | TEXASR_IC)) + break; + } + return i == TM_RETRIES; +} + static inline long failure_code(void) { return __builtin_get_texasru() >> 24; diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 15d937ba96ca..fd1ffaa5a135 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -68,16 +68,12 @@ do cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { srand(n + me + systime()); ncpus = split(cpus, ca); - curcpu = ca[int(rand() * ncpus + 1)]; - z = ""; - for (i = 1; 4 * i <= curcpu; i++) - z = z "0"; - print "0x" 2 ^ (curcpu % 4) z; + print ca[int(rand() * ncpus + 1)]; }' < /dev/null` n=$(($n+1)) - if ! taskset -p $cpumask $$ > /dev/null 2>&1 + if ! taskset -c -p $cpumask $$ > /dev/null 2>&1 then - echo taskset failure: '"taskset -p ' $cpumask $$ '"' + echo taskset failure: '"taskset -c -p ' $cpumask $$ '"' exit 1 fi diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh index e5cc6b2f195e..1af5d6b86b39 100755 --- a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh +++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh @@ -14,7 +14,7 @@ if test -z "$TORTURE_KCONFIG_KCSAN_ARG" then exit 0 fi -cat $1/*/console.log | +find $1 -name console.log -exec cat {} \; | grep "BUG: KCSAN: " | sed -e 's/^\[[^]]*] //' | sort | diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 46e47a00a7db..5a0023d183da 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -29,7 +29,7 @@ then echo "Usage: $scriptname /path/to/old/run [ options ]" exit 1 fi -if ! cp "$oldrun/batches" $T/batches.oldrun +if ! cp "$oldrun/scenarios" $T/scenarios.oldrun then # Later on, can reconstitute this from console.log files. echo Prior run batches file does not exist: $oldrun/batches @@ -142,7 +142,9 @@ then echo "Cannot copy from $oldrun to $rundir." usage fi -rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +touch "$rundir/log" +echo $scriptname $args | tee -a "$rundir/log" echo $oldrun > "$rundir/re-run" if ! test -d "$rundir/../../bin" then @@ -165,35 +167,18 @@ done grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings . $T/qemu-cmd-settings -grep -v '^#' $T/batches.oldrun | awk ' -BEGIN { - oldbatch = 1; -} - +grep -v '^#' $T/scenarios.oldrun | awk ' { - if (oldbatch != $1) { - print "kvm-test-1-run-batch.sh" curbatch; - curbatch = ""; - oldbatch = $1; - } - curbatch = curbatch " " $2; -} - -END { - print "kvm-test-1-run-batch.sh" curbatch + curbatch = ""; + for (i = 2; i <= NF; i++) + curbatch = curbatch " " $i; + print "kvm-test-1-run-batch.sh" curbatch; }' > $T/runbatches.sh if test -n "$dryrun" then echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log" else - ( cd "$rundir"; sh $T/runbatches.sh ) - kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" - echo | tee -a "$rundir/log" - echo ---- Results directory: $rundir | tee -a "$rundir/log" - kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 - ret=$? - cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" - echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" - exit $ret + ( cd "$rundir"; sh $T/runbatches.sh ) | tee -a "$rundir/log" + kvm-end-run-stats.sh "$rundir" "$starttime" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-assign-cpus.sh b/tools/testing/selftests/rcutorture/bin/kvm-assign-cpus.sh new file mode 100755 index 000000000000..f99b2c146f83 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-assign-cpus.sh @@ -0,0 +1,106 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Produce awk statements roughly depicting the system's CPU and cache +# layout. If the required information is not available, produce +# error messages as awk comments. Successful exit regardless. +# +# Usage: kvm-assign-cpus.sh /path/to/sysfs + +T=/tmp/kvm-assign-cpus.sh.$$ +trap 'rm -rf $T' 0 2 +mkdir $T + +sysfsdir=${1-/sys/devices/system/node} +if ! cd "$sysfsdir" > $T/msg 2>&1 +then + sed -e 's/^/# /' < $T/msg + exit 0 +fi +nodelist="`ls -d node*`" +for i in node* +do + if ! test -d $i/ + then + echo "# Not a directory: $sysfsdir/node*" + exit 0 + fi + for j in $i/cpu*/cache/index* + do + if ! test -d $j/ + then + echo "# Not a directory: $sysfsdir/$j" + exit 0 + else + break + fi + done + indexlist="`ls -d $i/cpu* | grep 'cpu[0-9][0-9]*' | head -1 | sed -e 's,^.*$,ls -d &/cache/index*,' | sh | sed -e 's,^.*/,,'`" + break +done +for i in node*/cpu*/cache/index*/shared_cpu_list +do + if ! test -f $i + then + echo "# Not a file: $sysfsdir/$i" + exit 0 + else + break + fi +done +firstshared= +for i in $indexlist +do + rm -f $T/cpulist + for n in node* + do + f="$n/cpu*/cache/$i/shared_cpu_list" + if ! cat $f > $T/msg 2>&1 + then + sed -e 's/^/# /' < $T/msg + exit 0 + fi + cat $f >> $T/cpulist + done + if grep -q '[-,]' $T/cpulist + then + if test -z "$firstshared" + then + firstshared="$i" + fi + fi +done +if test -z "$firstshared" +then + splitindex="`echo $indexlist | sed -e 's/ .*$//'`" +else + splitindex="$firstshared" +fi +nodenum=0 +for n in node* +do + cat $n/cpu*/cache/$splitindex/shared_cpu_list | sort -u -k1n | + awk -v nodenum="$nodenum" ' + BEGIN { + idx = 0; + } + + { + nlists = split($0, cpulists, ","); + for (i = 1; i <= nlists; i++) { + listsize = split(cpulists[i], cpus, "-"); + if (listsize == 1) + cpus[2] = cpus[1]; + for (j = cpus[1]; j <= cpus[2]; j++) { + print "cpu[" nodenum "][" idx "] = " j ";"; + idx++; + } + } + } + + END { + print "nodecpus[" nodenum "] = " idx ";"; + }' + nodenum=`expr $nodenum + 1` +done +echo "numnodes = $nodenum;" diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 115e1822b26f..5ad973dca820 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -40,8 +40,10 @@ if test $retval -gt 1 then exit 2 fi -ncpus=`cpus2use.sh` -make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 + +# Tell "make" to use double the number of real CPUs on the build system. +ncpus="`getconf _NPROCESSORS_ONLN`" +make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh new file mode 100755 index 000000000000..e4a00779b8c6 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Check the status of the specified run. +# +# Usage: kvm-end-run-stats.sh /path/to/run starttime +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +# scriptname=$0 +# args="$*" +rundir="$1" +if ! test -d "$rundir" +then + echo kvm-end-run-stats.sh: Specified run directory does not exist: $rundir + exit 1 +fi + +T=${TMPDIR-/tmp}/kvm-end-run-stats.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh +default_starttime="`get_starttime`" +starttime="${2-default_starttime}" + +echo | tee -a "$rundir/log" +echo | tee -a "$rundir/log" +echo " --- `date` Test summary:" | tee -a "$rundir/log" +echo Results directory: $rundir | tee -a "$rundir/log" +kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" +kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 +ret=$? +cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" +echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" +exit $ret diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 0670841122d8..daf64b507038 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -43,7 +43,7 @@ then else echo No build errors. fi -if grep -q -e "--buildonly" < ${rundir}/log +if grep -q -e "--build-\?only" < ${rundir}/log && ! test -f "${rundir}/remote-log" then echo Build-only run, no console logs to check. exit $editorret diff --git a/tools/testing/selftests/rcutorture/bin/kvm-get-cpus-script.sh b/tools/testing/selftests/rcutorture/bin/kvm-get-cpus-script.sh new file mode 100755 index 000000000000..20c7c53c5795 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-get-cpus-script.sh @@ -0,0 +1,88 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Create an awk script that takes as input numbers of CPUs and outputs +# lists of CPUs, one per line in both cases. +# +# Usage: kvm-get-cpus-script.sh /path/to/cpu/arrays /path/to/put/script [ /path/to/state ] +# +# The CPU arrays are output by kvm-assign-cpus.sh, and are valid awk +# statements initializing the variables describing the system's topology. +# +# The optional state is input by this script (if the file exists and is +# non-empty), and can also be output by this script. + +cpuarrays="${1-/sys/devices/system/node}" +scriptfile="${2}" +statefile="${3}" + +if ! test -f "$cpuarrays" +then + echo "File not found: $cpuarrays" 1>&2 + exit 1 +fi +scriptdir="`dirname "$scriptfile"`" +if ! test -d "$scriptdir" || ! test -x "$scriptdir" || ! test -w "$scriptdir" +then + echo "Directory not usable for script output: $scriptdir" + exit 1 +fi + +cat << '___EOF___' > "$scriptfile" +BEGIN { +___EOF___ +cat "$cpuarrays" >> "$scriptfile" +if test -r "$statefile" +then + cat "$statefile" >> "$scriptfile" +fi +cat << '___EOF___' >> "$scriptfile" +} + +# Do we have the system architecture to guide CPU affinity? +function gotcpus() +{ + return numnodes != ""; +} + +# Return a comma-separated list of the next n CPUs. +function nextcpus(n, i, s) +{ + for (i = 0; i < n; i++) { + if (nodecpus[curnode] == "") + curnode = 0; + if (cpu[curnode][curcpu[curnode]] == "") + curcpu[curnode] = 0; + if (s != "") + s = s ","; + s = s cpu[curnode][curcpu[curnode]]; + curcpu[curnode]++; + curnode++ + } + return s; +} + +# Dump out the current node/CPU state so that a later invocation of this +# script can continue where this one left off. Of course, this only works +# when a state file was specified and where there was valid sysfs state. +# Returns 1 if the state was dumped, 0 otherwise. +# +# Dumping the state for one system configuration and loading it into +# another isn't likely to do what you want, whatever that might be. +function dumpcpustate( i, fn) +{ +___EOF___ +echo ' fn = "'"$statefile"'";' >> $scriptfile +cat << '___EOF___' >> "$scriptfile" + if (fn != "" && gotcpus()) { + print "curnode = " curnode ";" > fn; + for (i = 0; i < numnodes; i++) + if (curcpu[i] != "") + print "curcpu[" i "] = " curcpu[i] ";" >> fn; + return 1; + } + if (fn != "") + print "# No CPU state to dump." > fn; + return 0; +} +___EOF___ diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh index f3a7a5e2b89d..db2c0e2c8e1d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh @@ -25,7 +25,7 @@ then echo "$configfile -------" else title="$configfile ------- $ncs acquisitions/releases" - dur=`sed -e 's/^.* locktorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null` + dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* locktorture.shutdown_secs=//' -e 's/ .*$//' 2> /dev/null` if test -z "$dur" then : diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 1706cd4466b4..fbdf162b6acd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -31,7 +31,7 @@ then echo "$configfile ------- " $stopstate else title="$configfile ------- $ngps GPs" - dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null` + dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//'` if test -z "$dur" then : diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh index 671bfee4fcef..3afa5c6eda4f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh @@ -25,7 +25,7 @@ if test -z "$nscfs" then echo "$configfile ------- " else - dur="`sed -e 's/^.* scftorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`" + dur="`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* scftorture.shutdown_secs=//' -e 's/ .*$//' 2> /dev/null`" if test -z "$dur" then rate="" diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index e01b31b87044..0a5419982ab3 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -74,7 +74,10 @@ do done if test -f "$rd/kcsan.sum" then - if grep -q CONFIG_KCSAN=y $T + if ! test -f $T + then + : + elif grep -q CONFIG_KCSAN=y $T then echo "Compiler or architecture does not support KCSAN!" echo Did you forget to switch your compiler with '--kmake-arg CC=<cc-that-supports-kcsan>'? diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote-noreap.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote-noreap.sh new file mode 100755 index 000000000000..014ce68260d7 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote-noreap.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Periodically scan a directory tree to prevent files from being reaped +# by systemd and friends on long runs. +# +# Usage: kvm-remote-noreap.sh pathname +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +pathname="$1" +if test "$pathname" = "" +then + echo Usage: kvm-remote-noreap.sh pathname + exit 1 +fi +if ! test -d "$pathname" +then + echo Usage: kvm-remote-noreap.sh pathname + echo " pathname must be a directory." + exit 2 +fi + +while test -d "$pathname" +do + find "$pathname" -type f -exec touch -c {} \; > /dev/null 2>&1 + sleep 30 +done diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh new file mode 100755 index 000000000000..03126eb6ec5a --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Run a series of tests on remote systems under KVM. +# +# Usage: kvm-remote.sh "systems" [ <kvm.sh args> ] +# kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ] +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +scriptname=$0 +args="$*" + +if ! test -d tools/testing/selftests/rcutorture/bin +then + echo $scriptname must be run from top-level directory of kernel source tree. + exit 1 +fi + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh + +starttime="`get_starttime`" + +systems="$1" +if test -z "$systems" +then + echo $scriptname: Empty list of systems will go nowhere good, giving up. + exit 1 +fi +shift + +# Pathnames: +# T: /tmp/kvm-remote.sh.$$ +# resdir: /tmp/kvm-remote.sh.$$/res +# rundir: /tmp/kvm-remote.sh.$$/res/$ds ("-remote" suffix) +# oldrun: `pwd`/tools/testing/.../res/$otherds +# +# Pathname segments: +# TD: kvm-remote.sh.$$ +# ds: yyyy.mm.dd-hh.mm.ss-remote + +TD=kvm-remote.sh.$$ +T=${TMPDIR-/tmp}/$TD +trap 'rm -rf $T' 0 +mkdir $T + +resdir="$T/res" +ds=`date +%Y.%m.%d-%H.%M.%S`-remote +rundir=$resdir/$ds +echo Results directory: $rundir +echo $scriptname $args +if echo $1 | grep -q '^--' +then + # Fresh build. Create a datestamp unless the caller supplied one. + datestamp="`echo "$@" | awk -v ds="$ds" '{ + for (i = 1; i < NF; i++) { + if ($i == "--datestamp") { + ds = ""; + break; + } + } + if (ds != "") + print "--datestamp " ds; + }'`" + kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm.sh failed exit code $? + cat $T/kvm.sh.out + exit 2 + fi + oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`" + touch "$oldrun/remote-log" + echo $scriptname $args >> "$oldrun/remote-log" + echo | tee -a "$oldrun/remote-log" + echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log" + cat $T/kvm.sh.out | tee -a "$oldrun/remote-log" + # We are going to run this, so remove the buildonly files. + rm -f "$oldrun"/*/buildonly + kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log" + cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log" + exit 2 + fi +else + # Re-use old run. + oldrun="$1" + if ! echo $oldrun | grep -q '^/' + then + oldrun="`pwd`/$oldrun" + fi + shift + touch "$oldrun/remote-log" + echo $scriptname $args >> "$oldrun/remote-log" + kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log" + cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log" + exit 2 + fi + cp -a "$rundir" "$KVM/res/" + oldrun="$KVM/res/$ds" +fi +echo | tee -a "$oldrun/remote-log" +echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log" +cat $T/kvm-again.sh.out +echo | tee -a "$oldrun/remote-log" +echo Remote run directory: $rundir | tee -a "$oldrun/remote-log" +echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log" + +# Create the kvm-remote-N.sh scripts in the bin directory. +awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" ' +{ + n = $1; + sub(/\./, "", n); + fn = dest "/kvm-remote-" n ".sh" + print "kvm-remote-noreap.sh " rundir " &" > fn; + scenarios = ""; + for (i = 2; i <= NF; i++) + scenarios = scenarios " " $i; + print "kvm-test-1-run-batch.sh" scenarios >> fn; + print "sync" >> fn; + print "rm " rundir "/remote.run" >> fn; +}' +chmod +x $T/bin/kvm-remote-*.sh +( cd "`dirname $T`"; tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" ) + +# Check first to avoid the need for cleanup for system-name typos +for i in $systems +do + ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`" + echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" + ret=$? + if test "$ret" -ne 0 + then + echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log" + exit 4 | tee -a "$oldrun/remote-log" + fi +done + +# Download and expand the tarball on all systems. +for i in $systems +do + echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log" + cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + ret=$? + if test "$ret" -ne 0 + then + echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log" + exit 10 | tee -a "$oldrun/remote-log" + fi +done + +# Function to check for presence of a file on the specified system. +# Complain if the system cannot be reached, and retry after a wait. +# Currently just waits forever if a machine disappears. +# +# Usage: checkremotefile system pathname +checkremotefile () { + local ret + local sleeptime=60 + + while : + do + ssh $1 "test -f \"$2\"" + ret=$? + if test "$ret" -eq 255 + then + echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` + elif test "$ret" -eq 0 + then + return 0 + elif test "$ret" -eq 1 + then + echo " ---" File \"$2\" not found: ssh $1 test -f \"$2\" + return 1 + else + echo " ---" Exit code $ret: ssh $1 test -f \"$2\", retry after $sleeptime seconds. `date` + return $ret + fi + sleep $sleeptime + done +} + +# Function to start batches on idle remote $systems +# +# Usage: startbatches curbatch nbatches +# +# Batches are numbered starting at 1. Returns the next batch to start. +# Be careful to redirect all debug output to FD 2 (stderr). +startbatches () { + local curbatch="$1" + local nbatches="$2" + local ret + + # Each pass through the following loop examines one system. + for i in $systems + do + if test "$curbatch" -gt "$nbatches" + then + echo $((nbatches + 1)) + return 0 + fi + if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2 + then + continue # System still running last test, skip. + fi + ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 + ret=$? + if test "$ret" -ne 0 + then + echo ssh $i failed: exitcode $ret 1>&2 + exit 11 + fi + echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2 + curbatch=$((curbatch + 1)) + done + echo $curbatch +} + +# Launch all the scenarios. +nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`" +curbatch=1 +while test "$curbatch" -le "$nbatches" +do + startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr + curbatch="`cat $T/curbatch`" + if test -s "$T/startbatches.stderr" + then + cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log" + fi + if test "$curbatch" -le "$nbatches" + then + sleep 30 + fi +done +echo All batches started. `date` + +# Wait for all remaining scenarios to complete and collect results. +for i in $systems +do + while checkremotefile "$i" "$resdir/$ds/remote.run" + do + sleep 30 + done + echo " ---" Collecting results from $i `date` + ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) +done + +( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log" +exit "`cat $T/exitcode`" diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh index 7ea0809e229e..1e29d656501b 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -50,10 +50,34 @@ grep '^#' $1/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings echo ---- System running test: `uname -a` echo ---- Starting kernels. `date` | tee -a log $TORTURE_JITTER_START +kvm-assign-cpus.sh /sys/devices/system/node > $T/cpuarray.awk for i in "$@" do echo ---- System running test: `uname -a` > $i/kvm-test-1-run-qemu.sh.out echo > $i/kvm-test-1-run-qemu.sh.out + export TORTURE_AFFINITY= + kvm-get-cpus-script.sh $T/cpuarray.awk $T/cpubatches.awk $T/cpustate + cat << ' ___EOF___' >> $T/cpubatches.awk + END { + affinitylist = ""; + if (!gotcpus()) { + print "echo No CPU-affinity information, so no taskset command."; + } else if (cpu_count !~ /^[0-9][0-9]*$/) { + print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command."; + } else { + affinitylist = nextcpus(cpu_count); + if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/)) + print "echo " scenario ": Bogus CPU-affinity information, so no taskset command."; + else if (!dumpcpustate()) + print "echo " scenario ": Could not dump state, so no taskset command."; + else + print "export TORTURE_AFFINITY=" affinitylist; + } + } + ___EOF___ + cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`" + affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`" + $affinity_export kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & done for i in $runfiles diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh index 5b1aa2a4f3f6..44280582c594 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -39,27 +39,34 @@ echo ' ---' `date`: Starting kernel, PID $$ grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings . $T/qemu-cmd-settings -# Decorate qemu-cmd with redirection, backgrounding, and PID capture -sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd -echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd +# Decorate qemu-cmd with affinity, redirection, backgrounding, and PID capture +taskset_command= +if test -n "$TORTURE_AFFINITY" +then + taskset_command="taskset -c $TORTURE_AFFINITY " +fi +sed -e 's/^[^#].*$/'"$taskset_command"'& 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'qemu_pid=$!' >> $T/qemu-cmd +echo 'echo $qemu_pid > $resdir/qemu-pid' >> $T/qemu-cmd +echo 'taskset -c -p $qemu_pid > $resdir/qemu-affinity' >> $T/qemu-cmd # In case qemu refuses to run... echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log # Attempt to run qemu kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +( . $T/qemu-cmd; wait `cat $resdir/qemu-pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 if test -z "$TORTURE_KCONFIG_GDB_ARG" then sleep 10 # Give qemu's pid a chance to reach the file - if test -s "$resdir/qemu_pid" + if test -s "$resdir/qemu-pid" then - qemu_pid=`cat "$resdir/qemu_pid"` - echo Monitoring qemu job at pid $qemu_pid + qemu_pid=`cat "$resdir/qemu-pid"` + echo Monitoring qemu job at pid $qemu_pid `date` else qemu_pid="" - echo Monitoring qemu job at yet-as-unknown pid + echo Monitoring qemu job at yet-as-unknown pid `date` fi fi if test -n "$TORTURE_KCONFIG_GDB_ARG" @@ -82,9 +89,9 @@ then fi while : do - if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + if test -z "$qemu_pid" && test -s "$resdir/qemu-pid" then - qemu_pid=`cat "$resdir/qemu_pid"` + qemu_pid=`cat "$resdir/qemu-pid"` fi kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 @@ -115,22 +122,22 @@ do break fi done -if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +if test -z "$qemu_pid" && test -s "$resdir/qemu-pid" then - qemu_pid=`cat "$resdir/qemu_pid"` + qemu_pid=`cat "$resdir/qemu-pid"` fi -if test $commandcompleted -eq 0 -a -n "$qemu_pid" +if test $commandcompleted -eq 0 && test -n "$qemu_pid" then if ! test -f "$resdir/../STOP.1" then - echo Grace period for qemu job at pid $qemu_pid + echo Grace period for qemu job at pid $qemu_pid `date` fi oldline="`tail $resdir/console.log`" while : do if test -f "$resdir/../STOP.1" then - echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 + echo "PID $qemu_pid killed due to run STOP.1 request `date`" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid break fi @@ -152,13 +159,17 @@ then then last_ts=0 fi - if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) + if test "$newline" != "$oldline" && test "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) && test "$last_ts" -gt "$TORTURE_SHUTDOWN_GRACE" then must_continue=yes + if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + echo Continuing at console.log time $last_ts \"`tail -n 1 $resdir/console.log`\" `date` + fi fi - if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + if test $must_continue = no && test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) then - echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 + echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds `date`" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid break fi @@ -172,5 +183,3 @@ fi # Tell the script that this run is done. rm -f $resdir/build.run - -parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 420ed5ce9d32..f4c8055dbf7a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -205,6 +205,7 @@ echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cm echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd echo "# TORTURE_TRUST_MAKE=\"$TORTURE_TRUST_MAKE\"; export TORTURE_TRUST_MAKE" >> $resdir/qemu-cmd +echo "# TORTURE_CPU_COUNT=$cpu_count" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then @@ -214,3 +215,4 @@ then fi kvm-test-1-run-qemu.sh $resdir +parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 6bf00a003d3d..f442d84fb2a3 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -20,6 +20,9 @@ mkdir $T cd `dirname $scriptname`/../../../../../ +# This script knows only English. +LANG=en_US.UTF-8; export LANG + dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM @@ -41,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 +TORTURE_REMOTE= TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu TORTURE_MOD=rcutorture @@ -64,7 +68,7 @@ usage () { echo " --cpus N" echo " --datestamp string" echo " --defconfig string" - echo " --dryrun batches|sched|script" + echo " --dryrun batches|scenarios|sched|script" echo " --duration minutes | <seconds>s | <hours>h | <days>d" echo " --gdb" echo " --help" @@ -77,6 +81,7 @@ usage () { echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." + echo " --remote" echo " --results absolute-pathname" echo " --torture lock|rcu|rcuscale|refscale|scf" echo " --trust-make" @@ -112,10 +117,13 @@ do checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--' cpus=$2 TORTURE_ALLOTED_CPUS="$2" - max_cpus="`identify_qemu_vcpus`" - if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus" + if test -z "$TORTURE_REMOTE" then - TORTURE_ALLOTED_CPUS=$max_cpus + max_cpus="`identify_qemu_vcpus`" + if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus" + then + TORTURE_ALLOTED_CPUS=$max_cpus + fi fi shift ;; @@ -130,7 +138,7 @@ do shift ;; --dryrun) - checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--' + checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|scenarios\|sched\|script' '^--' dryrun=$2 shift ;; @@ -206,6 +214,9 @@ do TORTURE_QEMU_CMD="$2" shift ;; + --remote) + TORTURE_REMOTE=1 + ;; --results) checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error' resdir=$2 @@ -419,17 +430,10 @@ then git diff HEAD >> $resdir/$ds/testid.txt fi ___EOF___ -awk < $T/cfgcpu.pack \ - -v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \ - -v CONFIGDIR="$CONFIGFRAG/" \ - -v KVM="$KVM" \ - -v ncpus=$cpus \ - -v jitter="$jitter" \ - -v rd=$resdir/$ds/ \ - -v dur=$dur \ - -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ - -v TORTURE_BOOTARGS="$TORTURE_BOOTARGS" \ -'BEGIN { +kvm-assign-cpus.sh /sys/devices/system/node > $T/cpuarray.awk +kvm-get-cpus-script.sh $T/cpuarray.awk $T/dumpbatches.awk +cat << '___EOF___' >> $T/dumpbatches.awk +BEGIN { i = 0; } @@ -440,7 +444,7 @@ awk < $T/cfgcpu.pack \ } # Dump out the scripting required to run one test batch. -function dump(first, pastlast, batchnum) +function dump(first, pastlast, batchnum, affinitylist) { print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log"; print "needqemurun=" @@ -472,6 +476,14 @@ function dump(first, pastlast, batchnum) print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log"; print "mkdir " rd cfr[jn] " || :"; print "touch " builddir ".wait"; + affinitylist = ""; + if (gotcpus()) { + affinitylist = nextcpus(cpusr[jn]); + } + if (affinitylist ~ /^[0-9,-][0-9,-]*$/) + print "export TORTURE_AFFINITY=" affinitylist; + else + print "export TORTURE_AFFINITY="; print "kvm-test-1-run.sh " CONFIGDIR cf[j], rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; print "while test -f " builddir ".wait" @@ -549,21 +561,20 @@ END { # Dump the last batch. if (ncpus != 0) dump(first, i, batchnum); -}' >> $T/script - -cat << '___EOF___' >> $T/script -echo | tee -a $TORTURE_RESDIR/log -echo | tee -a $TORTURE_RESDIR/log -echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log -___EOF___ -cat << ___EOF___ >> $T/script -echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log -kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log -kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1 +} ___EOF___ -echo 'ret=$?' >> $T/script -echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script -echo 'exit $ret' >> $T/script +awk < $T/cfgcpu.pack \ + -v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \ + -v CONFIGDIR="$CONFIGFRAG/" \ + -v KVM="$KVM" \ + -v ncpus=$cpus \ + -v jitter="$jitter" \ + -v rd=$resdir/$ds/ \ + -v dur=$dur \ + -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ + -v TORTURE_BOOTARGS="$TORTURE_BOOTARGS" \ + -f $T/dumpbatches.awk >> $T/script +echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script # Extract the tests and their batches from the script. egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | @@ -577,6 +588,25 @@ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | print batchno, $1, $2 }' > $T/batches +# As above, but one line per batch. +grep -v '^#' $T/batches | awk ' +BEGIN { + oldbatch = 1; +} + +{ + if (oldbatch != $1) { + print ++n ". " curbatch; + curbatch = ""; + oldbatch = $1; + } + curbatch = curbatch " " $2; +} + +END { + print ++n ". " curbatch; +}' > $T/scenarios + if test "$dryrun" = script then cat $T/script @@ -597,13 +627,17 @@ elif test "$dryrun" = batches then cat $T/batches exit 0 +elif test "$dryrun" = scenarios +then + cat $T/scenarios + exit 0 else # Not a dryrun. Record the batches and the number of CPUs, then run the script. bash $T/script ret=$? cp $T/batches $resdir/$ds/batches + cp $T/scenarios $resdir/$ds/scenarios echo '#' cpus=$cpus >> $resdir/$ds/batches - echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret fi diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 56e2e1a42569..363f56081eff 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -53,6 +53,7 @@ do_refscale=yes do_kvfree=yes do_kasan=yes do_kcsan=no +do_clocksourcewd=yes # doyesno - Helper function for yes/no arguments function doyesno () { @@ -72,6 +73,7 @@ usage () { echo " --configs-scftorture \"config-file list w/ repeat factor (2*CFLIST)\"" echo " --doall" echo " --doallmodconfig / --do-no-allmodconfig" + echo " --do-clocksourcewd / --do-no-clocksourcewd" echo " --do-kasan / --do-no-kasan" echo " --do-kcsan / --do-no-kcsan" echo " --do-kvfree / --do-no-kvfree" @@ -109,7 +111,7 @@ do configs_scftorture="$configs_scftorture $2" shift ;; - --doall) + --do-all|--doall) do_allmodconfig=yes do_rcutorture=yes do_locktorture=yes @@ -119,10 +121,14 @@ do do_kvfree=yes do_kasan=yes do_kcsan=yes + do_clocksourcewd=yes ;; --do-allmodconfig|--do-no-allmodconfig) do_allmodconfig=`doyesno "$1" --do-allmodconfig` ;; + --do-clocksourcewd|--do-no-clocksourcewd) + do_clocksourcewd=`doyesno "$1" --do-clocksourcewd` + ;; --do-kasan|--do-no-kasan) do_kasan=`doyesno "$1" --do-kasan` ;; @@ -135,7 +141,7 @@ do --do-locktorture|--do-no-locktorture) do_locktorture=`doyesno "$1" --do-locktorture` ;; - --do-none) + --do-none|--donone) do_allmodconfig=no do_rcutorture=no do_locktorture=no @@ -145,6 +151,7 @@ do do_kvfree=no do_kasan=no do_kcsan=no + do_clocksourcewd=no ;; --do-rcuscale|--do-no-rcuscale) do_rcuscale=`doyesno "$1" --do-rcuscale` @@ -279,9 +286,9 @@ function torture_one { # torture_bootargs="[ kernel boot arguments ]" # torture_set flavor [ kvm.sh arguments ] # -# Note that "flavor" is an arbitrary string. Supply --torture if needed. -# Note that quoting is problematic. So on the command line, pass multiple -# values with multiple kvm.sh argument instances. +# Note that "flavor" is an arbitrary string that does not affect kvm.sh +# in any way. So also supply --torture if you need something other than +# the default. function torture_set { local cur_kcsan_kmake_args= local kcsan_kmake_tag= @@ -302,7 +309,7 @@ function torture_set { kcsan_kmake_tag="--kmake-args" cur_kcsan_kmake_args="$kcsan_kmake_args" fi - torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan + torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan fi } @@ -377,6 +384,22 @@ then torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi +if test "$do_clocksourcewd" = "yes" +then + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make + + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1" + torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make + + # In case our work is already done... + if test "$do_rcutorture" != "yes" + then + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_set "clocksourcewd-3" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --trust-make + fi +fi + echo " --- " $scriptname $args echo " --- " Done `date` | tee -a $T/log ret=0 @@ -395,6 +418,10 @@ then nfailures="`wc -l "$T/failures" | awk '{ print $1 }'`" ret=2 fi +if test "$do_kcsan" = "yes" +then + TORTURE_KCONFIG_KCSAN_ARG=1 tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh tools/testing/selftests/rcutorture/res/$ds > tools/testing/selftests/rcutorture/res/$ds/kcsan.sum +fi echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST new file mode 100644 index 000000000000..22d598f9cabe --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST @@ -0,0 +1,17 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=16 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=y +CONFIG_NO_HZ_IDLE=n +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_TRACE=y +CONFIG_HOTPLUG_CPU=y +CONFIG_RCU_FANOUT=2 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot new file mode 100644 index 000000000000..f57720c52c0f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot @@ -0,0 +1,8 @@ +rcutorture.test_boost=2 +rcutorture.stutter=0 +rcutree.gp_preinit_delay=12 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 +rcutree.kthread_prio=2 +threadirqs +tree.use_softirq=0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 index bafe94cbd739..3ca112444ce7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=2 +CONFIG_NR_CPUS=4 CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 index bafe94cbd739..3ca112444ce7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=2 +CONFIG_NR_CPUS=4 CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index ea4399020c6c..dc02083803ce 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=2 +CONFIG_NR_CPUS=4 CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE index 721cfda76ab2..4cc1cc581321 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 index 7629f5dd73b2..f5952061fde7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 @@ -8,7 +8,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT index 1cd25b7314e3..ad505a887bec 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT index d10bc694f42c..4f08e641bb6b 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h index cf6938d679d7..1e24827f96f1 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h @@ -174,7 +174,7 @@ static inline bool spin_trylock(spinlock_t *lock) } struct completion { - /* Hopefuly this won't overflow. */ + /* Hopefully this won't overflow. */ unsigned int count; }; diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README index 4b36b25b6ac0..3d2bbd4fa3aa 100644 --- a/tools/testing/selftests/resctrl/README +++ b/tools/testing/selftests/resctrl/README @@ -47,7 +47,7 @@ Parameter '-h' shows usage information. usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf - -t test list: run tests specified in the test list, e.g. -t mbm, mba, cmt, cat + -t test list: run tests specified in the test list, e.g. -t mbm,mba,cmt,cat -n no_of_bits: run cache tests using specified no of bits in cache bit mask -p cpu_no: specify CPU number to run the test. 1 is default -h: help diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index f51b5fc066a3..973f09a66e1e 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -40,7 +40,7 @@ static void cmd_help(void) printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cmt, cat\n"); + printf("e.g. -t mbm,mba,cmt,cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); printf("\t-h: help\n"); @@ -173,7 +173,7 @@ int main(int argc, char **argv) return -1; } - token = strtok(NULL, ":\t"); + token = strtok(NULL, ","); } break; case 'p': diff --git a/tools/testing/selftests/rlimits/.gitignore b/tools/testing/selftests/rlimits/.gitignore new file mode 100644 index 000000000000..091021f255b3 --- /dev/null +++ b/tools/testing/selftests/rlimits/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +rlimits-per-userns diff --git a/tools/testing/selftests/rlimits/Makefile b/tools/testing/selftests/rlimits/Makefile new file mode 100644 index 000000000000..03aadb406212 --- /dev/null +++ b/tools/testing/selftests/rlimits/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g +TEST_GEN_PROGS := rlimits-per-userns + +include ../lib.mk diff --git a/tools/testing/selftests/rlimits/config b/tools/testing/selftests/rlimits/config new file mode 100644 index 000000000000..416bd53ce982 --- /dev/null +++ b/tools/testing/selftests/rlimits/config @@ -0,0 +1 @@ +CONFIG_USER_NS=y diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c new file mode 100644 index 000000000000..26dc949e93ea --- /dev/null +++ b/tools/testing/selftests/rlimits/rlimits-per-userns.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Alexey Gladkov <gladkov.alexey@gmail.com> + */ +#define _GNU_SOURCE +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/prctl.h> +#include <sys/stat.h> + +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sched.h> +#include <signal.h> +#include <limits.h> +#include <fcntl.h> +#include <errno.h> +#include <err.h> + +#define NR_CHILDS 2 + +static char *service_prog; +static uid_t user = 60000; +static uid_t group = 60000; + +static void setrlimit_nproc(rlim_t n) +{ + pid_t pid = getpid(); + struct rlimit limit = { + .rlim_cur = n, + .rlim_max = n + }; + + warnx("(pid=%d): Setting RLIMIT_NPROC=%ld", pid, n); + + if (setrlimit(RLIMIT_NPROC, &limit) < 0) + err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid); +} + +static pid_t fork_child(void) +{ + pid_t pid = fork(); + + if (pid < 0) + err(EXIT_FAILURE, "fork"); + + if (pid > 0) + return pid; + + pid = getpid(); + + warnx("(pid=%d): New process starting ...", pid); + + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) + err(EXIT_FAILURE, "(pid=%d): prctl(PR_SET_PDEATHSIG)", pid); + + signal(SIGUSR1, SIG_DFL); + + warnx("(pid=%d): Changing to uid=%d, gid=%d", pid, user, group); + + if (setgid(group) < 0) + err(EXIT_FAILURE, "(pid=%d): setgid(%d)", pid, group); + if (setuid(user) < 0) + err(EXIT_FAILURE, "(pid=%d): setuid(%d)", pid, user); + + warnx("(pid=%d): Service running ...", pid); + + warnx("(pid=%d): Unshare user namespace", pid); + if (unshare(CLONE_NEWUSER) < 0) + err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)"); + + char *const argv[] = { "service", NULL }; + char *const envp[] = { "I_AM_SERVICE=1", NULL }; + + warnx("(pid=%d): Executing real service ...", pid); + + execve(service_prog, argv, envp); + err(EXIT_FAILURE, "(pid=%d): execve", pid); +} + +int main(int argc, char **argv) +{ + size_t i; + pid_t child[NR_CHILDS]; + int wstatus[NR_CHILDS]; + int childs = NR_CHILDS; + pid_t pid; + + if (getenv("I_AM_SERVICE")) { + pause(); + exit(EXIT_SUCCESS); + } + + service_prog = argv[0]; + pid = getpid(); + + warnx("(pid=%d) Starting testcase", pid); + + /* + * This rlimit is not a problem for root because it can be exceeded. + */ + setrlimit_nproc(1); + + for (i = 0; i < NR_CHILDS; i++) { + child[i] = fork_child(); + wstatus[i] = 0; + usleep(250000); + } + + while (1) { + for (i = 0; i < NR_CHILDS; i++) { + if (child[i] <= 0) + continue; + + errno = 0; + pid_t ret = waitpid(child[i], &wstatus[i], WNOHANG); + + if (!ret || (!WIFEXITED(wstatus[i]) && !WIFSIGNALED(wstatus[i]))) + continue; + + if (ret < 0 && errno != ECHILD) + warn("(pid=%d): waitpid(%d)", pid, child[i]); + + child[i] *= -1; + childs -= 1; + } + + if (!childs) + break; + + usleep(250000); + + for (i = 0; i < NR_CHILDS; i++) { + if (child[i] <= 0) + continue; + kill(child[i], SIGUSR1); + } + } + + for (i = 0; i < NR_CHILDS; i++) { + if (WIFEXITED(wstatus[i])) + warnx("(pid=%d): pid %d exited, status=%d", + pid, -child[i], WEXITSTATUS(wstatus[i])); + else if (WIFSIGNALED(wstatus[i])) + warnx("(pid=%d): pid %d killed by signal %d", + pid, -child[i], WTERMSIG(wstatus[i])); + + if (WIFSIGNALED(wstatus[i]) && WTERMSIG(wstatus[i]) == SIGUSR1) + continue; + + warnx("(pid=%d): Test failed", pid); + exit(EXIT_FAILURE); + } + + warnx("(pid=%d): Test passed", pid); + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 0c4d50644c13..4b809c93ba36 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -152,7 +152,7 @@ static void write_policies(void) fd = open(add_whitelist_policy_file, O_WRONLY); if (fd < 0) - die("cant open add_whitelist_policy file\n"); + die("can't open add_whitelist_policy file\n"); written = write(fd, policy_str, strlen(policy_str)); if (written != strlen(policy_str)) { if (written >= 0) { diff --git a/tools/testing/selftests/sched/.gitignore b/tools/testing/selftests/sched/.gitignore new file mode 100644 index 000000000000..6996d4654d92 --- /dev/null +++ b/tools/testing/selftests/sched/.gitignore @@ -0,0 +1 @@ +cs_prctl_test diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile new file mode 100644 index 000000000000..10c72f14fea9 --- /dev/null +++ b/tools/testing/selftests/sched/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0+ + +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) +CLANG_FLAGS += -no-integrated-as +endif + +CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -Wl,-rpath=./ \ + $(CLANG_FLAGS) +LDLIBS += -lpthread + +TEST_GEN_FILES := cs_prctl_test +TEST_PROGS := cs_prctl_test + +include ../lib.mk diff --git a/tools/testing/selftests/sched/config b/tools/testing/selftests/sched/config new file mode 100644 index 000000000000..e8b09aa7c0c4 --- /dev/null +++ b/tools/testing/selftests/sched/config @@ -0,0 +1 @@ +CONFIG_SCHED_DEBUG=y diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c new file mode 100644 index 000000000000..7db9cf822dc7 --- /dev/null +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Use the core scheduling prctl() to test core scheduling cookies control. + * + * Copyright (c) 2021 Oracle and/or its affiliates. + * Author: Chris Hyser <chris.hyser@oracle.com> + * + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License as + * published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, see <http://www.gnu.org/licenses>. + */ + +#define _GNU_SOURCE +#include <sys/eventfd.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sched.h> +#include <sys/prctl.h> +#include <unistd.h> +#include <time.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if __GLIBC_PREREQ(2, 30) == 0 +#include <sys/syscall.h> +static pid_t gettid(void) +{ + return syscall(SYS_gettid); +} +#endif + +#ifndef PR_SCHED_CORE +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 +#endif + +#define MAX_PROCESSES 128 +#define MAX_THREADS 128 + +static const char USAGE[] = "cs_prctl_test [options]\n" +" options:\n" +" -P : number of processes to create.\n" +" -T : number of threads per process to create.\n" +" -d : delay time to keep tasks alive.\n" +" -k : keep tasks alive until keypress.\n"; + +enum pid_type {PIDTYPE_PID = 0, PIDTYPE_TGID, PIDTYPE_PGID}; + +const int THREAD_CLONE_FLAGS = CLONE_THREAD | CLONE_SIGHAND | CLONE_FS | CLONE_VM | CLONE_FILES; + +static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5) +{ + int res; + + res = prctl(option, arg2, arg3, arg4, arg5); + printf("%d = prctl(%d, %ld, %ld, %ld, %lx)\n", res, option, (long)arg2, (long)arg3, + (long)arg4, arg5); + return res; +} + +#define STACK_SIZE (1024 * 1024) + +#define handle_error(msg) __handle_error(__FILE__, __LINE__, msg) +static void __handle_error(char *fn, int ln, char *msg) +{ + printf("(%s:%d) - ", fn, ln); + perror(msg); + exit(EXIT_FAILURE); +} + +static void handle_usage(int rc, char *msg) +{ + puts(USAGE); + puts(msg); + putchar('\n'); + exit(rc); +} + +static unsigned long get_cs_cookie(int pid) +{ + unsigned long long cookie; + int ret; + + ret = prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, pid, PIDTYPE_PID, + (unsigned long)&cookie); + if (ret) { + printf("Not a core sched system\n"); + return -1UL; + } + + return cookie; +} + +struct child_args { + int num_threads; + int pfd[2]; + int cpid; + int thr_tids[MAX_THREADS]; +}; + +static int child_func_thread(void __attribute__((unused))*arg) +{ + while (1) + usleep(20000); + return 0; +} + +static void create_threads(int num_threads, int thr_tids[]) +{ + void *child_stack; + pid_t tid; + int i; + + for (i = 0; i < num_threads; ++i) { + child_stack = malloc(STACK_SIZE); + if (!child_stack) + handle_error("child stack allocate"); + + tid = clone(child_func_thread, child_stack + STACK_SIZE, THREAD_CLONE_FLAGS, NULL); + if (tid == -1) + handle_error("clone thread"); + thr_tids[i] = tid; + } +} + +static int child_func_process(void *arg) +{ + struct child_args *ca = (struct child_args *)arg; + + close(ca->pfd[0]); + + create_threads(ca->num_threads, ca->thr_tids); + + write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads); + close(ca->pfd[1]); + + while (1) + usleep(20000); + return 0; +} + +static unsigned char child_func_process_stack[STACK_SIZE]; + +void create_processes(int num_processes, int num_threads, struct child_args proc[]) +{ + pid_t cpid; + int i; + + for (i = 0; i < num_processes; ++i) { + proc[i].num_threads = num_threads; + + if (pipe(proc[i].pfd) == -1) + handle_error("pipe() failed"); + + cpid = clone(child_func_process, child_func_process_stack + STACK_SIZE, + SIGCHLD, &proc[i]); + proc[i].cpid = cpid; + close(proc[i].pfd[1]); + } + + for (i = 0; i < num_processes; ++i) { + read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads); + close(proc[i].pfd[0]); + } +} + +void disp_processes(int num_processes, struct child_args proc[]) +{ + int i, j; + + printf("tid=%d, / tgid=%d / pgid=%d: %lx\n", gettid(), getpid(), getpgid(0), + get_cs_cookie(getpid())); + + for (i = 0; i < num_processes; ++i) { + printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].cpid, proc[i].cpid, + getpgid(proc[i].cpid), get_cs_cookie(proc[i].cpid)); + for (j = 0; j < proc[i].num_threads; ++j) { + printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].thr_tids[j], + proc[i].cpid, getpgid(0), get_cs_cookie(proc[i].thr_tids[j])); + } + } + puts("\n"); +} + +static int errors; + +#define validate(v) _validate(__LINE__, v, #v) +void _validate(int line, int val, char *msg) +{ + if (!val) { + ++errors; + printf("(%d) FAILED: %s\n", line, msg); + } else { + printf("(%d) PASSED: %s\n", line, msg); + } +} + +int main(int argc, char *argv[]) +{ + struct child_args procs[MAX_PROCESSES]; + + int keypress = 0; + int num_processes = 2; + int num_threads = 3; + int delay = 0; + int res = 0; + int pidx; + int pid; + int opt; + + while ((opt = getopt(argc, argv, ":hkT:P:d:")) != -1) { + switch (opt) { + case 'P': + num_processes = (int)strtol(optarg, NULL, 10); + break; + case 'T': + num_threads = (int)strtoul(optarg, NULL, 10); + break; + case 'd': + delay = (int)strtol(optarg, NULL, 10); + break; + case 'k': + keypress = 1; + break; + case 'h': + printf(USAGE); + exit(EXIT_SUCCESS); + default: + handle_usage(20, "unknown option"); + } + } + + if (num_processes < 1 || num_processes > MAX_PROCESSES) + handle_usage(1, "Bad processes value"); + + if (num_threads < 1 || num_threads > MAX_THREADS) + handle_usage(2, "Bad thread value"); + + if (keypress) + delay = -1; + + srand(time(NULL)); + + /* put into separate process group */ + if (setpgid(0, 0) != 0) + handle_error("process group"); + + printf("\n## Create a thread/process/process group hiearchy\n"); + create_processes(num_processes, num_threads, procs); + disp_processes(num_processes, procs); + validate(get_cs_cookie(0) == 0); + + printf("\n## Set a cookie on entire process group\n"); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PGID, 0) < 0) + handle_error("core_sched create failed -- PGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) != 0); + + /* get a random process pid */ + pidx = rand() % num_processes; + pid = procs[pidx].cpid; + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Set a new cookie on entire process/TGID [%d]\n", pid); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, pid, PIDTYPE_TGID, 0) < 0) + handle_error("core_sched create failed -- TGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) != get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy the cookie of current/PGID[%d], to pid [%d] as PIDTYPE_PID\n", + getpid(), pid); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, pid, PIDTYPE_PID, 0) < 0) + handle_error("core_sched share to itself failed -- PID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy cookie from a thread [%d] to current/PGID [%d] as PIDTYPE_PID\n", + procs[pidx].thr_tids[0], getpid()); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, procs[pidx].thr_tids[0], + PIDTYPE_PID, 0) < 0) + handle_error("core_sched share from thread failed -- PID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0])); + validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy cookie from current [%d] to current as pidtype PGID\n", getpid()); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 0, PIDTYPE_PGID, 0) < 0) + handle_error("core_sched share to self failed -- PGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0])); + + if (errors) { + printf("TESTS FAILED. errors: %d\n", errors); + res = 10; + } else { + printf("SUCCESS !!!\n"); + } + + if (keypress) + getchar(); + else + sleep(delay); + + for (pidx = 0; pidx < num_processes; ++pidx) + kill(procs[pidx].cpid, 15); + + return res; +} diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index fcc806585266..6e5102a7d7c9 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -143,9 +143,15 @@ int main(int argc, char *argv[]) unsigned long long native, filter1, filter2, bitmap1, bitmap2; unsigned long long entry, per_filter1, per_filter2; + setbuf(stdout, NULL); + + printf("Running on:\n"); + system("uname -a"); + printf("Current BPF sysctl settings:\n"); - system("sysctl net.core.bpf_jit_enable"); - system("sysctl net.core.bpf_jit_harden"); + /* Avoid using "sysctl" which may not be installed. */ + system("grep -H . /proc/sys/net/core/bpf_jit_enable"); + system("grep -H . /proc/sys/net/core/bpf_jit_harden"); if (argc > 1) samples = strtoull(argv[1], NULL, 0); diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e3d5c77a8612..1d64891e6492 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -235,6 +235,10 @@ struct seccomp_notif_addfd { }; #endif +#ifndef SECCOMP_ADDFD_FLAG_SEND +#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ +#endif + struct seccomp_notif_addfd_small { __u64 id; char weird[4]; @@ -3959,7 +3963,7 @@ TEST(user_notification_addfd) { pid_t pid; long ret; - int status, listener, memfd, fd; + int status, listener, memfd, fd, nextfd; struct seccomp_notif_addfd addfd = {}; struct seccomp_notif_addfd_small small = {}; struct seccomp_notif_addfd_big big = {}; @@ -3968,25 +3972,34 @@ TEST(user_notification_addfd) /* 100 ms */ struct timespec delay = { .tv_nsec = 100000000 }; + /* There may be arbitrary already-open fds at test start. */ memfd = memfd_create("test", 0); ASSERT_GE(memfd, 0); + nextfd = memfd + 1; ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } + /* fd: 4 */ /* Check that the basic notification machinery works */ listener = user_notif_syscall(__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER); - ASSERT_GE(listener, 0); + ASSERT_EQ(listener, nextfd++); pid = fork(); ASSERT_GE(pid, 0); if (pid == 0) { + /* fds will be added and this value is expected */ if (syscall(__NR_getppid) != USER_NOTIF_MAGIC) exit(1); + + /* Atomic addfd+send is received here. Check it is a valid fd */ + if (fcntl(syscall(__NR_getppid), F_GETFD) == -1) + exit(1); + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); } @@ -4028,14 +4041,14 @@ TEST(user_notification_addfd) /* Verify we can set an arbitrary remote fd */ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* Verify we can set an arbitrary remote fd with large size */ memset(&big, 0x0, sizeof(big)); big.addfd = addfd; fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); /* Verify we can set a specific remote fd */ addfd.newfd = 42; @@ -4065,6 +4078,32 @@ TEST(user_notification_addfd) ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); ASSERT_EQ(addfd.id, req.id); + /* Verify we can do an atomic addfd and send */ + addfd.newfd = 0; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + /* + * Child has earlier "low" fds and now 42, so we expect the next + * lowest available fd to be assigned here. + */ + EXPECT_EQ(fd, nextfd++); + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); + + /* + * This sets the ID of the ADD FD to the last request plus 1. The + * notification ID increments 1 per notification. + */ + addfd.id = req.id + 1; + + /* This spins until the underlying notification is generated */ + while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 && + errno != -EINPROGRESS) + nanosleep(&delay, NULL); + + memset(&req, 0, sizeof(req)); + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + ASSERT_EQ(addfd.id, req.id); + resp.id = req.id; resp.error = 0; resp.val = USER_NOTIF_MAGIC; @@ -4125,6 +4164,10 @@ TEST(user_notification_addfd_rlimit) EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); EXPECT_EQ(errno, EMFILE); + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EMFILE); + addfd.newfd = 100; addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); diff --git a/tools/testing/selftests/sgx/call.S b/tools/testing/selftests/sgx/call.S index 4ecadc7490f4..b09a25890f3b 100644 --- a/tools/testing/selftests/sgx/call.S +++ b/tools/testing/selftests/sgx/call.S @@ -5,8 +5,8 @@ .text - .global sgx_call_vdso -sgx_call_vdso: + .global sgx_enter_enclave +sgx_enter_enclave: .cfi_startproc push %r15 .cfi_adjust_cfa_offset 8 @@ -27,7 +27,7 @@ sgx_call_vdso: .cfi_adjust_cfa_offset 8 push 0x38(%rsp) .cfi_adjust_cfa_offset 8 - call *eenter(%rip) + call *vdso_sgx_enter_enclave(%rip) add $0x10, %rsp .cfi_adjust_cfa_offset -0x10 pop %rbx diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 0bd73428d2f3..f88562afcaa0 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -18,4 +18,14 @@ #include "../../../../arch/x86/include/asm/enclu.h" #include "../../../../arch/x86/include/uapi/asm/sgx.h" +enum encl_op_type { + ENCL_OP_PUT, + ENCL_OP_GET, +}; + +struct encl_op { + uint64_t type; + uint64_t buffer; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index f441ac34b4d4..3ebe5d1fe337 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -150,16 +150,6 @@ bool encl_load(const char *path, struct encl *encl) goto err; } - /* - * This just checks if the /dev file has these permission - * bits set. It does not check that the current user is - * the owner or in the owning group. - */ - if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { - fprintf(stderr, "no execute permissions on device file %s\n", device_path); - goto err; - } - ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); if (ptr == (void *)-1) { perror("mmap for read"); @@ -169,13 +159,13 @@ bool encl_load(const char *path, struct encl *encl) #define ERR_MSG \ "mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \ -" Check that current user has execute permissions on %s and \n" \ -" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \ +" Check that /dev does not have noexec set:\n" \ +" \tmount | grep \"/dev .*noexec\"\n" \ " If so, remount it executable: mount -o remount,exec /dev\n\n" ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0); if (ptr == (void *)-1) { - fprintf(stderr, ERR_MSG, device_path); + fprintf(stderr, ERR_MSG); goto err; } munmap(ptr, PAGE_SIZE); @@ -239,9 +229,6 @@ bool encl_load(const char *path, struct encl *encl) seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; - printf("0x%016lx 0x%016lx 0x%02x\n", seg->offset, seg->size, - seg->prot); - j++; } diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index d304a4044eb9..e252015e0c15 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -17,11 +17,11 @@ #include <sys/types.h> #include <sys/auxv.h> #include "defines.h" +#include "../kselftest_harness.h" #include "main.h" -#include "../kselftest.h" static const uint64_t MAGIC = 0x1122334455667788ULL; -vdso_sgx_enter_enclave_t eenter; +vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; struct vdso_symtab { Elf64_Sym *elf_symtab; @@ -107,85 +107,51 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) return NULL; } -bool report_results(struct sgx_enclave_run *run, int ret, uint64_t result, - const char *test) -{ - bool valid = true; - - if (ret) { - printf("FAIL: %s() returned: %d\n", test, ret); - valid = false; - } - - if (run->function != EEXIT) { - printf("FAIL: %s() function, expected: %u, got: %u\n", test, EEXIT, - run->function); - valid = false; - } - - if (result != MAGIC) { - printf("FAIL: %s(), expected: 0x%lx, got: 0x%lx\n", test, MAGIC, - result); - valid = false; - } - - if (run->user_data) { - printf("FAIL: %s() user data, expected: 0x0, got: 0x%llx\n", - test, run->user_data); - valid = false; - } - - return valid; -} - -static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, - struct sgx_enclave_run *run) -{ - run->user_data = 0; - return 0; -} +FIXTURE(enclave) { + struct encl encl; + struct sgx_enclave_run run; +}; -int main(int argc, char *argv[]) +FIXTURE_SETUP(enclave) { - struct sgx_enclave_run run; + Elf64_Sym *sgx_enter_enclave_sym = NULL; struct vdso_symtab symtab; - Elf64_Sym *eenter_sym; - uint64_t result = 0; - struct encl encl; + struct encl_segment *seg; + char maps_line[256]; + FILE *maps_file; unsigned int i; void *addr; - int ret; - memset(&run, 0, sizeof(run)); - - if (!encl_load("test_encl.elf", &encl)) { - encl_delete(&encl); + if (!encl_load("test_encl.elf", &self->encl)) { + encl_delete(&self->encl); ksft_exit_skip("cannot load enclaves\n"); } - if (!encl_measure(&encl)) + for (i = 0; i < self->encl.nr_segments; i++) { + seg = &self->encl.segment_tbl[i]; + + TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot); + } + + if (!encl_measure(&self->encl)) goto err; - if (!encl_build(&encl)) + if (!encl_build(&self->encl)) goto err; /* * An enclave consumer only must do this. */ - for (i = 0; i < encl.nr_segments; i++) { - struct encl_segment *seg = &encl.segment_tbl[i]; - - addr = mmap((void *)encl.encl_base + seg->offset, seg->size, - seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); - if (addr == MAP_FAILED) { - perror("mmap() segment failed"); - exit(KSFT_FAIL); - } + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + addr = mmap((void *)self->encl.encl_base + seg->offset, seg->size, + seg->prot, MAP_SHARED | MAP_FIXED, self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + if (addr == MAP_FAILED) + goto err; } - memset(&run, 0, sizeof(run)); - run.tcs = encl.encl_base; - /* Get vDSO base address */ addr = (void *)getauxval(AT_SYSINFO_EHDR); if (!addr) @@ -194,37 +160,134 @@ int main(int argc, char *argv[]) if (!vdso_get_symtab(addr, &symtab)) goto err; - eenter_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); - if (!eenter_sym) + sgx_enter_enclave_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); + if (!sgx_enter_enclave_sym) goto err; - eenter = addr + eenter_sym->st_value; - - ret = sgx_call_vdso((void *)&MAGIC, &result, 0, EENTER, NULL, NULL, &run); - if (!report_results(&run, ret, result, "sgx_call_vdso")) - goto err; + vdso_sgx_enter_enclave = addr + sgx_enter_enclave_sym->st_value; + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; - /* Invoke the vDSO directly. */ - result = 0; - ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, - 0, 0, &run); - if (!report_results(&run, ret, result, "eenter")) - goto err; + maps_file = fopen("/proc/self/maps", "r"); + if (maps_file != NULL) { + while (fgets(maps_line, sizeof(maps_line), maps_file) != NULL) { + maps_line[strlen(maps_line) - 1] = '\0'; - /* And with an exit handler. */ - run.user_handler = (__u64)user_handler; - run.user_data = 0xdeadbeef; - ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, - 0, 0, &run); - if (!report_results(&run, ret, result, "user_handler")) - goto err; + if (strstr(maps_line, "/dev/sgx_enclave")) + TH_LOG("%s", maps_line); + } - printf("SUCCESS\n"); - encl_delete(&encl); - exit(KSFT_PASS); + fclose(maps_file); + } err: - encl_delete(&encl); - exit(KSFT_FAIL); + if (!sgx_enter_enclave_sym) + encl_delete(&self->encl); + + ASSERT_NE(sgx_enter_enclave_sym, NULL); +} + +FIXTURE_TEARDOWN(enclave) +{ + encl_delete(&self->encl); +} + +#define ENCL_CALL(op, run, clobbered) \ + ({ \ + int ret; \ + if ((clobbered)) \ + ret = vdso_sgx_enter_enclave((unsigned long)(op), 0, 0, \ + EENTER, 0, 0, (run)); \ + else \ + ret = sgx_enter_enclave((void *)(op), NULL, 0, EENTER, NULL, NULL, \ + (run)); \ + ret; \ + }) + +#define EXPECT_EEXIT(run) \ + do { \ + EXPECT_EQ((run)->function, EEXIT); \ + if ((run)->function != EEXIT) \ + TH_LOG("0x%02x 0x%02x 0x%016llx", (run)->exception_vector, \ + (run)->exception_error_code, (run)->exception_addr); \ + } while (0) + +TEST_F(enclave, unclobbered_vdso) +{ + struct encl_op op; + + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + + EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); +} + +TEST_F(enclave, clobbered_vdso) +{ + struct encl_op op; + + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); } + +static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, + struct sgx_enclave_run *run) +{ + run->user_data = 0; + + return 0; +} + +TEST_F(enclave, clobbered_vdso_and_user_function) +{ + struct encl_op op; + + self->run.user_handler = (__u64)test_handler; + self->run.user_data = 0xdeadbeef; + + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index 67211a708f04..68672fd86cf9 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -35,7 +35,7 @@ bool encl_load(const char *path, struct encl *encl); bool encl_measure(struct encl *encl); bool encl_build(struct encl *encl); -int sgx_call_vdso(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, - struct sgx_enclave_run *run); +int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, + struct sgx_enclave_run *run); #endif /* MAIN_H */ diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index dee7a3d6c5a5..92bbc5a15c39 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -55,10 +55,27 @@ static bool alloc_q1q2_ctx(const uint8_t *s, const uint8_t *m, return true; } +static void reverse_bytes(void *data, int length) +{ + int i = 0; + int j = length - 1; + uint8_t temp; + uint8_t *ptr = data; + + while (i < j) { + temp = ptr[i]; + ptr[i] = ptr[j]; + ptr[j] = temp; + i++; + j--; + } +} + static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, uint8_t *q2) { struct q1q2_ctx ctx; + int len; if (!alloc_q1q2_ctx(s, m, &ctx)) { fprintf(stderr, "Not enough memory for Q1Q2 calculation\n"); @@ -89,8 +106,10 @@ static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, goto out; } - BN_bn2bin(ctx.q1, q1); - BN_bn2bin(ctx.q2, q2); + len = BN_bn2bin(ctx.q1, q1); + reverse_bytes(q1, len); + len = BN_bn2bin(ctx.q2, q2); + reverse_bytes(q2, len); free_q1q2_ctx(&ctx); return true; @@ -152,22 +171,6 @@ static RSA *gen_sign_key(void) return key; } -static void reverse_bytes(void *data, int length) -{ - int i = 0; - int j = length - 1; - uint8_t temp; - uint8_t *ptr = data; - - while (i < j) { - temp = ptr[i]; - ptr[i] = ptr[j]; - ptr[j] = temp; - i++; - j--; - } -} - enum mrtags { MRECREATE = 0x0045544145524345, MREADD = 0x0000000044444145, @@ -367,8 +370,6 @@ bool encl_measure(struct encl *encl) /* BE -> LE */ reverse_bytes(sigstruct->signature, SGX_MODULUS_SIZE); reverse_bytes(sigstruct->modulus, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q1, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q2, SGX_MODULUS_SIZE); EVP_MD_CTX_destroy(ctx); RSA_free(key); diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index cf25b5dc1e03..734ea52f9924 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -4,6 +4,8 @@ #include <stddef.h> #include "defines.h" +static uint8_t encl_buffer[8192] = { 1 }; + static void *memcpy(void *dest, const void *src, size_t n) { size_t i; @@ -14,7 +16,20 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } -void encl_body(void *rdi, void *rsi) +void encl_body(void *rdi, void *rsi) { - memcpy(rsi, rdi, 8); + struct encl_op *op = (struct encl_op *)rdi; + + switch (op->type) { + case ENCL_OP_PUT: + memcpy(&encl_buffer[0], &op->buffer, 8); + break; + + case ENCL_OP_GET: + memcpy(&op->buffer, &encl_buffer[0], 8); + break; + + default: + break; + } } diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index 0fbbda7e665e..a1ec64f7d91f 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -18,9 +18,10 @@ SECTIONS .text : { *(.text*) *(.rodata*) + FILL(0xDEADBEEF); + . = ALIGN(4096); } : text - . = ALIGN(4096); .data : { *(.data*) } : data diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index 8934a3766d20..c53b070755b6 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -17,6 +17,7 @@ #include <string.h> #include <assert.h> #include <errno.h> +#include <sys/auxv.h> #include "../kselftest.h" @@ -24,6 +25,11 @@ #define SS_AUTODISARM (1U << 31) #endif +#ifndef AT_MINSIGSTKSZ +#define AT_MINSIGSTKSZ 51 +#endif + +static unsigned int stack_size; static void *sstack, *ustack; static ucontext_t uc, sc; static const char *msg = "[OK]\tStack preserved"; @@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u) #endif if (sp < (unsigned long)sstack || - sp >= (unsigned long)sstack + SIGSTKSZ) { + sp >= (unsigned long)sstack + stack_size) { ksft_exit_fail_msg("SP is not on sigaltstack\n"); } /* put some data on stack. other sighandler will try to overwrite it */ @@ -108,6 +114,10 @@ int main(void) stack_t stk; int err; + /* Make sure more than the required minimum. */ + stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ; + ksft_print_msg("[NOTE]\tthe stack size is %lu\n", stack_size); + ksft_print_header(); ksft_set_plan(3); @@ -117,7 +127,7 @@ int main(void) sigaction(SIGUSR1, &act, NULL); act.sa_sigaction = my_usr2; sigaction(SIGUSR2, &act, NULL); - sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); if (sstack == MAP_FAILED) { ksft_exit_fail_msg("mmap() - %s\n", strerror(errno)); @@ -139,7 +149,7 @@ int main(void) } stk.ss_sp = sstack; - stk.ss_size = SIGSTKSZ; + stk.ss_size = stack_size; stk.ss_flags = SS_ONSTACK | SS_AUTODISARM; err = sigaltstack(&stk, NULL); if (err) { @@ -161,7 +171,7 @@ int main(void) } } - ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); if (ustack == MAP_FAILED) { ksft_exit_fail_msg("mmap() - %s\n", strerror(errno)); @@ -170,7 +180,7 @@ int main(void) getcontext(&uc); uc.uc_link = NULL; uc.uc_stack.ss_sp = ustack; - uc.uc_stack.ss_size = SIGSTKSZ; + uc.uc_stack.ss_size = stack_size; makecontext(&uc, switch_fn, 0); raise(SIGUSR1); diff --git a/tools/testing/selftests/splice/short_splice_read.sh b/tools/testing/selftests/splice/short_splice_read.sh index 7810d3589d9a..22b6c8910b18 100755 --- a/tools/testing/selftests/splice/short_splice_read.sh +++ b/tools/testing/selftests/splice/short_splice_read.sh @@ -1,21 +1,87 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# +# Test for mishandling of splice() on pseudofilesystems, which should catch +# bugs like 11990a5bd7e5 ("module: Correctly truncate sysfs sections output") +# +# Since splice fallback was removed as part of the set_fs() rework, many of these +# tests expect to fail now. See https://lore.kernel.org/lkml/202009181443.C2179FB@keescook/ set -e +DIR=$(dirname "$0") + ret=0 +expect_success() +{ + title="$1" + shift + + echo "" >&2 + echo "$title ..." >&2 + + set +e + "$@" + rc=$? + set -e + + case "$rc" in + 0) + echo "ok: $title succeeded" >&2 + ;; + 1) + echo "FAIL: $title should work" >&2 + ret=$(( ret + 1 )) + ;; + *) + echo "FAIL: something else went wrong" >&2 + ret=$(( ret + 1 )) + ;; + esac +} + +expect_failure() +{ + title="$1" + shift + + echo "" >&2 + echo "$title ..." >&2 + + set +e + "$@" + rc=$? + set -e + + case "$rc" in + 0) + echo "FAIL: $title unexpectedly worked" >&2 + ret=$(( ret + 1 )) + ;; + 1) + echo "ok: $title correctly failed" >&2 + ;; + *) + echo "FAIL: something else went wrong" >&2 + ret=$(( ret + 1 )) + ;; + esac +} + do_splice() { filename="$1" bytes="$2" expected="$3" + report="$4" - out=$(./splice_read "$filename" "$bytes" | cat) + out=$("$DIR"/splice_read "$filename" "$bytes" | cat) if [ "$out" = "$expected" ] ; then - echo "ok: $filename $bytes" + echo " matched $report" >&2 + return 0 else - echo "FAIL: $filename $bytes" - ret=1 + echo " no match: '$out' vs $report" >&2 + return 1 fi } @@ -23,34 +89,45 @@ test_splice() { filename="$1" + echo " checking $filename ..." >&2 + full=$(cat "$filename") + rc=$? + if [ $rc -ne 0 ] ; then + return 2 + fi + two=$(echo "$full" | grep -m1 . | cut -c-2) # Make sure full splice has the same contents as a standard read. - do_splice "$filename" 4096 "$full" + echo " splicing 4096 bytes ..." >&2 + if ! do_splice "$filename" 4096 "$full" "full read" ; then + return 1 + fi # Make sure a partial splice see the first two characters. - do_splice "$filename" 2 "$two" + echo " splicing 2 bytes ..." >&2 + if ! do_splice "$filename" 2 "$two" "'$two'" ; then + return 1 + fi + + return 0 } -# proc_single_open(), seq_read() -test_splice /proc/$$/limits -# special open, seq_read() -test_splice /proc/$$/comm +### /proc/$pid/ has no splice interface; these should all fail. +expect_failure "proc_single_open(), seq_read() splice" test_splice /proc/$$/limits +expect_failure "special open(), seq_read() splice" test_splice /proc/$$/comm -# proc_handler, proc_dointvec_minmax -test_splice /proc/sys/fs/nr_open -# proc_handler, proc_dostring -test_splice /proc/sys/kernel/modprobe -# proc_handler, special read -test_splice /proc/sys/kernel/version +### /proc/sys/ has a splice interface; these should all succeed. +expect_success "proc_handler: proc_dointvec_minmax() splice" test_splice /proc/sys/fs/nr_open +expect_success "proc_handler: proc_dostring() splice" test_splice /proc/sys/kernel/modprobe +expect_success "proc_handler: special read splice" test_splice /proc/sys/kernel/version +### /sys/ has no splice interface; these should all fail. if ! [ -d /sys/module/test_module/sections ] ; then - modprobe test_module + expect_success "test_module kernel module load" modprobe test_module fi -# kernfs, attr -test_splice /sys/module/test_module/coresize -# kernfs, binattr -test_splice /sys/module/test_module/sections/.init.text +expect_failure "kernfs attr splice" test_splice /sys/module/test_module/coresize +expect_failure "kernfs binattr splice" test_splice /sys/module/test_module/sections/.init.text exit $ret diff --git a/tools/testing/selftests/sync/config b/tools/testing/selftests/sync/config index 1ab7e8130db2..47ff5afc3727 100644 --- a/tools/testing/selftests/sync/config +++ b/tools/testing/selftests/sync/config @@ -1,4 +1,3 @@ CONFIG_STAGING=y CONFIG_ANDROID=y -CONFIG_SYNC=y CONFIG_SW_SYNC=y diff --git a/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py index 229ee185b27e..254136e3da5a 100644 --- a/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py +++ b/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py @@ -29,22 +29,26 @@ class SubPlugin(TdcPlugin): return # Check for required fields - scapyinfo = self.args.caseinfo['scapy'] - scapy_keys = ['iface', 'count', 'packet'] - missing_keys = [] - keyfail = False - for k in scapy_keys: - if k not in scapyinfo: - keyfail = True - missing_keys.add(k) - if keyfail: - print('{}: Scapy block present in the test, but is missing info:' - .format(self.sub_class)) - print('{}'.format(missing_keys)) - - pkt = eval(scapyinfo['packet']) - if '$' in scapyinfo['iface']: - tpl = Template(scapyinfo['iface']) - scapyinfo['iface'] = tpl.safe_substitute(NAMES) - for count in range(scapyinfo['count']): - sendp(pkt, iface=scapyinfo['iface']) + lscapyinfo = self.args.caseinfo['scapy'] + if type(lscapyinfo) != list: + lscapyinfo = [ lscapyinfo, ] + + for scapyinfo in lscapyinfo: + scapy_keys = ['iface', 'count', 'packet'] + missing_keys = [] + keyfail = False + for k in scapy_keys: + if k not in scapyinfo: + keyfail = True + missing_keys.append(k) + if keyfail: + print('{}: Scapy block present in the test, but is missing info:' + .format(self.sub_class)) + print('{}'.format(missing_keys)) + + pkt = eval(scapyinfo['packet']) + if '$' in scapyinfo['iface']: + tpl = Template(scapyinfo['iface']) + scapyinfo['iface'] = tpl.safe_substitute(NAMES) + for count in range(scapyinfo['count']): + sendp(pkt, iface=scapyinfo['iface']) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json index 4202e95e27b9..bd843ab00a58 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json @@ -406,5 +406,50 @@ "teardown": [ "$TC actions flush action ct" ] + }, + { + "id": "3992", + "name": "Add ct action triggering DNAT tuple conflict", + "category": [ + "actions", + "ct", + "scapy" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + [ + "$TC qdisc del dev $DEV1 ingress", + 0, + 1, + 2, + 255 + ], + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip prio 1 flower ct_state -trk action ct commit nat dst addr 20.0.0.1 port 10 pipe action drop", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.10')/TCP(sport=5000,dport=10)" + }, + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "expExitCode": "0", + "verifyCmd": "cat /proc/net/nf_conntrack", + "matchPattern": "dst=10.0.0.20", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json index 6eb4c4f97060..742f2290973e 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json @@ -417,5 +417,29 @@ "teardown": [ "$TC actions flush action skbmod" ] + }, + { + "id": "fe09", + "name": "Add skbmod action to mark ECN bits", + "category": [ + "actions", + "skbmod" + ], + "setup": [ + [ + "$TC actions flush action skbmod", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbmod ecn", + "expExitCode": "0", + "verifyCmd": "$TC actions get action skbmod index 1", + "matchPattern": "action order [0-9]*: skbmod pipe ecn", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbmod" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json index 41d783254b08..2aad4caa8581 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json @@ -446,6 +446,30 @@ "teardown": [] }, { + "id": "ba5b", + "name": "Add vlan modify action for protocol 802.1Q setting priority 0", + "category": [ + "actions", + "vlan" + ], + "setup": [ + [ + "$TC actions flush action vlan", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1Q id 5 priority 0 index 100", + "expExitCode": "0", + "verifyCmd": "$TC actions get action vlan index 100", + "matchPattern": "action order [0-9]+: vlan.*modify id 100 priority 0 protocol 802.1Q pipe.*index 100 ref", + "matchCount": "0", + "teardown": [ + "$TC actions flush action vlan" + ] + }, + { "id": "6812", "name": "Add vlan modify action for protocol 802.1Q", "category": [ @@ -463,7 +487,7 @@ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1Q id 5 index 100", "expExitCode": "0", "verifyCmd": "$TC actions get action vlan index 100", - "matchPattern": "action order [0-9]+: vlan.*modify id 100 protocol 802.1Q priority 0 pipe.*index 100 ref", + "matchPattern": "action order [0-9]+: vlan.*modify id 100 protocol 802.1Q pipe.*index 100 ref", "matchCount": "0", "teardown": [ "$TC actions flush action vlan" @@ -487,7 +511,7 @@ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1ad id 500 reclassify index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action vlan index 12", - "matchPattern": "action order [0-9]+: vlan.*modify id 500 protocol 802.1ad priority 0 reclassify.*index 12 ref", + "matchPattern": "action order [0-9]+: vlan.*modify id 500 protocol 802.1ad reclassify.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action vlan" diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json new file mode 100644 index 000000000000..88a20c781e49 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json @@ -0,0 +1,137 @@ +[ + { + "id": "ce7d", + "name": "Add mq Qdisc to multi-queue device (4 queues)", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2f82", + "name": "Add mq Qdisc to multi-queue device (256 queues)", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 256\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-9,a-f][0-9,a-f]{0,2} bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "256", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "c525", + "name": "Add duplicate mq Qdisc", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH root handle 1: mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "128a", + "name": "Delete nonexistent mq Qdisc", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "03a9", + "name": "Delete mq Qdisc twice", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH root handle 1: mq", + "$TC qdisc del dev $ETH root handle 1: mq" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "be0f", + "name": "Add mq Qdisc to single-queue device", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + } +] diff --git a/tools/testing/selftests/tc-testing/tdc_config.py b/tools/testing/selftests/tc-testing/tdc_config.py index cd4a27ee1466..ea04f04c173e 100644 --- a/tools/testing/selftests/tc-testing/tdc_config.py +++ b/tools/testing/selftests/tc-testing/tdc_config.py @@ -17,6 +17,7 @@ NAMES = { 'DEV1': 'v0p1', 'DEV2': '', 'DUMMY': 'dummy1', + 'ETH': 'eth0', 'BATCH_FILE': './batch.txt', 'BATCH_DIR': 'tmp', # Length of time in seconds to wait before terminating a command diff --git a/tools/testing/selftests/timers/rtcpie.c b/tools/testing/selftests/timers/rtcpie.c index 47b5bad1b393..4ef2184f1558 100644 --- a/tools/testing/selftests/timers/rtcpie.c +++ b/tools/testing/selftests/timers/rtcpie.c @@ -18,6 +18,8 @@ #include <stdlib.h> #include <errno.h> +#include "../kselftest.h" + /* * This expects the new RTC class driver framework, working with * clocks that will often not be clones of what the PC-AT had. @@ -35,8 +37,14 @@ int main(int argc, char **argv) switch (argc) { case 2: rtc = argv[1]; - /* FALLTHROUGH */ + break; case 1: + fd = open(default_rtc, O_RDONLY); + if (fd == -1) { + printf("Default RTC %s does not exist. Test Skipped!\n", default_rtc); + exit(KSFT_SKIP); + } + close(fd); break; default: fprintf(stderr, "usage: rtctest [rtcdev] [d]\n"); diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 1f651e85ed60..b02eac613fdd 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -12,6 +12,9 @@ mremap_test on-fault-limit transhuge-stress protection_keys +protection_keys_32 +protection_keys_64 +madv_populate userfaultfd mlock-intersect-test mlock-random-test @@ -21,5 +24,7 @@ va_128TBswitch map_fixed_noreplace write_to_hugetlbfs hmm-tests +memfd_secret local_config.* split_huge_page_test +ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 73e1cc96d7c2..d9605bd10f2d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -31,9 +31,11 @@ TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += khugepaged +TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mremap_dontunmap @@ -43,6 +45,7 @@ TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += split_huge_page_test +TEST_GEN_FILES += ksm_tests ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) @@ -100,7 +103,7 @@ $(1) $(1)_64: $(OUTPUT)/$(1)_64 endef ifeq ($(CAN_BUILD_I386),1) -$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): CFLAGS += -m32 -mxsave $(BINARIES_32): LDLIBS += -lrt -ldl -lm $(BINARIES_32): $(OUTPUT)/%_32: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ @@ -108,7 +111,7 @@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif ifeq ($(CAN_BUILD_X86_64),1) -$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): CFLAGS += -m64 -mxsave $(BINARIES_64): LDLIBS += -lrt -ldl $(BINARIES_64): $(OUTPUT)/%_64: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ @@ -134,7 +137,7 @@ warn_32bit_failure: endif endif -$(OUTPUT)/mlock-random-test: LDLIBS += -lcap +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap $(OUTPUT)/gup_test: ../../../../mm/gup_test.h @@ -143,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h # HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + local_config.mk local_config.h: check_config.sh /bin/sh ./check_config.sh $(CC) diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh index 18d33684faad..fe8fcfb334e0 100644 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -1,11 +1,14 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi fault_limit_file=limit_in_bytes diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 1e662d59c502..fe043f67798b 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -6,6 +6,8 @@ #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> +#include <pthread.h> +#include <assert.h> #include "../../../../mm/gup_test.h" #define MB (1UL << 20) @@ -15,6 +17,12 @@ #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ +static unsigned long cmd = GUP_FAST_BENCHMARK; +static int gup_fd, repeats = 1; +static unsigned long size = 128 * MB; +/* Serialize prints */ +static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; + static char *cmd_to_str(unsigned long cmd) { switch (cmd) { @@ -34,17 +42,55 @@ static char *cmd_to_str(unsigned long cmd) return "Unknown command"; } +void *gup_thread(void *data) +{ + struct gup_test gup = *(struct gup_test *)data; + int i; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) + perror("ioctl"), exit(1); + + pthread_mutex_lock(&print_mutex); + printf("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + pthread_mutex_unlock(&print_mutex); + } + } else { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } + + pthread_mutex_lock(&print_mutex); + printf("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + printf("Truncated (size: %lld)\n", gup.size); + pthread_mutex_unlock(&print_mutex); + } + + return NULL; +} + int main(int argc, char **argv) { struct gup_test gup = { 0 }; - unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; - unsigned long cmd = GUP_FAST_BENCHMARK; + int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; + pthread_t *tid; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -74,6 +120,9 @@ int main(int argc, char **argv) /* strtol, so you can pass flags in hex form */ gup.gup_flags = strtol(optarg, 0, 0); break; + case 'j': + nthreads = atoi(optarg); + break; case 'm': size = atoi(optarg) * MB; break; @@ -154,8 +203,8 @@ int main(int argc, char **argv) if (write) gup.gup_flags |= FOLL_WRITE; - fd = open("/sys/kernel/debug/gup_test", O_RDWR); - if (fd == -1) { + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) { perror("open"); exit(1); } @@ -185,32 +234,17 @@ int main(int argc, char **argv) p[0] = 0; } - /* Only report timing information on the *_BENCHMARK commands: */ - if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || - (cmd == PIN_LONGTERM_BENCHMARK)) { - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(fd, cmd, &gup)) - perror("ioctl"), exit(1); - - printf("%s: Time: get:%lld put:%lld us", - cmd_to_str(cmd), gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - } - } else { - gup.size = size; - if (ioctl(fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - printf("%s: done\n", cmd_to_str(cmd)); - if (gup.size != size) - printf("Truncated (size: %lld)\n", gup.size); + tid = malloc(sizeof(pthread_t) * nthreads); + assert(tid); + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&tid[i], NULL, gup_thread, &gup); + assert(ret == 0); + } + for (i = 0; i < nthreads; i++) { + ret = pthread_join(tid[i], NULL); + assert(ret == 0); } + free(tid); return 0; } diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 5d1ac691b9f4..864f126ffd78 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1485,4 +1485,162 @@ TEST_F(hmm2, double_map) hmm_buffer_free(buffer); } +/* + * Basic check of exclusive faulting. + */ +TEST_F(hmm, exclusive) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + /* Check atomic access revoked */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, exclusive_mprotect) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + hmm_buffer_free(buffer); +} + +/* + * Check copy-on-write works. + */ +TEST_F(hmm, exclusive_cow) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + fork(); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + hmm_buffer_free(buffer); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh index d11d1febccc3..4a9a3afe9fd4 100644 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -1,11 +1,14 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi usage_file=usage_in_bytes diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 8b75821302a7..155120b67a16 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -86,7 +86,6 @@ struct settings { enum thp_enabled thp_enabled; enum thp_defrag thp_defrag; enum shmem_enabled shmem_enabled; - bool debug_cow; bool use_zero_page; struct khugepaged_settings khugepaged; }; @@ -95,7 +94,6 @@ static struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, .shmem_enabled = SHMEM_NEVER, - .debug_cow = 0, .use_zero_page = 0, .khugepaged = { .defrag = 1, @@ -268,7 +266,6 @@ static void write_settings(struct settings *settings) write_string("defrag", thp_defrag_strings[settings->thp_defrag]); write_string("shmem_enabled", shmem_enabled_strings[settings->shmem_enabled]); - write_num("debug_cow", settings->debug_cow); write_num("use_zero_page", settings->use_zero_page); write_num("khugepaged/defrag", khugepaged->defrag); @@ -304,7 +301,6 @@ static void save_settings(void) .thp_defrag = read_string("defrag", thp_defrag_strings), .shmem_enabled = read_string("shmem_enabled", shmem_enabled_strings), - .debug_cow = read_num("debug_cow"), .use_zero_page = read_num("use_zero_page"), }; saved_settings.khugepaged = (struct khugepaged_settings) { diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c new file mode 100644 index 000000000000..b61dcdb44c5b --- /dev/null +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/mman.h> +#include <stdbool.h> +#include <time.h> +#include <string.h> +#include <numa.h> + +#include "../kselftest.h" +#include "../../../../include/vdso/time64.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME, + KSM_COW_TIME +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); + + printf("Supported <test type>:\n" + " -M (page merging)\n" + " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); + + printf(" -a: specify the access protections of pages.\n" + " <prot> must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_max_node() < 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + numa1_map_ptr = numa_alloc_onnode(page_size, 0); + numa2_map_ptr = numa_alloc_onnode(page_size, 1); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; + + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; + case 'P': + test_name = KSM_MERGE_TIME; + break; + case 'C': + test_name = KSM_COW_TIME; + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c new file mode 100644 index 000000000000..b959e4ebdad4 --- /dev/null +++ b/tools/testing/selftests/vm/madv_populate.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + * + * Copyright 2021, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> + +#include "../kselftest.h" + +#if defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) + +/* + * For now, we're using 2 MiB of private anonymous memory for all tests. + */ +#define SIZE (2 * 1024 * 1024) + +static size_t pagesize; + +static uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / pagesize; + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +static bool pagemap_is_populated(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* Present or swapped. */ + return entry & 0xc000000000000000ull; +} + +static bool pagemap_is_softdirty(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + return entry & 0x0080000000000000ull; +} + +static void sense_support(void) +{ + char *addr; + int ret; + + addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!addr) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_READ); + if (ret) + ksft_exit_skip("MADV_POPULATE_READ is not available\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); + if (ret) + ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); + + munmap(addr, pagesize); +} + +static void test_prot_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_WRITE with PROT_READ\n"); + + munmap(addr, SIZE); +} + +static void test_prot_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_READ with PROT_WRITE\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); + + munmap(addr, SIZE); +} + +static void test_holes(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ret = munmap(addr + pagesize, pagesize); + if (ret) + ksft_exit_fail_msg("munmap failed\n"); + + /* Hole in the middle */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes in the middle\n"); + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes in the middle\n"); + + /* Hole at end */ + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the end\n"); + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the end\n"); + + /* Hole at beginning */ + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the beginning\n"); + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the beginning\n"); + + munmap(addr, SIZE); +} + +static bool range_is_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_populate_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static void test_populate_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static bool range_is_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void clear_softdirty(void) +{ + int fd = open("/proc/self/clear_refs", O_WRONLY); + const char *ctrl = "4"; + int ret; + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); + close(fd); +} + +static void test_softdirty(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear any softdirty bits. */ + clear_softdirty(); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating READ should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating WRITE should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_softdirty(addr, SIZE), + "range is softdirty\n"); + + munmap(addr, SIZE); +} + +int main(int argc, char **argv) +{ + int err; + + pagesize = getpagesize(); + + ksft_print_header(); + ksft_set_plan(21); + + sense_support(); + test_prot_read(); + test_prot_write(); + test_holes(); + test_populate_read(); + test_populate_write(); + test_softdirty(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} + +#else /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ + +#warning "missing MADV_POPULATE_READ or MADV_POPULATE_WRITE definition" + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_exit_skip("MADV_POPULATE_READ or MADV_POPULATE_WRITE not defined\n"); +} + +#endif /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c new file mode 100644 index 000000000000..93e7e7ffed33 --- /dev/null +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#define _GNU_SOURCE +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/capability.h> + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <stdio.h> + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_exit(!ksft_get_fail_cnt()); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c index ff4d72eb74b9..782ea94dee2f 100644 --- a/tools/testing/selftests/vm/mlock-random-test.c +++ b/tools/testing/selftests/vm/mlock-random-test.c @@ -70,7 +70,7 @@ int get_proc_locked_vm_size(void) } } - perror("cann't parse VmLck in /proc/self/status\n"); + perror("cannot parse VmLck in /proc/self/status\n"); fclose(f); return -1; } diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index 9c391d016922..0624d1bd71b5 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -45,14 +45,15 @@ enum { _4MB = 4ULL << 20, _1GB = 1ULL << 30, _2GB = 2ULL << 30, - PTE = _4KB, PMD = _2MB, PUD = _1GB, }; +#define PTE page_size + #define MAKE_TEST(source_align, destination_align, size, \ overlaps, should_fail, test_name) \ -{ \ +(struct test){ \ .name = test_name, \ .config = { \ .src_alignment = source_align, \ @@ -74,9 +75,10 @@ static void *get_source_mapping(struct config c) retry: addr += c.src_alignment; src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | MAP_SHARED, -1, 0); + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); if (src_addr == MAP_FAILED) { - if (errno == EPERM) + if (errno == EPERM || errno == EEXIST) goto retry; goto error; } @@ -252,12 +254,17 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb, return 0; } +#define MAX_TEST 13 +#define MAX_PERF_TEST 3 int main(int argc, char **argv) { int failures = 0; int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + struct test test_cases[MAX_TEST]; + struct test perf_test_cases[MAX_PERF_TEST]; + int page_size; time_t t; pattern_seed = (unsigned int) time(&t); @@ -268,56 +275,59 @@ int main(int argc, char **argv) ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", threshold_mb, pattern_seed); - struct test test_cases[] = { - /* Expected mremap failures */ - MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE, - "mremap - Source and Destination Regions Overlapping"), - MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Destination Address Misaligned (1KB-aligned)"), - MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Source Address Misaligned (1KB-aligned)"), - - /* Src addr PTE aligned */ - MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS, - "8KB mremap - Source PTE-aligned, Destination PTE-aligned"), - - /* Src addr 1MB aligned */ - MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"), - MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"), - - /* Src addr PMD aligned */ - MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PTE-aligned"), - MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"), - MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PMD-aligned"), - - /* Src addr PUD aligned */ - MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PTE-aligned"), - MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"), - MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PMD-aligned"), - MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PUD-aligned"), - }; - - struct test perf_test_cases[] = { - /* - * mremap 1GB region - Page table level aligned time - * comparison. - */ - MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PTE-aligned, Destination PTE-aligned"), - MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PMD-aligned, Destination PMD-aligned"), - MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PUD-aligned, Destination PUD-aligned"), - }; + page_size = sysconf(_SC_PAGESIZE); + + /* Expected mremap failures */ + test_cases[0] = MAKE_TEST(page_size, page_size, page_size, + OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"); + + test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"); + test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"); + + /* Src addr PTE aligned */ + test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, + NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); + + /* Src addr 1MB aligned */ + test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); + test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src addr PMD aligned */ + test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); + test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); + test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); + + /* Src addr PUD aligned */ + test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); + test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"); + test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); + test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); + perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || (threshold_mb * _1MB >= _1GB); diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index 3be20f5d5275..e4a4ce2b826d 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -126,6 +126,7 @@ static inline u32 pkey_bit_position(int pkey) #define XSTATE_PKEY_BIT (9) #define XSTATE_PKEY 0x200 +#define XSTATE_BV_OFFSET 512 int pkey_reg_xstate_offset(void) { diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index fdbb602ecf32..2d0ae88665db 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -510,7 +510,7 @@ int alloc_pkey(void) " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - if (ret) { + if (ret > 0) { /* clear both the bits: */ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, ~PKEY_MASK); @@ -561,7 +561,6 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); - srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; @@ -1278,6 +1277,78 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) } } +void arch_force_pkey_reg_init(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u64 *buf; + + /* + * All keys should be allocated and set to allow reads and + * writes, so the register should be all 0. If not, just + * skip the test. + */ + if (read_pkey_reg()) + return; + + /* + * Just allocate an absurd about of memory rather than + * doing the XSAVE size enumeration dance. + */ + buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + + /* These __builtins require compiling with -mxsave */ + + /* XSAVE to build a valid buffer: */ + __builtin_ia32_xsave(buf, XSTATE_PKEY); + /* Clear XSTATE_BV[PKRU]: */ + buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; + /* XRSTOR will likely get PKRU back to the init state: */ + __builtin_ia32_xrstor(buf, XSTATE_PKEY); + + munmap(buf, 1*MB); +#endif +} + + +/* + * This is mostly useless on ppc for now. But it will not + * hurt anything and should give some better coverage as + * a long-running test that continually checks the pkey + * register. + */ +void test_pkey_init_state(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS; i++) { + int new_pkey = alloc_pkey(); + + if (new_pkey < 0) + continue; + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + arch_force_pkey_reg_init(); + + /* + * Loop for a bit, hoping to get exercise the kernel + * context switch code. + */ + for (i = 0; i < 1000000; i++) + read_pkey_reg(); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + /* * pkey 0 is special. It is allocated by default, so you do not * have to call pkey_alloc() to use it first. Make sure that it @@ -1449,6 +1520,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); + /* + * Reset the shadow, assuming that the above mprotect() + * correctly changed PKRU, but to an unknown value since + * the actual alllocated pkey is unknown. + */ + shadow_pkey_reg = __read_pkey_reg(); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ @@ -1502,6 +1580,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_implicit_mprotect_exec_only_memory, test_mprotect_with_pkey_0, test_ptrace_of_child, + test_pkey_init_state, test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, @@ -1552,6 +1631,8 @@ int main(void) int nr_iterations = 22; int pkeys_supported = is_pkeys_supported(); + srand((unsigned int)time(NULL)); + setup_handlers(); printf("has pkeys: %d\n", pkeys_supported); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index e953f3cd9664..45e803af7c77 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -346,4 +346,133 @@ else exitcode=1 fi +echo "--------------------------------------------------------" +echo "running MADV_POPULATE_READ and MADV_POPULATE_WRITE tests" +echo "--------------------------------------------------------" +./madv_populate +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "running memfd_secret test" +echo "------------------------------------" +./memfd_secret +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "-------------------------------------------------------" +echo "running KSM MADV_MERGEABLE test with 10 identical pages" +echo "-------------------------------------------------------" +./ksm_tests -M -p 10 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "------------------------" +echo "running KSM unmerge test" +echo "------------------------" +./ksm_tests -U +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 0" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 1" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +exit $exitcode + exit $exitcode diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 1af16d2c2a0a..52497b7b9f1d 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -341,7 +341,7 @@ void split_file_backed_thp(void) } /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc)); + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); close(fd); if (num_written < 1) { diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index f5ab5e0312e7..60aa1a4fc69b 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -85,10 +85,12 @@ static bool test_uffdio_wp = false; static bool test_uffdio_minor = false; static bool map_shared; +static int shm_fd; static int huge_fd; static char *huge_fd_off0; static unsigned long long *count_verify; -static int uffd, uffd_flags, finished, *pipefd; +static int uffd = -1; +static int uffd_flags, finished, *pipefd; static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; static char *zeropage; pthread_attr_t attr; @@ -140,11 +142,18 @@ static void usage(void) exit(1); } -#define uffd_error(code, fmt, ...) \ - do { \ - fprintf(stderr, fmt, ##__VA_ARGS__); \ - fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code)); \ - exit(1); \ +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, line=%d)\n", \ + ret, __LINE__); \ + } while (0) + +#define err(fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(1); \ } while (0) static void uffd_stats_reset(struct uffd_stats *uffd_stats, @@ -171,56 +180,52 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) minor_total += stats[i].minor_faults; } - printf("userfaults: %llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b), %llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b), %llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)\n"); + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); } -static int anon_release_pages(char *rel_area) +static void anon_release_pages(char *rel_area) { - int ret = 0; - - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) { - perror("madvise"); - ret = 1; - } - - return ret; + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); } static void anon_allocate_area(void **alloc_area) { - if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { - fprintf(stderr, "out of memory\n"); - *alloc_area = NULL; - } + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (*alloc_area == MAP_FAILED) + err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) { } -/* HugeTLB memory */ -static int hugetlb_release_pages(char *rel_area) +static void hugetlb_release_pages(char *rel_area) { - int ret = 0; - if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - rel_area == huge_fd_off0 ? 0 : - nr_pages * page_size, - nr_pages * page_size)) { - perror("fallocate"); - ret = 1; - } - - return ret; + rel_area == huge_fd_off0 ? 0 : nr_pages * page_size, + nr_pages * page_size)) + err("fallocate() failed"); } static void hugetlb_allocate_area(void **alloc_area) @@ -233,20 +238,16 @@ static void hugetlb_allocate_area(void **alloc_area) MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); - if (*alloc_area == MAP_FAILED) { - perror("mmap of hugetlbfs file failed"); - goto fail; - } + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); if (map_shared) { area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); - if (area_alias == MAP_FAILED) { - perror("mmap of hugetlb file alias failed"); - goto fail_munmap; - } + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); } if (*alloc_area == area_src) { @@ -257,16 +258,6 @@ static void hugetlb_allocate_area(void **alloc_area) } if (area_alias) *alloc_area_alias = area_alias; - - return; - -fail_munmap: - if (munmap(*alloc_area, nr_pages * page_size) < 0) { - perror("hugetlb munmap"); - exit(1); - } -fail: - *alloc_area = NULL; } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -282,33 +273,43 @@ static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset *start = (unsigned long) area_dst_alias + offset; } -/* Shared memory */ -static int shmem_release_pages(char *rel_area) +static void shmem_release_pages(char *rel_area) { - int ret = 0; - - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) { - perror("madvise"); - ret = 1; - } - - return ret; + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); } static void shmem_allocate_area(void **alloc_area) { + void *area_alias = NULL; + bool is_src = alloc_area == (void **)&area_src; + unsigned long offset = is_src ? 0 : nr_pages * page_size; + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_SHARED, -1, 0); - if (*alloc_area == MAP_FAILED) { - fprintf(stderr, "shared memory mmap failed\n"); - *alloc_area = NULL; - } + MAP_SHARED, shm_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + + area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; } struct uffd_test_ops { unsigned long expected_ioctls; void (*allocate_area)(void **alloc_area); - int (*release_pages)(char *rel_area); + void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); }; @@ -332,7 +333,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = { .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, - .alias_mapping = noop_alias_mapping, + .alias_mapping = shmem_alias_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { @@ -344,6 +345,128 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = { static struct uffd_test_ops *uffd_test_ops; +static void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); + if (uffd < 0) + err("userfaultfd syscall not available in this kernel"); + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + huge_fd_off0 = NULL; + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); +} + +static void uffd_test_ctx_init_ext(uint64_t *features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src); + uffd_test_ops->allocate_area((void **)&area_dst); + + userfaultfd_open(features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. + */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +static inline void uffd_test_ctx_init(uint64_t features) +{ + uffd_test_ctx_init_ext(&features); +} + static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -363,27 +486,33 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) /* Undo write-protect, do wakeup after that */ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) { - fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n", - (uint64_t)start); - exit(1); - } + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); } static void continue_range(int ufd, __u64 start, __u64 len) { struct uffdio_continue req; + int ret; req.range.start = start; req.range.len = len; req.mode = 0; - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) { - fprintf(stderr, - "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n", - (uint64_t)start); - exit(1); - } + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); } static void *locking_thread(void *arg) @@ -395,7 +524,6 @@ static void *locking_thread(void *arg) unsigned long long count; char randstate[64]; unsigned int seed; - time_t start; if (bounces & BOUNCE_RANDOM) { seed = (unsigned int) time(NULL) - bounces; @@ -403,10 +531,8 @@ static void *locking_thread(void *arg) seed += cpu; bzero(&rand, sizeof(rand)); bzero(&randstate, sizeof(randstate)); - if (initstate_r(seed, randstate, sizeof(randstate), &rand)) { - fprintf(stderr, "srandom_r error\n"); - exit(1); - } + if (initstate_r(seed, randstate, sizeof(randstate), &rand)) + err("initstate_r failed"); } else { page_nr = -bounces; if (!(bounces & BOUNCE_RACINGFAULTS)) @@ -415,92 +541,26 @@ static void *locking_thread(void *arg) while (!finished) { if (bounces & BOUNCE_RANDOM) { - if (random_r(&rand, &rand_nr)) { - fprintf(stderr, "random_r 1 error\n"); - exit(1); - } + if (random_r(&rand, &rand_nr)) + err("random_r failed"); page_nr = rand_nr; if (sizeof(page_nr) > sizeof(rand_nr)) { - if (random_r(&rand, &rand_nr)) { - fprintf(stderr, "random_r 2 error\n"); - exit(1); - } + if (random_r(&rand, &rand_nr)) + err("random_r failed"); page_nr |= (((unsigned long) rand_nr) << 16) << 16; } } else page_nr += 1; page_nr %= nr_pages; - - start = time(NULL); - if (bounces & BOUNCE_VERIFY) { - count = *area_count(area_dst, page_nr); - if (!count) { - fprintf(stderr, - "page_nr %lu wrong count %Lu %Lu\n", - page_nr, count, - count_verify[page_nr]); - exit(1); - } - - - /* - * We can't use bcmp (or memcmp) because that - * returns 0 erroneously if the memory is - * changing under it (even if the end of the - * page is never changing and always - * different). - */ -#if 1 - if (!my_bcmp(area_dst + page_nr * page_size, zeropage, - page_size)) { - fprintf(stderr, - "my_bcmp page_nr %lu wrong count %Lu %Lu\n", - page_nr, count, count_verify[page_nr]); - exit(1); - } -#else - unsigned long loops; - - loops = 0; - /* uncomment the below line to test with mutex */ - /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */ - while (!bcmp(area_dst + page_nr * page_size, zeropage, - page_size)) { - loops += 1; - if (loops > 10) - break; - } - /* uncomment below line to test with mutex */ - /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */ - if (loops) { - fprintf(stderr, - "page_nr %lu all zero thread %lu %p %lu\n", - page_nr, cpu, area_dst + page_nr * page_size, - loops); - if (loops > 10) - exit(1); - } -#endif - } - pthread_mutex_lock(area_mutex(area_dst, page_nr)); count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) { - fprintf(stderr, - "page_nr %lu memory corruption %Lu %Lu\n", - page_nr, count, - count_verify[page_nr]); exit(1); - } + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); count++; *area_count(area_dst, page_nr) = count_verify[page_nr] = count; pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - - if (time(NULL) - start > 1) - fprintf(stderr, - "userfault too slow %ld " - "possible false positive with overcommit\n", - time(NULL) - start); } return NULL; @@ -514,22 +574,33 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, offset); if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) { - uffd_error(uffdio_copy->copy, - "UFFDIO_COPY retry error"); - } - } else - uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected"); + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); } static int __copy_page(int ufd, unsigned long offset, bool retry) { struct uffdio_copy uffdio_copy; - if (offset >= nr_pages * page_size) { - fprintf(stderr, "unexpected offset %lu\n", offset); - exit(1); - } + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); uffdio_copy.dst = (unsigned long) area_dst + offset; uffdio_copy.src = (unsigned long) area_src + offset; uffdio_copy.len = page_size; @@ -541,9 +612,11 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy.copy != -EEXIST) - uffd_error(uffdio_copy.copy, "UFFDIO_COPY error"); + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); } else if (uffdio_copy.copy != page_size) { - uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy"); + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { if (test_uffdio_copy_eexist && retry) { test_uffdio_copy_eexist = false; @@ -572,11 +645,10 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) if (ret < 0) { if (errno == EAGAIN) return 1; - perror("blocking read error"); + err("blocking read error"); } else { - fprintf(stderr, "short read\n"); + err("short read"); } - exit(1); } return 0; @@ -587,10 +659,8 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, { unsigned long offset; - if (msg->event != UFFD_EVENT_PAGEFAULT) { - fprintf(stderr, "unexpected msg event %u\n", msg->event); - exit(1); - } + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { /* Write protect page faults */ @@ -621,11 +691,8 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, stats->minor_faults++; } else { /* Missing page faults */ - if (bounces & BOUNCE_VERIFY && - msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) { - fprintf(stderr, "unexpected write fault\n"); - exit(1); - } + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; offset &= ~(page_size-1); @@ -652,32 +719,20 @@ static void *uffd_poll_thread(void *arg) for (;;) { ret = poll(pollfd, 2, -1); - if (!ret) { - fprintf(stderr, "poll error %d\n", ret); - exit(1); - } - if (ret < 0) { - perror("poll"); - exit(1); - } + if (ret <= 0) + err("poll error: %d", ret); if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) { - fprintf(stderr, "read pipefd error\n"); - exit(1); - } + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); break; } - if (!(pollfd[0].revents & POLLIN)) { - fprintf(stderr, "pollfd[0].revents %d\n", - pollfd[0].revents); - exit(1); - } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); if (uffd_read_msg(uffd, &msg)) continue; switch (msg.event) { default: - fprintf(stderr, "unexpected msg event %u\n", - msg.event); exit(1); + err("unexpected msg event %u\n", msg.event); break; case UFFD_EVENT_PAGEFAULT: uffd_handle_page_fault(&msg, stats); @@ -691,10 +746,8 @@ static void *uffd_poll_thread(void *arg) uffd_reg.range.start = msg.arg.remove.start; uffd_reg.range.len = msg.arg.remove.end - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) { - fprintf(stderr, "remove failure\n"); - exit(1); - } + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); break; case UFFD_EVENT_REMAP: area_dst = (char *)(unsigned long)msg.arg.remap.to; @@ -797,9 +850,7 @@ static int stress(struct uffd_stats *uffd_stats) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (uffd_test_ops->release_pages(area_src)) - return 1; - + uffd_test_ops->release_pages(area_src); finished = 1; for (cpu = 0; cpu < nr_cpus; cpu++) @@ -809,10 +860,8 @@ static int stress(struct uffd_stats *uffd_stats) for (cpu = 0; cpu < nr_cpus; cpu++) { char c; if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) { - fprintf(stderr, "pipefd write error\n"); - return 1; - } + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); if (pthread_join(uffd_threads[cpu], (void *)&uffd_stats[cpu])) return 1; @@ -827,40 +876,6 @@ static int stress(struct uffd_stats *uffd_stats) return 0; } -static int userfaultfd_open_ext(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd < 0) { - fprintf(stderr, - "userfaultfd syscall not available in this kernel\n"); - return 1; - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { - fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability.\n"); - return 1; - } - if (uffdio_api.api != UFFD_API) { - fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n", - (uint64_t)uffdio_api.api); - return 1; - } - - *features = uffdio_api.features; - return 0; -} - -static int userfaultfd_open(uint64_t features) -{ - return userfaultfd_open_ext(&features); -} - sigjmp_buf jbuf, *sigbuf; static void sighndl(int sig, siginfo_t *siginfo, void *ptr) @@ -912,10 +927,8 @@ static int faulting_process(int signal_test) memset(&act, 0, sizeof(act)); act.sa_sigaction = sighndl; act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) { - perror("sigaction"); - return 1; - } + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); lastnr = (unsigned long)-1; } @@ -925,10 +938,8 @@ static int faulting_process(int signal_test) if (signal_test) { if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) { - fprintf(stderr, "Signal repeated\n"); - return 1; - } + if (steps == 1 && nr == lastnr) + err("Signal repeated"); lastnr = nr; if (signal_test == 1) { @@ -953,12 +964,9 @@ static int faulting_process(int signal_test) } count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - fprintf(stderr, - "nr %lu memory corruption %Lu %Lu\n", - nr, count, - count_verify[nr]); - } + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); /* * Trigger write protection if there is by writing * the same value back. @@ -974,18 +982,16 @@ static int faulting_process(int signal_test) area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) { - perror("mremap"); - exit(1); - } + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; for (; nr < nr_pages; nr++) { count = *area_count(area_dst, nr); if (count != count_verify[nr]) { - fprintf(stderr, - "nr %lu memory corruption %Lu %Lu\n", - nr, count, - count_verify[nr]); exit(1); + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); } /* * Trigger write protection if there is by writing @@ -994,15 +1000,11 @@ static int faulting_process(int signal_test) *area_count(area_dst, nr) = count; } - if (uffd_test_ops->release_pages(area_dst)) - return 1; + uffd_test_ops->release_pages(area_dst); - for (nr = 0; nr < nr_pages; nr++) { - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) { - fprintf(stderr, "nr %lu is not zero\n", nr); - exit(1); - } - } + for (nr = 0; nr < nr_pages; nr++) + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + err("nr %lu is not zero", nr); return 0; } @@ -1015,13 +1017,12 @@ static void retry_uffdio_zeropage(int ufd, uffdio_zeropage->range.len, offset); if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) { - uffd_error(uffdio_zeropage->zeropage, - "UFFDIO_ZEROPAGE retry error"); - } + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); } else { - uffd_error(uffdio_zeropage->zeropage, - "UFFDIO_ZEROPAGE retry unexpected"); + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); } } @@ -1034,10 +1035,8 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); - if (offset >= nr_pages * page_size) { - fprintf(stderr, "unexpected offset %lu\n", offset); - exit(1); - } + if (offset >= nr_pages * page_size) + err("unexpected offset %lu", offset); uffdio_zeropage.range.start = (unsigned long) area_dst + offset; uffdio_zeropage.range.len = page_size; uffdio_zeropage.mode = 0; @@ -1045,14 +1044,13 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) res = uffdio_zeropage.zeropage; if (ret) { /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) { - uffd_error(res, "UFFDIO_ZEROPAGE %s", - res == -EEXIST ? "-EEXIST" : "error"); - } else if (res != -EINVAL) - uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL"); + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); } else if (has_zeropage) { if (res != page_size) { - uffd_error(res, "UFFDIO_ZEROPAGE unexpected"); + err("UFFDIO_ZEROPAGE unexpected size"); } else { if (test_uffdio_zeropage_eexist && retry) { test_uffdio_zeropage_eexist = false; @@ -1062,7 +1060,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) return 1; } } else - uffd_error(res, "UFFDIO_ZEROPAGE succeeded"); + err("UFFDIO_ZEROPAGE succeeded"); return 0; } @@ -1081,37 +1079,24 @@ static int userfaultfd_zeropage_test(void) printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); - if (uffd_test_ops->release_pages(area_dst)) - return 1; + uffd_test_ctx_init(0); - if (userfaultfd_open(0)) - return 1; uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); - exit(1); - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) { - fprintf(stderr, - "unexpected missing ioctl for anon memory\n"); - exit(1); - } + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) + err("unexpected missing ioctl for anon memory"); - if (uffdio_zeropage(uffd, 0)) { - if (my_bcmp(area_dst, zeropage, page_size)) { - fprintf(stderr, "zeropage is not zero\n"); - exit(1); - } - } + if (uffdio_zeropage(uffd, 0)) + if (my_bcmp(area_dst, zeropage, page_size)) + err("zeropage is not zero"); - close(uffd); printf("done.\n"); return 0; } @@ -1129,13 +1114,10 @@ static int userfaultfd_events_test(void) printf("testing events (fork, remap, remove): "); fflush(stdout); - if (uffd_test_ops->release_pages(area_dst)) - return 1; - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE; - if (userfaultfd_open(features)) - return 1; + uffd_test_ctx_init(features); + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); uffdio_register.range.start = (unsigned long) area_dst; @@ -1143,46 +1125,31 @@ static int userfaultfd_events_test(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); - exit(1); - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { - fprintf(stderr, "unexpected missing ioctl for anon memory\n"); - exit(1); - } + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) + err("unexpected missing ioctl for anon memory"); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { - perror("uffd_poll_thread create"); - exit(1); - } + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); pid = fork(); - if (pid < 0) { - perror("fork"); - exit(1); - } + if (pid < 0) + err("fork"); if (!pid) exit(faulting_process(0)); waitpid(pid, &err, 0); - if (err) { - fprintf(stderr, "faulting process failed\n"); - exit(1); - } - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { - perror("pipe write"); - exit(1); - } + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); if (pthread_join(uffd_mon, NULL)) return 1; - close(uffd); - uffd_stats_report(&stats, 1); return stats.missing_faults != nr_pages; @@ -1202,12 +1169,9 @@ static int userfaultfd_sig_test(void) printf("testing signal delivery: "); fflush(stdout); - if (uffd_test_ops->release_pages(area_dst)) - return 1; - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - if (userfaultfd_open(features)) - return 1; + uffd_test_ctx_init(features); + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); uffdio_register.range.start = (unsigned long) area_dst; @@ -1215,57 +1179,40 @@ static int userfaultfd_sig_test(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); - exit(1); - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { - fprintf(stderr, "unexpected missing ioctl for anon memory\n"); - exit(1); - } + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) + err("unexpected missing ioctl for anon memory"); - if (faulting_process(1)) { - fprintf(stderr, "faulting process failed\n"); - exit(1); - } + if (faulting_process(1)) + err("faulting process failed"); - if (uffd_test_ops->release_pages(area_dst)) - return 1; + uffd_test_ops->release_pages(area_dst); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { - perror("uffd_poll_thread create"); - exit(1); - } + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); pid = fork(); - if (pid < 0) { - perror("fork"); - exit(1); - } + if (pid < 0) + err("fork"); if (!pid) exit(faulting_process(2)); waitpid(pid, &err, 0); - if (err) { - fprintf(stderr, "faulting process failed\n"); - exit(1); - } - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { - perror("pipe write"); - exit(1); - } + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); if (pthread_join(uffd_mon, (void **)&userfaults)) return 1; printf("done.\n"); if (userfaults) - fprintf(stderr, "Signal test failed, userfaults: %ld\n", - userfaults); - close(uffd); + err("Signal test failed, userfaults: %ld", userfaults); + return userfaults != 0; } @@ -1279,7 +1226,7 @@ static int userfaultfd_minor_test(void) void *expected_page; char c; struct uffd_stats stats = { 0 }; - uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS; + uint64_t req_features, features_out; if (!test_uffdio_minor) return 0; @@ -1287,13 +1234,17 @@ static int userfaultfd_minor_test(void) printf("testing minor faults: "); fflush(stdout); - if (uffd_test_ops->release_pages(area_dst)) + if (test_type == TEST_HUGETLB) + req_features = UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + req_features = UFFD_FEATURE_MINOR_SHMEM; + else return 1; - if (userfaultfd_open_ext(&features)) - return 1; - /* If kernel reports the feature isn't supported, skip the test. */ - if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { + features_out = req_features; + uffd_test_ctx_init_ext(&features_out); + /* If kernel reports required features aren't supported, skip test. */ + if ((features_out & req_features) != req_features) { printf("skipping test due to lack of feature support\n"); fflush(stdout); return 0; @@ -1302,17 +1253,13 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); - exit(1); - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); expected_ioctls = uffd_test_ops->expected_ioctls; expected_ioctls |= 1 << _UFFDIO_CONTINUE; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { - fprintf(stderr, "unexpected missing ioctl(s)\n"); - exit(1); - } + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) + err("unexpected missing ioctl(s)"); /* * After registering with UFFD, populate the non-UFFD-registered side of @@ -1323,10 +1270,8 @@ static int userfaultfd_minor_test(void) page_size); } - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { - perror("uffd_poll_thread create"); - exit(1); - } + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); /* * Read each of the pages back using the UFFD-registered mapping. We @@ -1335,92 +1280,173 @@ static int userfaultfd_minor_test(void) * page's contents, and then issuing a CONTINUE ioctl. */ - if (posix_memalign(&expected_page, page_size, page_size)) { - fprintf(stderr, "out of memory\n"); - return 1; - } + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); for (p = 0; p < nr_pages; ++p) { expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); memset(expected_page, expected_byte, page_size); if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) { - fprintf(stderr, - "unexpected page contents after minor fault\n"); - exit(1); - } + page_size)) + err("unexpected page contents after minor fault"); } - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { - perror("pipe write"); - exit(1); - } + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); if (pthread_join(uffd_mon, NULL)) return 1; - close(uffd); - uffd_stats_report(&stats, 1); return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } -static int userfaultfd_stress(void) +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +static int pagemap_open(void) { - void *area; - char *tmp_area; - unsigned long nr; - struct uffdio_register uffdio_register; - unsigned long cpu; - int err; - struct uffd_stats uffd_stats[nr_cpus]; + int fd = open("/proc/self/pagemap", O_RDONLY); - uffd_test_ops->allocate_area((void **)&area_src); - if (!area_src) - return 1; - uffd_test_ops->allocate_area((void **)&area_dst); - if (!area_dst) - return 1; + if (fd < 0) + err("open pagemap"); - if (userfaultfd_open(0)) - return 1; + return fd; +} - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) { - perror("count_verify"); - return 1; - } +static uint64_t pagemap_read_vaddr(int fd, void *vaddr) +{ + uint64_t value; + int ret; - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = (pthread_mutex_t) - PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; + ret = pread(fd, &value, sizeof(uint64_t), + ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); + if (ret != sizeof(uint64_t)) + err("pread() on pagemap failed"); + + return value; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_read_vaddr(fd, area_dst); /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK */ - *(area_count(area_src, nr) + 1) = 1; + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); } + waitpid(child, &result, 0); + return result; +} - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) { - perror("pipefd"); - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { - perror("pipe"); - return 1; - } - } +static void userfaultfd_pagemap_test(unsigned int test_pgsize) +{ + struct uffdio_register uffdio_register; + int pagemap_fd; + uint64_t value; - if (posix_memalign(&area, page_size, page_size)) { - fprintf(stderr, "out of memory\n"); - return 1; + /* Pagemap tests uffd-wp only */ + if (!test_uffdio_wp) + return; + + /* Not enough memory to test this page size */ + if (test_pgsize > nr_pages * page_size) + return; + + printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + + uffd_test_ctx_init(0); + + if (test_pgsize > page_size) { + /* This is a thp test */ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failed"); + } else if (test_pgsize == page_size) { + /* This is normal page test; force no thp */ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failed"); } + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(true)) + err("Detected stall uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even swapped out */ + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(false)) + err("Detected stall uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + printf("done\n"); +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + + uffd_test_ctx_init(0); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); zeropage = area; bzero(zeropage, page_size); @@ -1429,7 +1455,6 @@ static int userfaultfd_stress(void) pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, 16*1024*1024); - err = 0; while (bounces--) { unsigned long expected_ioctls; @@ -1458,25 +1483,18 @@ static int userfaultfd_stress(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); - return 1; - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) { - fprintf(stderr, - "unexpected missing ioctl for anon memory\n"); - return 1; - } + expected_ioctls) + err("unexpected missing ioctl for anon memory"); if (area_dst_alias) { uffdio_register.range.start = (unsigned long) area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure alias\n"); - return 1; - } + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure alias"); } /* @@ -1503,8 +1521,7 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. */ - if (uffd_test_ops->release_pages(area_dst)) - return 1; + uffd_test_ops->release_pages(area_dst); uffd_stats_reset(uffd_stats, nr_cpus); @@ -1518,33 +1535,22 @@ static int userfaultfd_stress(void) nr_pages * page_size, false); /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { - fprintf(stderr, "unregister failure\n"); - return 1; - } + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + err("unregister failure"); if (area_dst_alias) { uffdio_register.range.start = (unsigned long) area_dst; if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) { - fprintf(stderr, "unregister failure alias\n"); - return 1; - } + &uffdio_register.range)) + err("unregister failure alias"); } /* verification */ - if (bounces & BOUNCE_VERIFY) { - for (nr = 0; nr < nr_pages; nr++) { - if (*area_count(area_dst, nr) != count_verify[nr]) { - fprintf(stderr, - "error area_count %Lu %Lu %lu\n", - *area_count(area_src, nr), - count_verify[nr], - nr); - err = 1; - bounces = 0; - } - } - } + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); /* prepare next bounce */ tmp_area = area_src; @@ -1558,10 +1564,21 @@ static int userfaultfd_stress(void) uffd_stats_report(uffd_stats, nr_cpus); } - if (err) - return err; + if (test_type == TEST_ANON) { + /* + * shmem/hugetlb won't be able to run since they have different + * behavior on fork() (file-backed memory normally drops ptes + * directly when fork), meanwhile the pagemap test will verify + * pgtable entry of fork()ed child. + */ + userfaultfd_pagemap_test(page_size); + /* + * Hard-code for x86_64 for now for 2M THP, as x86_64 is + * currently the only one that supports uffd-wp + */ + userfaultfd_pagemap_test(page_size * 512); + } - close(uffd); return userfaultfd_zeropage_test() || userfaultfd_sig_test() || userfaultfd_events_test() || userfaultfd_minor_test(); } @@ -1610,8 +1627,9 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; } else { - fprintf(stderr, "Unknown test type: %s\n", type); exit(1); + err("Unknown test type: %s", type); } if (test_type == TEST_HUGETLB) @@ -1619,15 +1637,11 @@ static void set_test_type(const char *type) else page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) { - fprintf(stderr, "Unable to determine page size\n"); - exit(2); - } + if (!page_size) + err("Unable to determine page size"); if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) { - fprintf(stderr, "Impossible to run this test\n"); - exit(2); - } + > page_size) + err("Impossible to run this test"); } static void sigalrm(int sig) @@ -1644,10 +1658,8 @@ int main(int argc, char **argv) if (argc < 4) usage(); - if (signal(SIGALRM, sigalrm) == SIG_ERR) { - fprintf(stderr, "failed to arm SIGALRM"); - exit(1); - } + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); set_test_type(argv[1]); @@ -1656,13 +1668,13 @@ int main(int argc, char **argv) nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / nr_cpus; if (!nr_pages_per_cpu) { - fprintf(stderr, "invalid MiB\n"); + _err("invalid MiB"); usage(); } bounces = atoi(argv[3]); if (bounces <= 0) { - fprintf(stderr, "invalid bounces\n"); + _err("invalid bounces"); usage(); } nr_pages = nr_pages_per_cpu * nr_cpus; @@ -1671,16 +1683,20 @@ int main(int argc, char **argv) if (argc < 5) usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); - if (huge_fd < 0) { - fprintf(stderr, "Open of %s failed", argv[3]); - perror("open"); - exit(1); - } - if (ftruncate(huge_fd, 0)) { - fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); - perror("ftruncate"); - exit(1); - } + if (huge_fd < 0) + err("Open of %s failed", argv[4]); + if (ftruncate(huge_fd, 0)) + err("ftruncate %s to size 0 failed", argv[4]); + } else if (test_type == TEST_SHMEM) { + shm_fd = memfd_create(argv[0], 0); + if (shm_fd < 0) + err("memfd_create"); + if (ftruncate(shm_fd, nr_pages * page_size * 2)) + err("ftruncate"); + if (fallocate(shm_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) + err("fallocate"); } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu); diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 333980375bc7..b4142cd1c5c2 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -13,11 +13,12 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ test_vsyscall mov_ss_trap \ - syscall_arg_fault fsgsbase_restore + syscall_arg_fault fsgsbase_restore sigaltstack TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer -TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering +TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ + corrupt_xstate_header # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall diff --git a/tools/testing/selftests/x86/corrupt_xstate_header.c b/tools/testing/selftests/x86/corrupt_xstate_header.c new file mode 100644 index 000000000000..ab8599c10ce5 --- /dev/null +++ b/tools/testing/selftests/x86/corrupt_xstate_header.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Corrupt the XSTATE header in a signal frame + * + * Based on analysis and a test case from Thomas Gleixner. + */ + +#define _GNU_SOURCE + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sched.h> +#include <signal.h> +#include <err.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/wait.h> + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline int xsave_enabled(void) +{ + unsigned int eax, ebx, ecx, edx; + + eax = 0x1; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + /* Is CR4.OSXSAVE enabled ? */ + return ecx & (1U << 27); +} + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigusr1(int sig, siginfo_t *info, void *uc_void) +{ + ucontext_t *uc = uc_void; + uint8_t *fpstate = (uint8_t *)uc->uc_mcontext.fpregs; + uint64_t *xfeatures = (uint64_t *)(fpstate + 512); + + printf("\tWreck XSTATE header\n"); + /* Wreck the first reserved bytes in the header */ + *(xfeatures + 2) = 0xfffffff; +} + +static void sigsegv(int sig, siginfo_t *info, void *uc_void) +{ + printf("\tGot SIGSEGV\n"); +} + +int main(void) +{ + cpu_set_t set; + + sethandler(SIGUSR1, sigusr1, 0); + sethandler(SIGSEGV, sigsegv, 0); + + if (!xsave_enabled()) { + printf("[SKIP] CR4.OSXSAVE disabled.\n"); + return 0; + } + + CPU_ZERO(&set); + CPU_SET(0, &set); + + /* + * Enforce that the child runs on the same CPU + * which in turn forces a schedule. + */ + sched_setaffinity(getpid(), sizeof(set), &set); + + printf("[RUN]\tSend ourselves a signal\n"); + raise(SIGUSR1); + + printf("[OK]\tBack from the signal. Now schedule.\n"); + pid_t child = fork(); + if (child < 0) + err(1, "fork"); + if (child == 0) + return 0; + if (child) + waitpid(child, NULL, 0); + printf("[OK]\tBack in the main thread.\n"); + + /* + * We could try to confirm that extended state is still preserved + * when we schedule. For now, the only indication of failure is + * a warning in the kernel logs. + */ + + return 0; +} diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c index 6da0ac3f0135..cc3de6ff9fba 100644 --- a/tools/testing/selftests/x86/mov_ss_trap.c +++ b/tools/testing/selftests/x86/mov_ss_trap.c @@ -47,7 +47,6 @@ unsigned short ss; extern unsigned char breakpoint_insn[]; sigjmp_buf jmpbuf; -static unsigned char altstack_data[SIGSTKSZ]; static void enable_watchpoint(void) { @@ -250,13 +249,14 @@ int main() if (sigsetjmp(jmpbuf, 1) == 0) { printf("[RUN]\tMOV SS; SYSENTER\n"); stack_t stack = { - .ss_sp = altstack_data, + .ss_sp = malloc(sizeof(char) * SIGSTKSZ), .ss_size = SIGSTKSZ, }; if (sigaltstack(&stack, NULL) != 0) err(1, "sigaltstack"); sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK); nr = SYS_getpid; + free(stack.ss_sp); /* Clear EBP first to make sure we segfault cleanly. */ asm volatile ("xorl %%ebp, %%ebp; mov %[ss], %%ss; SYSENTER" : "+a" (nr) : [ss] "m" (ss) : "flags", "rcx" diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c new file mode 100644 index 000000000000..f689af75e979 --- /dev/null +++ b/tools/testing/selftests/x86/sigaltstack.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define _GNU_SOURCE +#include <signal.h> +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <err.h> +#include <errno.h> +#include <limits.h> +#include <sys/mman.h> +#include <sys/auxv.h> +#include <sys/prctl.h> +#include <sys/resource.h> +#include <setjmp.h> + +/* sigaltstack()-enforced minimum stack */ +#define ENFORCED_MINSIGSTKSZ 2048 + +#ifndef AT_MINSIGSTKSZ +# define AT_MINSIGSTKSZ 51 +#endif + +static int nerrs; + +static bool sigalrm_expected; + +static unsigned long at_minstack_size; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static int setup_altstack(void *start, unsigned long size) +{ + stack_t ss; + + memset(&ss, 0, sizeof(ss)); + ss.ss_size = size; + ss.ss_sp = start; + + return sigaltstack(&ss, NULL); +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + if (sigalrm_expected) { + printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected SIGALRM)."); + nerrs++; + } else { + printf("[OK]\tSIGSEGV signal delivered.\n"); + } + + siglongjmp(jmpbuf, 1); +} + +static void sigalrm(int sig, siginfo_t *info, void *ctx_void) +{ + if (!sigalrm_expected) { + printf("[FAIL]\tWrong signal delivered: SIGALRM (expected SIGSEGV)."); + nerrs++; + } else { + printf("[OK]\tSIGALRM signal delivered.\n"); + } +} + +static void test_sigaltstack(void *altstack, unsigned long size) +{ + if (setup_altstack(altstack, size)) + err(1, "sigaltstack()"); + + sigalrm_expected = (size > at_minstack_size) ? true : false; + + sethandler(SIGSEGV, sigsegv, 0); + sethandler(SIGALRM, sigalrm, SA_ONSTACK); + + if (!sigsetjmp(jmpbuf, 1)) { + printf("[RUN]\tTest an alternate signal stack of %ssufficient size.\n", + sigalrm_expected ? "" : "in"); + printf("\tRaise SIGALRM. %s is expected to be delivered.\n", + sigalrm_expected ? "It" : "SIGSEGV"); + raise(SIGALRM); + } + + clearhandler(SIGALRM); + clearhandler(SIGSEGV); +} + +int main(void) +{ + void *altstack; + + at_minstack_size = getauxval(AT_MINSIGSTKSZ); + + altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (altstack == MAP_FAILED) + err(1, "mmap()"); + + if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size) + test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1); + + test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ); + + return nerrs == 0 ? 0 : 1; +} diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 57c4f67f16ef..5d7961a5f7f6 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -138,9 +138,6 @@ static unsigned short LDT3(int idx) return (idx << 3) | 7; } -/* Our sigaltstack scratch space. */ -static char altstack_data[SIGSTKSZ]; - static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -771,7 +768,8 @@ int main() setup_ldt(); stack_t stack = { - .ss_sp = altstack_data, + /* Our sigaltstack scratch space. */ + .ss_sp = malloc(sizeof(char) * SIGSTKSZ), .ss_size = SIGSTKSZ, }; if (sigaltstack(&stack, NULL) != 0) @@ -872,5 +870,6 @@ int main() total_nerrs += test_nonstrict_ss(); #endif + free(stack.ss_sp); return total_nerrs ? 1 : 0; } diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c index 120ac741fe44..9a30f443e928 100644 --- a/tools/testing/selftests/x86/single_step_syscall.c +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -57,7 +57,6 @@ static void clearhandler(int sig) static volatile sig_atomic_t sig_traps, sig_eflags; sigjmp_buf jmpbuf; -static unsigned char altstack_data[SIGSTKSZ]; #ifdef __x86_64__ # define REG_IP REG_RIP @@ -210,7 +209,7 @@ int main() unsigned long nr = SYS_getpid; printf("[RUN]\tSet TF and check SYSENTER\n"); stack_t stack = { - .ss_sp = altstack_data, + .ss_sp = malloc(sizeof(char) * SIGSTKSZ), .ss_size = SIGSTKSZ, }; if (sigaltstack(&stack, NULL) != 0) @@ -219,6 +218,7 @@ int main() SA_RESETHAND | SA_ONSTACK); sethandler(SIGILL, print_and_longjmp, SA_RESETHAND); set_eflags(get_eflags() | X86_EFLAGS_TF); + free(stack.ss_sp); /* Clear EBP first to make sure we segfault cleanly. */ asm volatile ("xorl %%ebp, %%ebp; SYSENTER" : "+a" (nr) :: "flags", "rcx" #ifdef __x86_64__ diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index bff474b5efc6..461fa41a4d02 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -17,9 +17,6 @@ #include "helpers.h" -/* Our sigaltstack scratch space. */ -static unsigned char altstack_data[SIGSTKSZ]; - static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -104,7 +101,8 @@ static void sigill(int sig, siginfo_t *info, void *ctx_void) int main() { stack_t stack = { - .ss_sp = altstack_data, + /* Our sigaltstack scratch space. */ + .ss_sp = malloc(sizeof(char) * SIGSTKSZ), .ss_size = SIGSTKSZ, }; if (sigaltstack(&stack, NULL) != 0) @@ -233,5 +231,6 @@ int main() set_eflags(get_eflags() & ~X86_EFLAGS_TF); #endif + free(stack.ss_sp); return 0; } diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index d6b09cb1aa2c..991591718bb0 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args + * syscall_numbering.c - test calling the x86-64 kernel with various + * valid and invalid system call numbers. + * * Copyright (c) 2018 Andrew Lutomirski */ @@ -11,79 +13,470 @@ #include <stdbool.h> #include <errno.h> #include <unistd.h> -#include <syscall.h> +#include <string.h> +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <sysexits.h> -static int nerrs; +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> +#include <sys/mman.h> -#define X32_BIT 0x40000000UL +#include <linux/ptrace.h> -static void check_enosys(unsigned long nr, bool *ok) +/* Common system call numbers */ +#define SYS_READ 0 +#define SYS_WRITE 1 +#define SYS_GETPID 39 +/* x64-only system call numbers */ +#define X64_IOCTL 16 +#define X64_READV 19 +#define X64_WRITEV 20 +/* x32-only system call numbers (without X32_BIT) */ +#define X32_IOCTL 514 +#define X32_READV 515 +#define X32_WRITEV 516 + +#define X32_BIT 0x40000000 + +static int nullfd = -1; /* File descriptor for /dev/null */ +static bool with_x32; /* x32 supported on this kernel? */ + +enum ptrace_pass { + PTP_NOTHING, + PTP_GETREGS, + PTP_WRITEBACK, + PTP_FUZZRET, + PTP_FUZZHIGH, + PTP_INTNUM, + PTP_DONE +}; + +static const char * const ptrace_pass_name[] = { - /* If this fails, a segfault is reasonably likely. */ - fflush(stdout); + [PTP_NOTHING] = "just stop, no data read", + [PTP_GETREGS] = "only getregs", + [PTP_WRITEBACK] = "getregs, unmodified setregs", + [PTP_FUZZRET] = "modifying the default return", + [PTP_FUZZHIGH] = "clobbering the top 32 bits", + [PTP_INTNUM] = "sign-extending the syscall number", +}; - long ret = syscall(nr, 0, 0, 0, 0, 0, 0); - if (ret == 0) { - printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr); - *ok = false; - } else if (errno != ENOSYS) { - printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, errno); - *ok = false; - } +/* + * Shared memory block between tracer and test + */ +struct shared { + unsigned int nerr; /* Total error count */ + unsigned int indent; /* Message indentation level */ + enum ptrace_pass ptrace_pass; + bool probing_syscall; /* In probe_syscall() */ +}; +static volatile struct shared *sh; + +static inline unsigned int offset(void) +{ + unsigned int level = sh ? sh->indent : 0; + + return 8 + level * 4; } -static void test_x32_without_x32_bit(void) +#define msg(lvl, fmt, ...) printf("%-*s" fmt, offset(), "[" #lvl "]", \ + ## __VA_ARGS__) + +#define run(fmt, ...) msg(RUN, fmt, ## __VA_ARGS__) +#define info(fmt, ...) msg(INFO, fmt, ## __VA_ARGS__) +#define ok(fmt, ...) msg(OK, fmt, ## __VA_ARGS__) + +#define fail(fmt, ...) \ + do { \ + msg(FAIL, fmt, ## __VA_ARGS__); \ + sh->nerr++; \ + } while (0) + +#define crit(fmt, ...) \ + do { \ + sh->indent = 0; \ + msg(FAIL, fmt, ## __VA_ARGS__); \ + msg(SKIP, "Unable to run test\n"); \ + exit(EX_OSERR); \ + } while (0) + +/* Sentinel for ptrace-modified return value */ +#define MODIFIED_BY_PTRACE -9999 + +/* + * Directly invokes the given syscall with nullfd as the first argument + * and the rest zero. Avoids involving glibc wrappers in case they ever + * end up intercepting some system calls for some reason, or modify + * the system call number itself. + */ +static long long probe_syscall(int msb, int lsb) { - bool ok = true; + register long long arg1 asm("rdi") = nullfd; + register long long arg2 asm("rsi") = 0; + register long long arg3 asm("rdx") = 0; + register long long arg4 asm("r10") = 0; + register long long arg5 asm("r8") = 0; + register long long arg6 asm("r9") = 0; + long long nr = ((long long)msb << 32) | (unsigned int)lsb; + long long ret; /* - * Syscalls 512-547 are "x32" syscalls. They are intended to be - * called with the x32 (0x40000000) bit set. Calling them without - * the x32 bit set is nonsense and should not work. + * We pass in an extra copy of the extended system call number + * in %rbx, so we can examine it from the ptrace handler without + * worrying about it being possibly modified. This is to test + * the validity of struct user regs.orig_rax a.k.a. + * struct pt_regs.orig_ax. */ - printf("[RUN]\tChecking syscalls 512-547\n"); - for (int i = 512; i <= 547; i++) - check_enosys(i, &ok); + sh->probing_syscall = true; + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "b" (nr), + "r" (arg1), "r" (arg2), "r" (arg3), + "r" (arg4), "r" (arg5), "r" (arg6) + : "rcx", "r11", "memory", "cc"); + sh->probing_syscall = false; + + return ret; +} + +static const char *syscall_str(int msb, int start, int end) +{ + static char buf[64]; + const char * const type = (start & X32_BIT) ? "x32" : "x64"; + int lsb = start; /* - * Check that a handful of 64-bit-only syscalls are rejected if the x32 - * bit is set. + * Improve readability by stripping the x32 bit, but round + * toward zero so we don't display -1 as -1073741825. */ - printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n"); - check_enosys(16 | X32_BIT, &ok); /* ioctl */ - check_enosys(19 | X32_BIT, &ok); /* readv */ - check_enosys(20 | X32_BIT, &ok); /* writev */ + if (lsb < 0) + lsb |= X32_BIT; + else + lsb &= ~X32_BIT; + + if (start == end) + snprintf(buf, sizeof buf, "%s syscall %d:%d", + type, msb, lsb); + else + snprintf(buf, sizeof buf, "%s syscalls %d:%d..%d", + type, msb, lsb, lsb + (end-start)); + + return buf; +} + +static unsigned int _check_for(int msb, int start, int end, long long expect, + const char *expect_str) +{ + unsigned int err = 0; + + sh->indent++; + if (start != end) + sh->indent++; + + for (int nr = start; nr <= end; nr++) { + long long ret = probe_syscall(msb, nr); + + if (ret != expect) { + fail("%s returned %lld, but it should have returned %s\n", + syscall_str(msb, nr, nr), + ret, expect_str); + err++; + } + } + + if (start != end) + sh->indent--; + + if (err) { + if (start != end) + fail("%s had %u failure%s\n", + syscall_str(msb, start, end), + err, err == 1 ? "s" : ""); + } else { + ok("%s returned %s as expected\n", + syscall_str(msb, start, end), expect_str); + } + + sh->indent--; + + return err; +} + +#define check_for(msb,start,end,expect) \ + _check_for(msb,start,end,expect,#expect) + +static bool check_zero(int msb, int nr) +{ + return check_for(msb, nr, nr, 0); +} + +static bool check_enosys(int msb, int nr) +{ + return check_for(msb, nr, nr, -ENOSYS); +} + +/* + * Anyone diagnosing a failure will want to know whether the kernel + * supports x32. Tell them. This can also be used to conditionalize + * tests based on existence or nonexistence of x32. + */ +static bool test_x32(void) +{ + long long ret; + pid_t mypid = getpid(); + + run("Checking for x32 by calling x32 getpid()\n"); + ret = probe_syscall(0, SYS_GETPID | X32_BIT); + + sh->indent++; + if (ret == mypid) { + info("x32 is supported\n"); + with_x32 = true; + } else if (ret == -ENOSYS) { + info("x32 is not supported\n"); + with_x32 = false; + } else { + fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, (long long)mypid); + with_x32 = false; + } + sh->indent--; + return with_x32; +} + +static void test_syscalls_common(int msb) +{ + enum ptrace_pass pass = sh->ptrace_pass; + + run("Checking some common syscalls as 64 bit\n"); + check_zero(msb, SYS_READ); + check_zero(msb, SYS_WRITE); + + run("Checking some 64-bit only syscalls as 64 bit\n"); + check_zero(msb, X64_READV); + check_zero(msb, X64_WRITEV); + + run("Checking out of range system calls\n"); + check_for(msb, -64, -2, -ENOSYS); + if (pass >= PTP_FUZZRET) + check_for(msb, -1, -1, MODIFIED_BY_PTRACE); + else + check_for(msb, -1, -1, -ENOSYS); + check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS); + check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS); + check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS); +} +static void test_syscalls_with_x32(int msb) +{ /* - * Check some syscalls with high bits set. + * Syscalls 512-547 are "x32" syscalls. They are + * intended to be called with the x32 (0x40000000) bit + * set. Calling them without the x32 bit set is + * nonsense and should not work. */ - printf("[RUN]\tChecking numbers above 2^32-1\n"); - check_enosys((1UL << 32), &ok); - check_enosys(X32_BIT | (1UL << 32), &ok); + run("Checking x32 syscalls as 64 bit\n"); + check_for(msb, 512, 547, -ENOSYS); - if (!ok) - nerrs++; - else - printf("[OK]\tThey all returned -ENOSYS\n"); + run("Checking some common syscalls as x32\n"); + check_zero(msb, SYS_READ | X32_BIT); + check_zero(msb, SYS_WRITE | X32_BIT); + + run("Checking some x32 syscalls as x32\n"); + check_zero(msb, X32_READV | X32_BIT); + check_zero(msb, X32_WRITEV | X32_BIT); + + run("Checking some 64-bit syscalls as x32\n"); + check_enosys(msb, X64_IOCTL | X32_BIT); + check_enosys(msb, X64_READV | X32_BIT); + check_enosys(msb, X64_WRITEV | X32_BIT); } -int main() +static void test_syscalls_without_x32(int msb) { + run("Checking for absence of x32 system calls\n"); + check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS); +} + +static void test_syscall_numbering(void) +{ + static const int msbs[] = { + 0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX, + INT_MIN, INT_MIN+1 + }; + + sh->indent++; + /* - * Anyone diagnosing a failure will want to know whether the kernel - * supports x32. Tell them. + * The MSB is supposed to be ignored, so we loop over a few + * to test that out. */ - printf("\tChecking for x32..."); - fflush(stdout); - if (syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0) >= 0) { - printf(" supported\n"); - } else if (errno == ENOSYS) { - printf(" not supported\n"); + for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { + int msb = msbs[i]; + run("Checking system calls with msb = %d (0x%x)\n", + msb, msb); + + sh->indent++; + + test_syscalls_common(msb); + if (with_x32) + test_syscalls_with_x32(msb); + else + test_syscalls_without_x32(msb); + + sh->indent--; + } + + sh->indent--; +} + +static void syscall_numbering_tracee(void) +{ + enum ptrace_pass pass; + + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + crit("Failed to request tracing\n"); + return; + } + raise(SIGSTOP); + + for (sh->ptrace_pass = pass = PTP_NOTHING; pass < PTP_DONE; + sh->ptrace_pass = ++pass) { + run("Running tests under ptrace: %s\n", ptrace_pass_name[pass]); + test_syscall_numbering(); + } +} + +static void mess_with_syscall(pid_t testpid, enum ptrace_pass pass) +{ + struct user_regs_struct regs; + + sh->probing_syscall = false; /* Do this on entry only */ + + /* For these, don't even getregs */ + if (pass == PTP_NOTHING || pass == PTP_DONE) + return; + + ptrace(PTRACE_GETREGS, testpid, NULL, ®s); + + if (regs.orig_rax != regs.rbx) { + fail("orig_rax %#llx doesn't match syscall number %#llx\n", + (unsigned long long)regs.orig_rax, + (unsigned long long)regs.rbx); + } + + switch (pass) { + case PTP_GETREGS: + /* Just read, no writeback */ + return; + case PTP_WRITEBACK: + /* Write back the same register state verbatim */ + break; + case PTP_FUZZRET: + regs.rax = MODIFIED_BY_PTRACE; + break; + case PTP_FUZZHIGH: + regs.rax = MODIFIED_BY_PTRACE; + regs.orig_rax = regs.orig_rax | 0xffffffff00000000ULL; + break; + case PTP_INTNUM: + regs.rax = MODIFIED_BY_PTRACE; + regs.orig_rax = (int)regs.orig_rax; + break; + default: + crit("invalid ptrace_pass\n"); + break; + } + + ptrace(PTRACE_SETREGS, testpid, NULL, ®s); +} + +static void syscall_numbering_tracer(pid_t testpid) +{ + int wstatus; + + do { + pid_t wpid = waitpid(testpid, &wstatus, 0); + if (wpid < 0 && errno != EINTR) + break; + if (wpid != testpid) + continue; + if (!WIFSTOPPED(wstatus)) + break; /* Thread exited? */ + + if (sh->probing_syscall && WSTOPSIG(wstatus) == SIGTRAP) + mess_with_syscall(testpid, sh->ptrace_pass); + } while (sh->ptrace_pass != PTP_DONE && + !ptrace(PTRACE_SYSCALL, testpid, NULL, NULL)); + + ptrace(PTRACE_DETACH, testpid, NULL, NULL); + + /* Wait for the child process to terminate */ + while (waitpid(testpid, &wstatus, 0) != testpid || !WIFEXITED(wstatus)) + /* wait some more */; +} + +static void test_traced_syscall_numbering(void) +{ + pid_t testpid; + + /* Launch the test thread; this thread continues as the tracer thread */ + testpid = fork(); + + if (testpid < 0) { + crit("Unable to launch tracer process\n"); + } else if (testpid == 0) { + syscall_numbering_tracee(); + _exit(0); } else { - printf(" confused\n"); + syscall_numbering_tracer(testpid); } +} - test_x32_without_x32_bit(); +int main(void) +{ + unsigned int nerr; - return nerrs ? 1 : 0; + /* + * It is quite likely to get a segfault on a failure, so make + * sure the message gets out by setting stdout to nonbuffered. + */ + setvbuf(stdout, NULL, _IONBF, 0); + + /* + * Harmless file descriptor to work on... + */ + nullfd = open("/dev/null", O_RDWR); + if (nullfd < 0) { + crit("Unable to open /dev/null: %s\n", strerror(errno)); + } + + /* + * Set up a block of shared memory... + */ + sh = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_SHARED, 0, 0); + if (sh == MAP_FAILED) { + crit("Unable to allocated shared memory block: %s\n", + strerror(errno)); + } + + with_x32 = test_x32(); + + run("Running tests without ptrace...\n"); + test_syscall_numbering(); + + test_traced_syscall_numbering(); + + nerr = sh->nerr; + if (!nerr) { + ok("All system calls succeeded or failed as expected\n"); + return 0; + } else { + fail("A total of %u system call%s had incorrect behavior\n", + nerr, nerr != 1 ? "s" : ""); + return 1; + } } |