diff options
author | 2024-11-18 10:30:29 -0800 | |
---|---|---|
committer | 2024-11-18 10:30:29 -0800 | |
commit | 4c797b11a88297b9b0010b2c6645b191bac2350c (patch) | |
tree | 3eca58d86d27164682ee732cfb49e1404b315584 /tools | |
parent | Merge tag 'vfs-6.13.netfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs (diff) | |
parent | selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor (diff) | |
download | wireguard-linux-4c797b11a88297b9b0010b2c6645b191bac2350c.tar.xz wireguard-linux-4c797b11a88297b9b0010b2c6645b191bac2350c.zip |
Merge tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner:
"This contains changes the changes for files for this cycle:
- Introduce a new reference counting mechanism for files.
As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop
it has O(N^2) behaviour under contention with N concurrent
operations and it is in a hot path in __fget_files_rcu().
The rcuref infrastructures remedies this problem by using an
unconditional increment relying on safe- and dead zones to make
this work and requiring rcu protection for the data structure in
question. This not just scales better it also introduces overflow
protection.
However, in contrast to generic rcuref, files require a memory
barrier and thus cannot rely on *_relaxed() atomic operations and
also require to be built on atomic_long_t as having massive amounts
of reference isn't unheard of even if it is just an attack.
This adds a file specific variant instead of making this a generic
library.
This has been tested by various people and it gives consistent
improvement up to 3-5% on workloads with loads of threads.
- Add a fastpath for find_next_zero_bit(). Skip 2-levels searching
via find_next_zero_bit() when there is a free slot in the word that
contains the next fd. This improves pts/blogbench-1.1.0 read by 8%
and write by 4% on Intel ICX 160.
- Conditionally clear full_fds_bits since it's very likely that a bit
in full_fds_bits has been cleared during __clear_open_fds(). This
improves pts/blogbench-1.1.0 read up to 13%, and write up to 5% on
Intel ICX 160.
- Get rid of all lookup_*_fdget_rcu() variants. They were used to
lookup files without taking a reference count. That became invalid
once files were switched to SLAB_TYPESAFE_BY_RCU and now we're
always taking a reference count. Switch to an already existing
helper and remove the legacy variants.
- Remove pointless includes of <linux/fdtable.h>.
- Avoid cmpxchg() in close_files() as nobody else has a reference to
the files_struct at that point.
- Move close_range() into fs/file.c and fold __close_range() into it.
- Cleanup calling conventions of alloc_fdtable() and expand_files().
- Merge __{set,clear}_close_on_exec() into one.
- Make __set_open_fd() set cloexec as well instead of doing it in two
separate steps"
* tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor
fs: port files to file_ref
fs: add file_ref
expand_files(): simplify calling conventions
make __set_open_fd() set cloexec state as well
fs: protect backing files with rcu
file.c: merge __{set,clear}_close_on_exec()
alloc_fdtable(): change calling conventions.
fs/file.c: add fast path in find_next_fd()
fs/file.c: conditionally clear full_fds
fs/file.c: remove sanity_check and add likely/unlikely in alloc_fd()
move close_range(2) into fs/file.c, fold __close_range() into it
close_files(): don't bother with xchg()
remove pointless includes of <linux/fdtable.h>
get rid of ...lookup...fdget_rcu() family
Diffstat (limited to 'tools')
-rw-r--r-- | tools/testing/selftests/filesystems/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/file_stressor.c | 194 |
3 files changed, 196 insertions, 1 deletions
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index f0c0ff20d6cf..828b66a10c63 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +file_stressor diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index c647fd6a0446..66305fc34c60 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts +TEST_GEN_PROGS := devpts_pts file_stressor TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c new file mode 100644 index 000000000000..1136f93a9977 --- /dev/null +++ b/tools/testing/selftests/filesystems/file_stressor.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ + +#include <fcntl.h> +#include <limits.h> +#include <pthread.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +#include <linux/types.h> +#include <linux/mount.h> +#include <sys/syscall.h> + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, + const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} + +static inline int sys_fsmount(int fd, unsigned int flags, + unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +static inline int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, + to_pathname, flags); +} + +FIXTURE(file_stressor) { + int fd_tmpfs; + int nr_procs; + int max_fds; + pid_t *pids_openers; + pid_t *pids_getdents; + int *fd_proc_pid; +}; + +FIXTURE_SETUP(file_stressor) +{ + int fd_context; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + self->fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(self->fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0); + + self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN); + self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs); + ASSERT_NE(self->pids_openers, NULL); + self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs); + ASSERT_NE(self->pids_getdents, NULL); + self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs); + ASSERT_NE(self->fd_proc_pid, NULL); + self->max_fds = 500; +} + +FIXTURE_TEARDOWN(file_stressor) +{ + for (int i = 0; i < self->nr_procs; i++) { + int wstatus; + pid_t pid; + + pid = waitpid(self->pids_openers[i], &wstatus, 0); + ASSERT_EQ(pid, self->pids_openers[i]); + ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus)); + + pid = waitpid(self->pids_getdents[i], &wstatus, 0); + ASSERT_EQ(pid, self->pids_getdents[i]); + ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus)); + } + free(self->pids_openers); + free(self->pids_getdents); + ASSERT_EQ(close(self->fd_tmpfs), 0); + + umount2("/slab_typesafe_by_rcu", 0); + ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0); +} + +TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2) +{ + for (int i = 0; i < self->nr_procs; i++) { + pid_t pid_self; + + self->pids_openers[i] = fork(); + ASSERT_GE(self->pids_openers[i], 0); + + if (self->pids_openers[i] != 0) + continue; + + self->pids_openers[i] = getpid(); + for (;;) { + for (int i = 0; i < self->max_fds; i++) { + char path[PATH_MAX]; + int fd; + + sprintf(path, "/slab_typesafe_by_rcu/file-%d-%d", self->pids_openers[i], i); + fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644); + if (fd < 0) + continue; + } + + close_range(3, ~0U, 0); + } + + exit(0); + } + + for (int i = 0; i < self->nr_procs; i++) { + char path[PATH_MAX]; + + sprintf(path, "/proc/%d/fd/", self->pids_openers[i]); + self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + ASSERT_GE(self->fd_proc_pid[i], 0); + } + + for (int i = 0; i < self->nr_procs; i++) { + self->pids_getdents[i] = fork(); + ASSERT_GE(self->pids_getdents[i], 0); + + if (self->pids_getdents[i] != 0) + continue; + + self->pids_getdents[i] = getpid(); + for (;;) { + char ents[1024]; + ssize_t nr_read; + + /* + * Concurrently read /proc/<pid>/fd/ which rougly does: + * + * f = fget_task_next(p, &fd); + * if (!f) + * break; + * data.mode = f->f_mode; + * fput(f); + * + * Which means that it'll try to get a reference to a + * file in another task's file descriptor table. + * + * Under heavy file load it is increasingly likely that + * the other task will manage to close @file and @file + * is being recycled due to SLAB_TYPEAFE_BY_RCU + * concurrently. This will trigger various warnings in + * the file reference counting code. + */ + do { + nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents)); + } while (nr_read >= 0); + + lseek(self->fd_proc_pid[i], 0, SEEK_SET); + } + + exit(0); + } + + ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0); + + for (int i = 0; i < self->nr_procs; i++) { + kill(self->pids_openers[i], SIGKILL); + kill(self->pids_getdents[i], SIGKILL); + } +} + +TEST_HARNESS_MAIN |