aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 09:59:00 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 09:59:00 -0800
commit5f85bd6aeceaecd0ff3a5ee827bf75eb6141ad55 (patch)
tree10077e22dc6b696f798ede82e844cbed8edd26a4 /tools
parentMerge tag 'vfs-6.14-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs (diff)
parentMerge patch series "pidfs: support bind-mounts" (diff)
downloadwireguard-linux-5f85bd6aeceaecd0ff3a5ee827bf75eb6141ad55.tar.xz
wireguard-linux-5f85bd6aeceaecd0ff3a5ee827bf75eb6141ad55.zip
Merge tag 'vfs-6.14-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pidfs updates from Christian Brauner: - Rework inode number allocation Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucical step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to lookup a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. Allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles avoiding to leak actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to lookup struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit sserspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. - Implement file handle support This is based on custom export operation methods which allows pidfs to implement permission checking and opening of pidfs file handles cleanly without hacking around in the core file handle code too much. - Support bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. Instead of checking d_ops for both nsfs and pidfs we could in a follow-up patch add a flag argument to struct dentry_operations that functions similar to file_operations->fop_flags. * tag 'vfs-6.14-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: selftests: add pidfd bind-mount tests pidfs: allow bind-mounts pidfs: lookup pid through rbtree selftests/pidfd: add pidfs file handle selftests pidfs: check for valid ioctl commands pidfs: implement file handle support exportfs: add permission method fhandle: pull CAP_DAC_READ_SEARCH check into may_decode_fh() exportfs: add open method fhandle: simplify error handling pseudofs: add support for export_ops pidfs: support FS_IOC_GETVERSION pidfs: remove 32bit inode number handling pidfs: rework inode number allocation
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/selftests/pidfd/.gitignore2
-rw-r--r--tools/testing/selftests/pidfd/Makefile3
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h39
-rw-r--r--tools/testing/selftests/pidfd/pidfd_bind_mount.c188
-rw-r--r--tools/testing/selftests/pidfd/pidfd_file_handle_test.c503
-rw-r--r--tools/testing/selftests/pidfd/pidfd_setns_test.c47
-rw-r--r--tools/testing/selftests/pidfd/pidfd_wait.c47
7 files changed, 756 insertions, 73 deletions
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 973198a3ec3d..bf92481f925c 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -6,3 +6,5 @@ pidfd_wait
pidfd_fdinfo_test
pidfd_getfd_test
pidfd_setns_test
+pidfd_file_handle_test
+pidfd_bind_mount
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index d731e3e76d5b..301343a11b62 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -2,7 +2,8 @@
CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
- pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
+ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
+ pidfd_file_handle_test pidfd_bind_mount
include ../lib.mk
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index 88d6830ee004..28a471c88c51 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -17,6 +17,7 @@
#include <sys/wait.h>
#include "../kselftest.h"
+#include "../clone3/clone3_selftests.h"
#ifndef P_PIDFD
#define P_PIDFD 3
@@ -68,6 +69,11 @@
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
+static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options)
+{
+ return syscall(__NR_waitid, which, pid, info, options, NULL);
+}
+
static inline int wait_for_pid(pid_t pid)
{
int status, ret;
@@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags)
return syscall(__NR_memfd_create, name, flags);
}
+static inline pid_t create_child(int *pidfd, unsigned flags)
+{
+ struct __clone_args args = {
+ .flags = CLONE_PIDFD | flags,
+ .exit_signal = SIGCHLD,
+ .pidfd = ptr_to_u64(pidfd),
+ };
+
+ return sys_clone3(&args, sizeof(struct __clone_args));
+}
+
+static inline ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = read(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+static inline ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = write(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
new file mode 100644
index 000000000000..7822dd080258
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+#ifndef __NR_open_tree
+ #if defined __alpha__
+ #define __NR_open_tree 538
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_open_tree 4428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_open_tree 6428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_open_tree 5428
+ #endif
+ #elif defined __ia64__
+ #define __NR_open_tree (428 + 1024)
+ #else
+ #define __NR_open_tree 428
+ #endif
+#endif
+
+#ifndef __NR_move_mount
+ #if defined __alpha__
+ #define __NR_move_mount 539
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_move_mount 4429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_move_mount 6429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_move_mount 5429
+ #endif
+ #elif defined __ia64__
+ #define __NR_move_mount (428 + 1024)
+ #else
+ #define __NR_move_mount 429
+ #endif
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
+{
+ return syscall(__NR_open_tree, dfd, filename, flags);
+}
+
+FIXTURE(pidfd_bind_mount) {
+ char template[PATH_MAX];
+ int fd_tmp;
+ int pidfd;
+ struct stat st1;
+ struct stat st2;
+ __u32 gen1;
+ __u32 gen2;
+ bool must_unmount;
+};
+
+FIXTURE_SETUP(pidfd_bind_mount)
+{
+ self->fd_tmp = -EBADF;
+ self->must_unmount = false;
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_LE(snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX"), PATH_MAX);
+ self->fd_tmp = mkstemp(self->template);
+ ASSERT_GE(self->fd_tmp, 0);
+ self->pidfd = sys_pidfd_open(getpid(), 0);
+ ASSERT_GE(self->pidfd, 0);
+ ASSERT_GE(fstat(self->pidfd, &self->st1), 0);
+ ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0);
+}
+
+FIXTURE_TEARDOWN(pidfd_bind_mount)
+{
+ ASSERT_EQ(close(self->fd_tmp), 0);
+ if (self->must_unmount)
+ ASSERT_EQ(umount2(self->template, 0), 0);
+ ASSERT_EQ(unlink(self->template), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy.
+ */
+TEST_F(pidfd_bind_mount, bind_mount)
+{
+ int fd_tree;
+
+ fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+ ASSERT_GE(fd_tree, 0);
+
+ ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+ self->must_unmount = true;
+
+ ASSERT_EQ(close(fd_tree), 0);
+}
+
+/* Test that a pidfd can be reopened through procfs. */
+TEST_F(pidfd_bind_mount, reopen)
+{
+ int pidfd;
+ char proc_path[PATH_MAX];
+
+ sprintf(proc_path, "/proc/self/fd/%d", self->pidfd);
+ pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_GE(fstat(self->pidfd, &self->st2), 0);
+ ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen2), 0);
+
+ ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+ ASSERT_TRUE(self->gen1 == self->gen2);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy and reopened.
+ */
+TEST_F(pidfd_bind_mount, bind_mount_reopen)
+{
+ int fd_tree, fd_pidfd_mnt;
+
+ fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+ ASSERT_GE(fd_tree, 0);
+
+ ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+ self->must_unmount = true;
+
+ fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+ ASSERT_GE(fd_pidfd_mnt, 0);
+
+ ASSERT_GE(fstat(fd_tree, &self->st2), 0);
+ ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0);
+
+ ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+ ASSERT_TRUE(self->gen1 == self->gen2);
+
+ ASSERT_EQ(close(fd_tree), 0);
+ ASSERT_EQ(close(fd_pidfd_mnt), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
new file mode 100644
index 000000000000..439b9c6c0457
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+FIXTURE(file_handle)
+{
+ pid_t pid;
+ int pidfd;
+
+ pid_t child_pid1;
+ int child_pidfd1;
+
+ pid_t child_pid2;
+ int child_pidfd2;
+
+ pid_t child_pid3;
+ int child_pidfd3;
+};
+
+FIXTURE_SETUP(file_handle)
+{
+ int ret;
+ int ipc_sockets[2];
+ char c;
+
+ self->pid = getpid();
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ ASSERT_GE(self->pidfd, 0);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
+ EXPECT_GE(self->child_pid1, 0);
+
+ if (self->child_pid1 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
+ EXPECT_GE(self->child_pid2, 0);
+
+ if (self->child_pid2 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID);
+ EXPECT_GE(self->child_pid3, 0);
+
+ if (self->child_pid3 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+}
+
+FIXTURE_TEARDOWN(file_handle)
+{
+ EXPECT_EQ(close(self->pidfd), 0);
+
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0);
+ if (self->child_pidfd1 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd1));
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0);
+ if (self->child_pidfd2 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd2));
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
+
+ if (self->child_pidfd3 >= 0) {
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(0, close(self->child_pidfd3));
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+ }
+}
+
+/*
+ * Test that we can decode a pidfs file handle in the same pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_same_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle from a child pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_child_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle from an ancestor
+ * child pid namespace.
+ */
+TEST_F(file_handle, file_handle_foreign_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ pid_t pid;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ if (pidfd >= 0) {
+ TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy");
+ _exit(1);
+ }
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle of a process that has
+ * exited but not been reaped.
+ */
+TEST_F(file_handle, pid_has_exited)
+{
+ int mnt_id, pidfd, child_pidfd3;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ child_pidfd3 = self->child_pidfd3;
+ self->child_pidfd3 = -EBADF;
+ EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(close(child_pidfd3), 0);
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle of a process that has
+ * already been reaped.
+ */
+TEST_F(file_handle, pid_has_been_reaped)
+{
+ int mnt_id, pidfd, child_pidfd3;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ child_pidfd3 = self->child_pidfd3;
+ self->child_pidfd3 = -EBADF;
+ EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(close(child_pidfd3), 0);
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_LT(pidfd, 0);
+}
+
+/*
+ * Test valid flags to open a pidfd file handle. Note, that
+ * PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to
+ * O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL.
+ */
+TEST_F(file_handle, open_by_handle_at_valid_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh,
+ O_RDONLY |
+ O_WRONLY |
+ O_RDWR |
+ O_NONBLOCK |
+ O_NDELAY |
+ O_CLOEXEC |
+ O_EXCL);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that invalid flags passed to open a pidfd file handle are
+ * rejected.
+ */
+TEST_F(file_handle, open_by_handle_at_invalid_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ static const struct invalid_pidfs_file_handle_flags {
+ int oflag;
+ const char *oflag_name;
+ } invalid_pidfs_file_handle_flags[] = {
+ { FASYNC, "FASYNC" },
+ { O_CREAT, "O_CREAT" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_CREAT, "O_CREAT" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_APPEND, "O_APPEND" },
+ { O_SYNC, "O_SYNC" },
+ { O_DSYNC, "O_DSYNC" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_NOATIME, "O_NOATIME" },
+ { O_PATH, "O_PATH" },
+ { O_TMPFILE, "O_TMPFILE" },
+ /*
+ * O_LARGEFILE is added implicitly by
+ * open_by_handle_at() so pidfs simply masks it off.
+ */
+ };
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) {
+ pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag);
+ ASSERT_LT(pidfd, 0) {
+ TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name);
+ }
+ }
+}
+
+/* Test that lookup fails. */
+TEST_F(file_handle, lookup_must_fail)
+{
+ int mnt_id;
+ struct file_handle *fh;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0);
+ ASSERT_EQ(errno, ENOTDIR);
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0);
+ ASSERT_EQ(errno, ENOTDIR);
+}
+
+#ifndef AT_HANDLE_CONNECTABLE
+#define AT_HANDLE_CONNECTABLE 0x002
+#endif
+
+/*
+ * Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles
+ * don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE
+ * is rejected because it is incompatible with AT_EMPTY_PATH which is
+ * required with pidfds as we don't support lookup.
+ */
+TEST_F(file_handle, invalid_name_to_handle_at_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0);
+}
+
+#ifndef AT_HANDLE_FID
+#define AT_HANDLE_FID 0x200
+#endif
+
+/*
+ * Test that a request with AT_HANDLE_FID always leads to decodable file
+ * handle as pidfs always provides export operations.
+ */
+TEST_F(file_handle, valid_name_to_handle_at_flags)
+{
+ int mnt_id, pidfd;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 7c2a4349170a..222f8131283b 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -19,7 +19,6 @@
#include <linux/ioctl.h>
#include "pidfd.h"
-#include "../clone3/clone3_selftests.h"
#include "../kselftest_harness.h"
#ifndef PIDFS_IOCTL_MAGIC
@@ -118,22 +117,6 @@ FIXTURE(current_nsset)
int child_pidfd_derived_nsfds2[PIDFD_NS_MAX];
};
-static int sys_waitid(int which, pid_t pid, int options)
-{
- return syscall(__NR_waitid, which, pid, NULL, options, NULL);
-}
-
-pid_t create_child(int *pidfd, unsigned flags)
-{
- struct __clone_args args = {
- .flags = CLONE_PIDFD | flags,
- .exit_signal = SIGCHLD,
- .pidfd = ptr_to_u64(pidfd),
- };
-
- return sys_clone3(&args, sizeof(struct clone_args));
-}
-
static bool switch_timens(void)
{
int fd, ret;
@@ -150,28 +133,6 @@ static bool switch_timens(void)
return ret == 0;
}
-static ssize_t read_nointr(int fd, void *buf, size_t count)
-{
- ssize_t ret;
-
- do {
- ret = read(fd, buf, count);
- } while (ret < 0 && errno == EINTR);
-
- return ret;
-}
-
-static ssize_t write_nointr(int fd, const void *buf, size_t count)
-{
- ssize_t ret;
-
- do {
- ret = write(fd, buf, count);
- } while (ret < 0 && errno == EINTR);
-
- return ret;
-}
-
FIXTURE_SETUP(current_nsset)
{
int i, proc_fd, ret;
@@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset)
_exit(EXIT_SUCCESS);
}
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0);
self->pidfd = sys_pidfd_open(self->pid, 0);
EXPECT_GE(self->pidfd, 0) {
@@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset)
EXPECT_EQ(0, close(self->child_pidfd1));
if (self->child_pidfd2 >= 0)
EXPECT_EQ(0, close(self->child_pidfd2));
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
}
static int preserve_ns(const int pid, const char *ns)
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
index 0dcb8365ddc3..1e2d49751cde 100644
--- a/tools/testing/selftests/pidfd/pidfd_wait.c
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -26,22 +26,11 @@
#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
#endif
-static pid_t sys_clone3(struct clone_args *args)
-{
- return syscall(__NR_clone3, args, sizeof(struct clone_args));
-}
-
-static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
- struct rusage *ru)
-{
- return syscall(__NR_waitid, which, pid, info, options, ru);
-}
-
TEST(wait_simple)
{
int pidfd = -1;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -55,7 +44,7 @@ TEST(wait_simple)
pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
@@ -63,18 +52,18 @@ TEST(wait_simple)
pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0)
exit(EXIT_SUCCESS);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_GE(pid, 0);
ASSERT_EQ(WIFEXITED(info.si_status), true);
ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
@@ -89,7 +78,7 @@ TEST(wait_states)
{
int pidfd = -1;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -102,7 +91,7 @@ TEST(wait_states)
};
ASSERT_EQ(pipe(pfd), 0);
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@@ -117,28 +106,28 @@ TEST(wait_states)
}
close(pfd[0]);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0);
ASSERT_EQ(write(pfd[1], "C", 1), 1);
close(pfd[1]);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_CONTINUED);
ASSERT_EQ(info.si_pid, parent_tid);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_KILLED);
ASSERT_EQ(info.si_pid, parent_tid);
@@ -151,7 +140,7 @@ TEST(wait_nonblock)
int pidfd;
unsigned int flags = 0;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.flags = CLONE_PARENT_SETTID,
.exit_signal = SIGCHLD,
@@ -173,12 +162,12 @@ TEST(wait_nonblock)
SKIP(return, "Skipping PIDFD_NONBLOCK test");
}
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, ECHILD);
EXPECT_EQ(close(pidfd), 0);
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@@ -201,7 +190,7 @@ TEST(wait_nonblock)
* Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
* child processes exist but none have exited.
*/
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, EAGAIN);
@@ -210,19 +199,19 @@ TEST(wait_nonblock)
* WNOHANG raised explicitly when child processes exist but none have
* exited.
*/
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG);
ASSERT_EQ(ret, 0);
ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_EXITED);
ASSERT_EQ(info.si_pid, parent_tid);