aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/testing/selftests/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing/selftests/filesystems')
-rw-r--r--tools/testing/selftests/filesystems/.gitignore2
-rw-r--r--tools/testing/selftests/filesystems/Makefile4
-rw-r--r--tools/testing/selftests/filesystems/anon_inode_test.c69
-rw-r--r--tools/testing/selftests/filesystems/binderfs/Makefile4
-rw-r--r--tools/testing/selftests/filesystems/binderfs/binderfs_test.c31
-rw-r--r--tools/testing/selftests/filesystems/binderfs/config1
-rw-r--r--tools/testing/selftests/filesystems/epoll/Makefile2
-rw-r--r--tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c211
-rw-r--r--tools/testing/selftests/filesystems/eventfd/.gitignore2
-rw-r--r--tools/testing/selftests/filesystems/eventfd/Makefile7
-rw-r--r--tools/testing/selftests/filesystems/eventfd/eventfd_test.c311
-rw-r--r--tools/testing/selftests/filesystems/fat/.gitignore2
-rw-r--r--tools/testing/selftests/filesystems/fat/Makefile7
-rw-r--r--tools/testing/selftests/filesystems/fat/config2
-rw-r--r--tools/testing/selftests/filesystems/fat/rename_exchange.c37
-rwxr-xr-xtools/testing/selftests/filesystems/fat/run_fat_tests.sh82
-rw-r--r--tools/testing/selftests/filesystems/file_stressor.c194
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/.gitignore3
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/Makefile11
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c529
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c557
-rw-r--r--tools/testing/selftests/filesystems/nsfs/.gitignore4
-rw-r--r--tools/testing/selftests/filesystems/nsfs/Makefile6
-rw-r--r--tools/testing/selftests/filesystems/nsfs/config3
-rw-r--r--tools/testing/selftests/filesystems/nsfs/iterate_mntns.c163
-rw-r--r--tools/testing/selftests/filesystems/nsfs/owner.c92
-rw-r--r--tools/testing/selftests/filesystems/nsfs/pidns.c79
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/.gitignore3
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/Makefile14
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c160
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/log.h26
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c720
-rw-r--r--tools/testing/selftests/filesystems/statmount/.gitignore3
-rw-r--r--tools/testing/selftests/filesystems/statmount/Makefile10
-rw-r--r--tools/testing/selftests/filesystems/statmount/listmount_test.c66
-rw-r--r--tools/testing/selftests/filesystems/statmount/statmount.h82
-rw-r--r--tools/testing/selftests/filesystems/statmount/statmount_test.c702
-rw-r--r--tools/testing/selftests/filesystems/statmount/statmount_test_ns.c291
-rw-r--r--tools/testing/selftests/filesystems/utils.c589
-rw-r--r--tools/testing/selftests/filesystems/utils.h48
-rw-r--r--tools/testing/selftests/filesystems/wrappers.h108
41 files changed, 5224 insertions, 13 deletions
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index f0c0ff20d6cf..7afa58e2bb20 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
dnotify_test
devpts_pts
+file_stressor
+anon_inode_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 129880fb42d3..b02326193fee 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -I../../../../usr/include/
-TEST_GEN_PROGS := devpts_pts
+CFLAGS += $(KHDR_INCLUDES)
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
new file mode 100644
index 000000000000..73e0a4d4fb2f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+#include "wrappers.h"
+
+TEST(anon_inode_no_chown)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(fchown(fd_context, 1234, 5678), 0);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_chmod)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(fchmod(fd_context, 0777), 0);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_exec)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ ASSERT_EQ(errno, EACCES);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_open)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_GE(dup2(fd_context, 500), 0);
+ ASSERT_EQ(close(fd_context), 0);
+ fd_context = 500;
+
+ ASSERT_LT(open("/proc/self/fd/500", 0), 0);
+ ASSERT_EQ(errno, ENXIO);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST_HARNESS_MAIN
+
diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile
index 8af25ae96049..eb4c3b411934 100644
--- a/tools/testing/selftests/filesystems/binderfs/Makefile
+++ b/tools/testing/selftests/filesystems/binderfs/Makefile
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -I../../../../../usr/include/ -pthread
+CFLAGS += $(KHDR_INCLUDES) -pthread
TEST_GEN_PROGS := binderfs_test
-binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h
-
include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
index 1d27f52c61e6..81db85a5cc16 100644
--- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
+++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
@@ -57,11 +57,15 @@ static int __do_binderfs_test(struct __test_metadata *_metadata)
{
int fd, ret, saved_errno, result = 1;
size_t len;
- ssize_t wret;
struct binderfs_device device = { 0 };
struct binder_version version = { 0 };
char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX",
device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME];
+ static const char * const binder_features[] = {
+ "oneway_spam_detection",
+ "extended_error",
+ "freeze_notification",
+ };
change_mountns(_metadata);
@@ -74,7 +78,7 @@ static int __do_binderfs_test(struct __test_metadata *_metadata)
ret = mount(NULL, binderfs_mntpt, "binder", 0, 0);
EXPECT_EQ(ret, 0) {
if (errno == ENODEV)
- XFAIL(goto out, "binderfs missing");
+ SKIP(goto out, "binderfs missing");
TH_LOG("%s - Failed to mount binderfs", strerror(errno));
goto rmdir;
}
@@ -150,6 +154,20 @@ static int __do_binderfs_test(struct __test_metadata *_metadata)
}
/* success: binder-control device removal failed as expected */
+
+ for (int i = 0; i < ARRAY_SIZE(binder_features); i++) {
+ snprintf(device_path, sizeof(device_path), "%s/features/%s",
+ binderfs_mntpt, binder_features[i]);
+ fd = open(device_path, O_CLOEXEC | O_RDONLY);
+ EXPECT_GE(fd, 0) {
+ TH_LOG("%s - Failed to open binder feature: %s",
+ strerror(errno), binder_features[i]);
+ goto umount;
+ }
+ close(fd);
+ }
+
+ /* success: binder feature files found */
result = 0;
umount:
@@ -395,7 +413,8 @@ TEST(binderfs_stress)
ret = mount(NULL, binderfs_mntpt, "binder", 0, 0);
ASSERT_EQ(ret, 0) {
- TH_LOG("%s - Failed to mount binderfs", strerror(errno));
+ TH_LOG("%s - Failed to mount binderfs, check if CONFIG_ANDROID_BINDERFS is enabled in the running kernel",
+ strerror(errno));
}
for (int i = 0; i < ARRAY_SIZE(fds); i++) {
@@ -475,10 +494,10 @@ TEST(binderfs_stress)
TEST(binderfs_test_privileged)
{
if (geteuid() != 0)
- XFAIL(return, "Tests are not run as root. Skipping privileged tests");
+ SKIP(return, "Tests are not run as root. Skipping privileged tests");
if (__do_binderfs_test(_metadata))
- XFAIL(return, "The Android binderfs filesystem is not available");
+ SKIP(return, "The Android binderfs filesystem is not available");
}
TEST(binderfs_test_unprivileged)
@@ -511,7 +530,7 @@ TEST(binderfs_test_unprivileged)
ret = wait_for_pid(pid);
if (ret) {
if (ret == 2)
- XFAIL(return, "The Android binderfs filesystem is not available");
+ SKIP(return, "The Android binderfs filesystem is not available");
ASSERT_EQ(ret, 0) {
TH_LOG("wait_for_pid() failed");
}
diff --git a/tools/testing/selftests/filesystems/binderfs/config b/tools/testing/selftests/filesystems/binderfs/config
index 02dd6cc9cf99..7b4fc6ee6205 100644
--- a/tools/testing/selftests/filesystems/binderfs/config
+++ b/tools/testing/selftests/filesystems/binderfs/config
@@ -1,3 +1,2 @@
-CONFIG_ANDROID=y
CONFIG_ANDROID_BINDERFS=y
CONFIG_ANDROID_BINDER_IPC=y
diff --git a/tools/testing/selftests/filesystems/epoll/Makefile b/tools/testing/selftests/filesystems/epoll/Makefile
index 78ae4aaf7141..0788a7dc8004 100644
--- a/tools/testing/selftests/filesystems/epoll/Makefile
+++ b/tools/testing/selftests/filesystems/epoll/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -I../../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
LDLIBS += -lpthread
TEST_GEN_PROGS := epoll_wakeup_test
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index d979ff14775a..65ede506305c 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
+#include <asm/unistd.h>
+#include <linux/time_types.h>
#include <poll.h>
#include <unistd.h>
#include <assert.h>
@@ -21,6 +23,19 @@ struct epoll_mtcontext
pthread_t waiter;
};
+#ifndef __NR_epoll_pwait2
+#define __NR_epoll_pwait2 -1
+#endif
+
+static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
+ int maxevents,
+ const struct __kernel_timespec *timeout,
+ const sigset_t *sigset, size_t sigsetsize)
+{
+ return syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
+ sigset, sigsetsize);
+}
+
static void signal_handler(int signum)
{
}
@@ -3282,4 +3297,200 @@ TEST(epoll60)
close(ctx.epfd);
}
+struct epoll61_ctx {
+ int epfd;
+ int evfd;
+};
+
+static void *epoll61_write_eventfd(void *ctx_)
+{
+ struct epoll61_ctx *ctx = ctx_;
+ int64_t l = 1;
+
+ usleep(10950);
+ write(ctx->evfd, &l, sizeof(l));
+ return NULL;
+}
+
+static void *epoll61_epoll_with_timeout(void *ctx_)
+{
+ struct epoll61_ctx *ctx = ctx_;
+ struct epoll_event events[1];
+ int n;
+
+ n = epoll_wait(ctx->epfd, events, 1, 11);
+ /*
+ * If epoll returned the eventfd, write on the eventfd to wake up the
+ * blocking poller.
+ */
+ if (n == 1) {
+ int64_t l = 1;
+
+ write(ctx->evfd, &l, sizeof(l));
+ }
+ return NULL;
+}
+
+static void *epoll61_blocking_epoll(void *ctx_)
+{
+ struct epoll61_ctx *ctx = ctx_;
+ struct epoll_event events[1];
+
+ epoll_wait(ctx->epfd, events, 1, -1);
+ return NULL;
+}
+
+TEST(epoll61)
+{
+ struct epoll61_ctx ctx;
+ struct epoll_event ev;
+ int i, r;
+
+ ctx.epfd = epoll_create1(0);
+ ASSERT_GE(ctx.epfd, 0);
+ ctx.evfd = eventfd(0, EFD_NONBLOCK);
+ ASSERT_GE(ctx.evfd, 0);
+
+ ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLHUP;
+ ev.data.ptr = NULL;
+ r = epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd, &ev);
+ ASSERT_EQ(r, 0);
+
+ /*
+ * We are testing a race. Repeat the test case 1000 times to make it
+ * more likely to fail in case of a bug.
+ */
+ for (i = 0; i < 1000; i++) {
+ pthread_t threads[3];
+ int n;
+
+ /*
+ * Start 3 threads:
+ * Thread 1 sleeps for 10.9ms and writes to the evenfd.
+ * Thread 2 calls epoll with a timeout of 11ms.
+ * Thread 3 calls epoll with a timeout of -1.
+ *
+ * The eventfd write by Thread 1 should either wakeup Thread 2
+ * or Thread 3. If it wakes up Thread 2, Thread 2 writes on the
+ * eventfd to wake up Thread 3.
+ *
+ * If no events are missed, all three threads should eventually
+ * be joinable.
+ */
+ ASSERT_EQ(pthread_create(&threads[0], NULL,
+ epoll61_write_eventfd, &ctx), 0);
+ ASSERT_EQ(pthread_create(&threads[1], NULL,
+ epoll61_epoll_with_timeout, &ctx), 0);
+ ASSERT_EQ(pthread_create(&threads[2], NULL,
+ epoll61_blocking_epoll, &ctx), 0);
+
+ for (n = 0; n < ARRAY_SIZE(threads); ++n)
+ ASSERT_EQ(pthread_join(threads[n], NULL), 0);
+ }
+
+ close(ctx.epfd);
+ close(ctx.evfd);
+}
+
+/* Equivalent to basic test epoll1, but exercising epoll_pwait2. */
+TEST(epoll62)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/* Epoll_pwait2 basic timeout test. */
+TEST(epoll63)
+{
+ const int cfg_delay_ms = 10;
+ unsigned long long tdiff;
+ struct __kernel_timespec ts;
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = cfg_delay_ms * 1000 * 1000;
+
+ tdiff = msecs();
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
+ tdiff = msecs() - tdiff;
+
+ EXPECT_GE(tdiff, cfg_delay_ms);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * s0
+ */
+TEST(epoll64)
+{
+ pthread_t waiter[2];
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ /*
+ * main will act as the emitter once both waiter threads are
+ * blocked and expects to both be awoken upon the ready event.
+ */
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0);
+
+ usleep(100000);
+ ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1);
+
+ ASSERT_EQ(pthread_join(waiter[0], NULL), 0);
+ ASSERT_EQ(pthread_join(waiter[1], NULL), 0);
+
+ EXPECT_EQ(ctx.count, 2);
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/eventfd/.gitignore b/tools/testing/selftests/filesystems/eventfd/.gitignore
new file mode 100644
index 000000000000..483faf59fe4a
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+eventfd_test
diff --git a/tools/testing/selftests/filesystems/eventfd/Makefile b/tools/testing/selftests/filesystems/eventfd/Makefile
new file mode 100644
index 000000000000..0a8e3910df15
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lpthread
+TEST_GEN_PROGS := eventfd_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
new file mode 100644
index 000000000000..72d51ad0ee0e
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <linux/time_types.h>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include "../../kselftest_harness.h"
+
+#define EVENTFD_TEST_ITERATIONS 100000UL
+
+struct error {
+ int code;
+ char msg[512];
+};
+
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ if (code == 0 || !err || err->code != 0)
+ return code;
+
+ err->code = code;
+ va_start(args, fmt);
+ r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
+ assert((size_t)r < sizeof(err->msg));
+ va_end(args);
+
+ return code;
+}
+
+static inline int sys_eventfd2(unsigned int count, int flags)
+{
+ return syscall(__NR_eventfd2, count, flags);
+}
+
+TEST(eventfd_check_flag_rdwr)
+{
+ int fd, flags;
+
+ fd = sys_eventfd2(0, 0);
+ ASSERT_GE(fd, 0);
+
+ flags = fcntl(fd, F_GETFL);
+ // The kernel automatically adds the O_RDWR flag.
+ EXPECT_EQ(flags, O_RDWR);
+
+ close(fd);
+}
+
+TEST(eventfd_check_flag_cloexec)
+{
+ int fd, flags;
+
+ fd = sys_eventfd2(0, EFD_CLOEXEC);
+ ASSERT_GE(fd, 0);
+
+ flags = fcntl(fd, F_GETFD);
+ ASSERT_GT(flags, -1);
+ EXPECT_EQ(flags, FD_CLOEXEC);
+
+ close(fd);
+}
+
+TEST(eventfd_check_flag_nonblock)
+{
+ int fd, flags;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ flags = fcntl(fd, F_GETFL);
+ ASSERT_GT(flags, -1);
+ EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+ EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+ close(fd);
+}
+
+TEST(eventfd_check_flag_cloexec_and_nonblock)
+{
+ int fd, flags;
+
+ fd = sys_eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ flags = fcntl(fd, F_GETFL);
+ ASSERT_GT(flags, -1);
+ EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+ EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+ flags = fcntl(fd, F_GETFD);
+ ASSERT_GT(flags, -1);
+ EXPECT_EQ(flags, FD_CLOEXEC);
+
+ close(fd);
+}
+
+static inline void trim_newline(char *str)
+{
+ char *pos = strrchr(str, '\n');
+
+ if (pos)
+ *pos = '\0';
+}
+
+static int verify_fdinfo(int fd, struct error *err, const char *prefix,
+ size_t prefix_len, const char *expect, ...)
+{
+ char buffer[512] = {0, };
+ char path[512] = {0, };
+ va_list args;
+ FILE *f;
+ char *line = NULL;
+ size_t n = 0;
+ int found = 0;
+ int r;
+
+ va_start(args, expect);
+ r = vsnprintf(buffer, sizeof(buffer), expect, args);
+ assert((size_t)r < sizeof(buffer));
+ va_end(args);
+
+ snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
+ f = fopen(path, "re");
+ if (!f)
+ return error_set(err, -1, "fdinfo open failed for %d", fd);
+
+ while (getline(&line, &n, f) != -1) {
+ char *val;
+
+ if (strncmp(line, prefix, prefix_len))
+ continue;
+
+ found = 1;
+
+ val = line + prefix_len;
+ r = strcmp(val, buffer);
+ if (r != 0) {
+ trim_newline(line);
+ trim_newline(buffer);
+ error_set(err, -1, "%s '%s' != '%s'",
+ prefix, val, buffer);
+ }
+ break;
+ }
+
+ free(line);
+ fclose(f);
+
+ if (found == 0)
+ return error_set(err, -1, "%s not found for fd %d",
+ prefix, fd);
+
+ return 0;
+}
+
+TEST(eventfd_check_flag_semaphore)
+{
+ struct error err = {0};
+ int fd, ret;
+
+ fd = sys_eventfd2(0, EFD_SEMAPHORE);
+ ASSERT_GE(fd, 0);
+
+ ret = fcntl(fd, F_GETFL);
+ ASSERT_GT(ret, -1);
+ EXPECT_EQ(ret & O_RDWR, O_RDWR);
+
+ // The semaphore could only be obtained from fdinfo.
+ ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n");
+ if (ret != 0)
+ ksft_print_msg("eventfd semaphore flag check failed: %s\n", err.msg);
+ EXPECT_EQ(ret, 0);
+
+ close(fd);
+}
+
+/*
+ * A write(2) fails with the error EINVAL if the size of the supplied buffer
+ * is less than 8 bytes, or if an attempt is made to write the value
+ * 0xffffffffffffffff.
+ */
+TEST(eventfd_check_write)
+{
+ uint64_t value = 1;
+ ssize_t size;
+ int fd;
+
+ fd = sys_eventfd2(0, 0);
+ ASSERT_GE(fd, 0);
+
+ size = write(fd, &value, sizeof(int));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+
+ value = (uint64_t)-1;
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ close(fd);
+}
+
+/*
+ * A read(2) fails with the error EINVAL if the size of the supplied buffer is
+ * less than 8 bytes.
+ */
+TEST(eventfd_check_read)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+
+ fd = sys_eventfd2(1, 0);
+ ASSERT_GE(fd, 0);
+
+ size = read(fd, &value, sizeof(int));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ EXPECT_EQ(value, 1);
+
+ close(fd);
+}
+
+
+/*
+ * If EFD_SEMAPHORE was not specified and the eventfd counter has a nonzero
+ * value, then a read(2) returns 8 bytes containing that value, and the
+ * counter's value is reset to zero.
+ * If the eventfd counter is zero at the time of the call to read(2), then the
+ * call fails with the error EAGAIN if the file descriptor has been made nonblocking.
+ */
+TEST(eventfd_check_read_with_nonsemaphore)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+ int i;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ value = 1;
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ }
+
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(uint64_t));
+ EXPECT_EQ(value, EVENTFD_TEST_ITERATIONS);
+
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EAGAIN);
+
+ close(fd);
+}
+
+/*
+ * If EFD_SEMAPHORE was specified and the eventfd counter has a nonzero value,
+ * then a read(2) returns 8 bytes containing the value 1, and the counter's
+ * value is decremented by 1.
+ * If the eventfd counter is zero at the time of the call to read(2), then the
+ * call fails with the error EAGAIN if the file descriptor has been made nonblocking.
+ */
+TEST(eventfd_check_read_with_semaphore)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+ int i;
+
+ fd = sys_eventfd2(0, EFD_SEMAPHORE|EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ value = 1;
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ }
+
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ EXPECT_EQ(value, 1);
+ }
+
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EAGAIN);
+
+ close(fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/fat/.gitignore b/tools/testing/selftests/filesystems/fat/.gitignore
new file mode 100644
index 000000000000..b89920ed841c
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rename_exchange
diff --git a/tools/testing/selftests/filesystems/fat/Makefile b/tools/testing/selftests/filesystems/fat/Makefile
new file mode 100644
index 000000000000..902033f6ef09
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := run_fat_tests.sh
+TEST_GEN_PROGS_EXTENDED := rename_exchange
+CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES)
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/fat/config b/tools/testing/selftests/filesystems/fat/config
new file mode 100644
index 000000000000..6cf95e787a17
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/config
@@ -0,0 +1,2 @@
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_VFAT_FS=y
diff --git a/tools/testing/selftests/filesystems/fat/rename_exchange.c b/tools/testing/selftests/filesystems/fat/rename_exchange.c
new file mode 100644
index 000000000000..e488ad354fce
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/rename_exchange.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Program that atomically exchanges two paths using
+ * the renameat2() system call RENAME_EXCHANGE flag.
+ *
+ * Copyright 2022 Red Hat Inc.
+ * Author: Javier Martinez Canillas <javierm@redhat.com>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void print_usage(const char *program)
+{
+ printf("Usage: %s [oldpath] [newpath]\n", program);
+ printf("Atomically exchange oldpath and newpath\n");
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc != 3) {
+ print_usage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ret = renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2], RENAME_EXCHANGE);
+ if (ret) {
+ perror("rename exchange failed");
+ exit(EXIT_FAILURE);
+ }
+
+ exit(EXIT_SUCCESS);
+}
diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
new file mode 100755
index 000000000000..d61264d4795d
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run filesystem operations tests on an 1 MiB disk image that is formatted with
+# a vfat filesystem and mounted in a temporary directory using a loop device.
+#
+# Copyright 2022 Red Hat Inc.
+# Author: Javier Martinez Canillas <javierm@redhat.com>
+
+set -e
+set -u
+set -o pipefail
+
+BASE_DIR="$(dirname $0)"
+TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXXXX)"
+IMG_PATH="${TMP_DIR}/fat.img"
+MNT_PATH="${TMP_DIR}/mnt"
+
+cleanup()
+{
+ mountpoint -q "${MNT_PATH}" && unmount_image
+ rm -rf "${TMP_DIR}"
+}
+trap cleanup SIGINT SIGTERM EXIT
+
+create_loopback()
+{
+ touch "${IMG_PATH}"
+ chattr +C "${IMG_PATH}" >/dev/null 2>&1 || true
+
+ truncate -s 1M "${IMG_PATH}"
+ mkfs.vfat "${IMG_PATH}" >/dev/null 2>&1
+}
+
+mount_image()
+{
+ mkdir -p "${MNT_PATH}"
+ sudo mount -o loop "${IMG_PATH}" "${MNT_PATH}"
+}
+
+rename_exchange_test()
+{
+ local rename_exchange="${BASE_DIR}/rename_exchange"
+ local old_path="${MNT_PATH}/old_file"
+ local new_path="${MNT_PATH}/new_file"
+
+ echo old | sudo tee "${old_path}" >/dev/null 2>&1
+ echo new | sudo tee "${new_path}" >/dev/null 2>&1
+ sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1
+ sudo sync -f "${MNT_PATH}"
+ grep new "${old_path}" >/dev/null 2>&1
+ grep old "${new_path}" >/dev/null 2>&1
+}
+
+rename_exchange_subdir_test()
+{
+ local rename_exchange="${BASE_DIR}/rename_exchange"
+ local dir_path="${MNT_PATH}/subdir"
+ local old_path="${MNT_PATH}/old_file"
+ local new_path="${dir_path}/new_file"
+
+ sudo mkdir -p "${dir_path}"
+ echo old | sudo tee "${old_path}" >/dev/null 2>&1
+ echo new | sudo tee "${new_path}" >/dev/null 2>&1
+ sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1
+ sudo sync -f "${MNT_PATH}"
+ grep new "${old_path}" >/dev/null 2>&1
+ grep old "${new_path}" >/dev/null 2>&1
+}
+
+unmount_image()
+{
+ sudo umount "${MNT_PATH}" &> /dev/null
+}
+
+create_loopback
+mount_image
+rename_exchange_test
+rename_exchange_subdir_test
+unmount_image
+
+exit 0
diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c
new file mode 100644
index 000000000000..01dd89f8e52f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/file_stressor.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+ const char *value, int aux)
+{
+ return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+ unsigned int attr_flags)
+{
+ return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+FIXTURE(file_stressor) {
+ int fd_tmpfs;
+ int nr_procs;
+ int max_fds;
+ pid_t *pids_openers;
+ pid_t *pids_getdents;
+ int *fd_proc_pid;
+};
+
+FIXTURE_SETUP(file_stressor)
+{
+ int fd_context;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+ ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ self->fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(self->fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN);
+ self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs);
+ ASSERT_NE(self->pids_openers, NULL);
+ self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs);
+ ASSERT_NE(self->pids_getdents, NULL);
+ self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs);
+ ASSERT_NE(self->fd_proc_pid, NULL);
+ self->max_fds = 500;
+}
+
+FIXTURE_TEARDOWN(file_stressor)
+{
+ for (int i = 0; i < self->nr_procs; i++) {
+ int wstatus;
+ pid_t pid;
+
+ pid = waitpid(self->pids_openers[i], &wstatus, 0);
+ ASSERT_EQ(pid, self->pids_openers[i]);
+ ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+
+ pid = waitpid(self->pids_getdents[i], &wstatus, 0);
+ ASSERT_EQ(pid, self->pids_getdents[i]);
+ ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+ }
+ free(self->pids_openers);
+ free(self->pids_getdents);
+ ASSERT_EQ(close(self->fd_tmpfs), 0);
+
+ umount2("/slab_typesafe_by_rcu", 0);
+ ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0);
+}
+
+TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
+{
+ for (int i = 0; i < self->nr_procs; i++) {
+ pid_t pid_self;
+
+ self->pids_openers[i] = fork();
+ ASSERT_GE(self->pids_openers[i], 0);
+
+ if (self->pids_openers[i] != 0)
+ continue;
+
+ self->pids_openers[i] = getpid();
+ for (;;) {
+ for (int i = 0; i < self->max_fds; i++) {
+ char path[PATH_MAX];
+ int fd;
+
+ sprintf(path, "/slab_typesafe_by_rcu/file-%d-%d", self->pids_openers[i], i);
+ fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644);
+ if (fd < 0)
+ continue;
+ }
+
+ close_range(3, ~0U, 0);
+ }
+
+ exit(0);
+ }
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ char path[PATH_MAX];
+
+ sprintf(path, "/proc/%d/fd/", self->pids_openers[i]);
+ self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(self->fd_proc_pid[i], 0);
+ }
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ self->pids_getdents[i] = fork();
+ ASSERT_GE(self->pids_getdents[i], 0);
+
+ if (self->pids_getdents[i] != 0)
+ continue;
+
+ self->pids_getdents[i] = getpid();
+ for (;;) {
+ char ents[1024];
+ ssize_t nr_read;
+
+ /*
+ * Concurrently read /proc/<pid>/fd/ which roughly does:
+ *
+ * f = fget_task_next(p, &fd);
+ * if (!f)
+ * break;
+ * data.mode = f->f_mode;
+ * fput(f);
+ *
+ * Which means that it'll try to get a reference to a
+ * file in another task's file descriptor table.
+ *
+ * Under heavy file load it is increasingly likely that
+ * the other task will manage to close @file and @file
+ * is being recycled due to SLAB_TYPEAFE_BY_RCU
+ * concurrently. This will trigger various warnings in
+ * the file reference counting code.
+ */
+ do {
+ nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents));
+ } while (nr_read >= 0);
+
+ lseek(self->fd_proc_pid[i], 0, SEEK_SET);
+ }
+
+ exit(0);
+ }
+
+ ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0);
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ kill(self->pids_openers[i], SIGKILL);
+ kill(self->pids_getdents[i], SIGKILL);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/mount-notify/.gitignore b/tools/testing/selftests/filesystems/mount-notify/.gitignore
new file mode 100644
index 000000000000..124339ea7845
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mount-notify/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/*_test
+/*_test_ns
diff --git a/tools/testing/selftests/filesystems/mount-notify/Makefile b/tools/testing/selftests/filesystems/mount-notify/Makefile
new file mode 100644
index 000000000000..836a4eb7be06
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mount-notify/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := mount-notify_test mount-notify_test_ns
+
+include ../../lib.mk
+
+$(OUTPUT)/mount-notify_test: ../utils.c
+$(OUTPUT)/mount-notify_test_ns: ../utils.c
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
new file mode 100644
index 000000000000..63ce708d93ed
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../../kselftest_harness.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+
+// Needed for linux/fanotify.h
+#ifndef __kernel_fsid_t
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+#endif
+
+#include <sys/fanotify.h>
+
+static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
+
+static const int mark_cmds[] = {
+ FAN_MARK_ADD,
+ FAN_MARK_REMOVE,
+ FAN_MARK_FLUSH
+};
+
+#define NUM_FAN_FDS ARRAY_SIZE(mark_cmds)
+
+FIXTURE(fanotify) {
+ int fan_fd[NUM_FAN_FDS];
+ char buf[256];
+ unsigned int rem;
+ void *next;
+ char root_mntpoint[sizeof(root_mntpoint_templ)];
+ int orig_root;
+ int ns_fd;
+ uint64_t root_id;
+};
+
+FIXTURE_SETUP(fanotify)
+{
+ int i, ret;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+
+ self->ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(self->ns_fd, 0);
+
+ ASSERT_EQ(mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL), 0);
+
+ strcpy(self->root_mntpoint, root_mntpoint_templ);
+ ASSERT_NE(mkdtemp(self->root_mntpoint), NULL);
+
+ self->orig_root = open("/", O_PATH | O_CLOEXEC);
+ ASSERT_GE(self->orig_root, 0);
+
+ ASSERT_EQ(mount("tmpfs", self->root_mntpoint, "tmpfs", 0, NULL), 0);
+
+ ASSERT_EQ(chroot(self->root_mntpoint), 0);
+
+ ASSERT_EQ(chdir("/"), 0);
+
+ ASSERT_EQ(mkdir("a", 0700), 0);
+
+ ASSERT_EQ(mkdir("b", 0700), 0);
+
+ self->root_id = get_unique_mnt_id("/");
+ ASSERT_NE(self->root_id, 0);
+
+ for (i = 0; i < NUM_FAN_FDS; i++) {
+ self->fan_fd[i] = fanotify_init(FAN_REPORT_MNT | FAN_NONBLOCK,
+ 0);
+ ASSERT_GE(self->fan_fd[i], 0);
+ ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ // On fd[0] we do an extra ADD that changes nothing.
+ // On fd[1]/fd[2] we REMOVE/FLUSH which removes the mark.
+ ret = fanotify_mark(self->fan_fd[i], mark_cmds[i] |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+
+ self->rem = 0;
+}
+
+FIXTURE_TEARDOWN(fanotify)
+{
+ int i;
+
+ ASSERT_EQ(self->rem, 0);
+ for (i = 0; i < NUM_FAN_FDS; i++)
+ close(self->fan_fd[i]);
+
+ ASSERT_EQ(fchdir(self->orig_root), 0);
+
+ ASSERT_EQ(chroot("."), 0);
+
+ EXPECT_EQ(umount2(self->root_mntpoint, MNT_DETACH), 0);
+ EXPECT_EQ(chdir(self->root_mntpoint), 0);
+ EXPECT_EQ(chdir("/"), 0);
+ EXPECT_EQ(rmdir(self->root_mntpoint), 0);
+}
+
+static uint64_t expect_notify(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t *mask)
+{
+ struct fanotify_event_metadata *meta;
+ struct fanotify_event_info_mnt *mnt;
+ unsigned int thislen;
+
+ if (!self->rem) {
+ ssize_t len;
+ int i;
+
+ for (i = NUM_FAN_FDS - 1; i >= 0; i--) {
+ len = read(self->fan_fd[i], self->buf,
+ sizeof(self->buf));
+ if (i > 0) {
+ // Groups 1,2 should get EAGAIN
+ ASSERT_EQ(len, -1);
+ ASSERT_EQ(errno, EAGAIN);
+ } else {
+ // Group 0 should get events
+ ASSERT_GT(len, 0);
+ }
+ }
+
+ self->rem = len;
+ self->next = (void *) self->buf;
+ }
+
+ meta = self->next;
+ ASSERT_TRUE(FAN_EVENT_OK(meta, self->rem));
+
+ thislen = meta->event_len;
+ self->rem -= thislen;
+ self->next += thislen;
+
+ *mask = meta->mask;
+ thislen -= sizeof(*meta);
+
+ mnt = ((void *) meta) + meta->event_len - thislen;
+
+ ASSERT_EQ(thislen, sizeof(*mnt));
+
+ return mnt->mnt_id;
+}
+
+static void expect_notify_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ unsigned int n, uint64_t mask[], uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify(_metadata, self, &mask[i]);
+}
+
+static uint64_t expect_notify_mask(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t expect_mask)
+{
+ uint64_t mntid, mask;
+
+ mntid = expect_notify(_metadata, self, &mask);
+ ASSERT_EQ(expect_mask, mask);
+
+ return mntid;
+}
+
+
+static void expect_notify_mask_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t mask, unsigned int n, uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify_mask(_metadata, self, mask);
+}
+
+static void verify_mount_ids(struct __test_metadata *const _metadata,
+ const uint64_t list1[], const uint64_t list2[],
+ size_t num)
+{
+ unsigned int i, j;
+
+ // Check that neither list has any duplicates
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (i != j) {
+ ASSERT_NE(list1[i], list1[j]);
+ ASSERT_NE(list2[i], list2[j]);
+ }
+ }
+ }
+ // Check that all list1 memebers can be found in list2. Together with
+ // the above it means that the list1 and list2 represent the same sets.
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (list1[i] == list2[j])
+ break;
+ }
+ ASSERT_NE(j, num);
+ }
+}
+
+static void check_mounted(struct __test_metadata *const _metadata,
+ const uint64_t mnts[], size_t num)
+{
+ ssize_t ret;
+ uint64_t *list;
+
+ list = malloc((num + 1) * sizeof(list[0]));
+ ASSERT_NE(list, NULL);
+
+ ret = listmount(LSMT_ROOT, 0, 0, list, num + 1, 0);
+ ASSERT_EQ(ret, num);
+
+ verify_mount_ids(_metadata, mnts, list, num);
+
+ free(list);
+}
+
+static void setup_mount_tree(struct __test_metadata *const _metadata,
+ int log2_num)
+{
+ int ret, i;
+
+ ret = mount("", "/", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ for (i = 0; i < log2_num; i++) {
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+}
+
+TEST_F(fanotify, bind)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, move)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+ uint64_t move_id;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ ret = move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0);
+ ASSERT_EQ(ret, 0);
+
+ move_id = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ ASSERT_EQ(move_id, mnts[1]);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, propagate)
+{
+ const unsigned int log2_num = 4;
+ const unsigned int num = (1 << log2_num);
+ uint64_t mnts[num];
+
+ setup_mount_tree(_metadata, log2_num);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, num - 1, mnts + 1);
+
+ mnts[0] = self->root_id;
+ check_mounted(_metadata, mnts, num);
+
+ // Cleanup
+ int ret;
+ uint64_t mnts2[num];
+ ret = umount2("/", MNT_DETACH);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/", NULL, MS_PRIVATE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts2[0] = self->root_id;
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, num - 1, mnts2 + 1);
+ verify_mount_ids(_metadata, mnts, mnts2, num);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, fsmount)
+{
+ int ret, fs, mnt;
+ uint64_t mnts[2] = { self->root_id };
+
+ fs = fsopen("tmpfs", 0);
+ ASSERT_GE(fs, 0);
+
+ ret = fsconfig(fs, FSCONFIG_CMD_CREATE, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ mnt = fsmount(fs, 0, 0);
+ ASSERT_GE(mnt, 0);
+
+ close(fs);
+
+ ret = move_mount(mnt, "", AT_FDCWD, "/a", MOVE_MOUNT_F_EMPTY_PATH);
+ ASSERT_EQ(ret, 0);
+
+ close(mnt);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, reparent)
+{
+ uint64_t mnts[6] = { self->root_id };
+ uint64_t dmnts[3];
+ uint64_t masks[3];
+ unsigned int i;
+ int ret;
+
+ // Create setup with a[1] -> b[2] propagation
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/a", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/b", NULL, MS_SLAVE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ // Mount on a[3], which is propagated to b[4]
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 3);
+
+ check_mounted(_metadata, mnts, 5);
+
+ // Mount on b[5], not propagated
+ ret = mount("/", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[5] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ check_mounted(_metadata, mnts, 6);
+
+ // Umount a[3], which is propagated to b[4], but not b[5]
+ // This will result in b[5] "falling" on b[2]
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_n(_metadata, self, 3, masks, dmnts);
+ verify_mount_ids(_metadata, mnts + 3, dmnts, 3);
+
+ for (i = 0; i < 3; i++) {
+ if (dmnts[i] == mnts[5]) {
+ ASSERT_EQ(masks[i], FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ } else {
+ ASSERT_EQ(masks[i], FAN_MNT_DETACH);
+ }
+ }
+
+ mnts[3] = mnts[5];
+ check_mounted(_metadata, mnts, 4);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 3, dmnts);
+ verify_mount_ids(_metadata, mnts + 1, dmnts, 3);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, rmdir)
+{
+ uint64_t mnts[3] = { self->root_id };
+ int ret;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/", "/a/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ ret = chdir("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = fork();
+ ASSERT_GE(ret, 0);
+
+ if (ret == 0) {
+ chdir("/");
+ unshare(CLONE_NEWNS);
+ mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ umount2("/a", MNT_DETACH);
+ // This triggers a detach in the other namespace
+ rmdir("/a");
+ exit(0);
+ }
+ wait(NULL);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 2, mnts + 1);
+ check_mounted(_metadata, mnts, 1);
+
+ // Cleanup
+ ret = chdir("/");
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_F(fanotify, pivot_root)
+{
+ uint64_t mnts[3] = { self->root_id };
+ uint64_t mnts2[3];
+ int ret;
+
+ ret = mount("tmpfs", "/a", "tmpfs", 0, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[2] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ ret = mkdir("/a/new", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mkdir("/a/old", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/a/new", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ check_mounted(_metadata, mnts, 3);
+
+ ret = syscall(SYS_pivot_root, "/a/new", "/a/new/old");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH, 2, mnts2);
+ verify_mount_ids(_metadata, mnts, mnts2, 2);
+ check_mounted(_metadata, mnts, 3);
+
+ // Cleanup
+ ret = syscall(SYS_pivot_root, "/old", "/old/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
new file mode 100644
index 000000000000..090a5ca65004
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../../kselftest_harness.h"
+#include "../../pidfd/pidfd.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+
+// Needed for linux/fanotify.h
+#ifndef __kernel_fsid_t
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+#endif
+
+#include <sys/fanotify.h>
+
+static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
+
+static const int mark_types[] = {
+ FAN_MARK_FILESYSTEM,
+ FAN_MARK_MOUNT,
+ FAN_MARK_INODE
+};
+
+static const int mark_cmds[] = {
+ FAN_MARK_ADD,
+ FAN_MARK_REMOVE,
+ FAN_MARK_FLUSH
+};
+
+#define NUM_FAN_FDS ARRAY_SIZE(mark_cmds)
+
+FIXTURE(fanotify) {
+ int fan_fd[NUM_FAN_FDS];
+ char buf[256];
+ unsigned int rem;
+ void *next;
+ char root_mntpoint[sizeof(root_mntpoint_templ)];
+ int orig_root;
+ int orig_ns_fd;
+ int ns_fd;
+ uint64_t root_id;
+};
+
+FIXTURE_SETUP(fanotify)
+{
+ int i, ret;
+
+ self->orig_ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(self->orig_ns_fd, 0);
+
+ ret = setup_userns();
+ ASSERT_EQ(ret, 0);
+
+ self->ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(self->ns_fd, 0);
+
+ strcpy(self->root_mntpoint, root_mntpoint_templ);
+ ASSERT_NE(mkdtemp(self->root_mntpoint), NULL);
+
+ self->orig_root = open("/", O_PATH | O_CLOEXEC);
+ ASSERT_GE(self->orig_root, 0);
+
+ ASSERT_EQ(mount("tmpfs", self->root_mntpoint, "tmpfs", 0, NULL), 0);
+
+ ASSERT_EQ(chroot(self->root_mntpoint), 0);
+
+ ASSERT_EQ(chdir("/"), 0);
+
+ ASSERT_EQ(mkdir("a", 0700), 0);
+
+ ASSERT_EQ(mkdir("b", 0700), 0);
+
+ self->root_id = get_unique_mnt_id("/");
+ ASSERT_NE(self->root_id, 0);
+
+ for (i = 0; i < NUM_FAN_FDS; i++) {
+ int fan_fd = fanotify_init(FAN_REPORT_FID, 0);
+ // Verify that watching tmpfs mounted inside userns is allowed
+ ret = fanotify_mark(fan_fd, FAN_MARK_ADD | mark_types[i],
+ FAN_OPEN, AT_FDCWD, "/");
+ ASSERT_EQ(ret, 0);
+ // ...but watching entire orig root filesystem is not allowed
+ ret = fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
+ FAN_OPEN, self->orig_root, ".");
+ ASSERT_NE(ret, 0);
+ close(fan_fd);
+
+ self->fan_fd[i] = fanotify_init(FAN_REPORT_MNT | FAN_NONBLOCK,
+ 0);
+ ASSERT_GE(self->fan_fd[i], 0);
+ // Verify that watching mntns where group was created is allowed
+ ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ // ...but watching orig mntns is not allowed
+ ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->orig_ns_fd, NULL);
+ ASSERT_NE(ret, 0);
+ // On fd[0] we do an extra ADD that changes nothing.
+ // On fd[1]/fd[2] we REMOVE/FLUSH which removes the mark.
+ ret = fanotify_mark(self->fan_fd[i], mark_cmds[i] |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+
+ self->rem = 0;
+}
+
+FIXTURE_TEARDOWN(fanotify)
+{
+ int i;
+
+ ASSERT_EQ(self->rem, 0);
+ for (i = 0; i < NUM_FAN_FDS; i++)
+ close(self->fan_fd[i]);
+
+ ASSERT_EQ(fchdir(self->orig_root), 0);
+
+ ASSERT_EQ(chroot("."), 0);
+
+ EXPECT_EQ(umount2(self->root_mntpoint, MNT_DETACH), 0);
+ EXPECT_EQ(chdir(self->root_mntpoint), 0);
+ EXPECT_EQ(chdir("/"), 0);
+ EXPECT_EQ(rmdir(self->root_mntpoint), 0);
+}
+
+static uint64_t expect_notify(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t *mask)
+{
+ struct fanotify_event_metadata *meta;
+ struct fanotify_event_info_mnt *mnt;
+ unsigned int thislen;
+
+ if (!self->rem) {
+ ssize_t len;
+ int i;
+
+ for (i = NUM_FAN_FDS - 1; i >= 0; i--) {
+ len = read(self->fan_fd[i], self->buf,
+ sizeof(self->buf));
+ if (i > 0) {
+ // Groups 1,2 should get EAGAIN
+ ASSERT_EQ(len, -1);
+ ASSERT_EQ(errno, EAGAIN);
+ } else {
+ // Group 0 should get events
+ ASSERT_GT(len, 0);
+ }
+ }
+
+ self->rem = len;
+ self->next = (void *) self->buf;
+ }
+
+ meta = self->next;
+ ASSERT_TRUE(FAN_EVENT_OK(meta, self->rem));
+
+ thislen = meta->event_len;
+ self->rem -= thislen;
+ self->next += thislen;
+
+ *mask = meta->mask;
+ thislen -= sizeof(*meta);
+
+ mnt = ((void *) meta) + meta->event_len - thislen;
+
+ ASSERT_EQ(thislen, sizeof(*mnt));
+
+ return mnt->mnt_id;
+}
+
+static void expect_notify_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ unsigned int n, uint64_t mask[], uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify(_metadata, self, &mask[i]);
+}
+
+static uint64_t expect_notify_mask(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t expect_mask)
+{
+ uint64_t mntid, mask;
+
+ mntid = expect_notify(_metadata, self, &mask);
+ ASSERT_EQ(expect_mask, mask);
+
+ return mntid;
+}
+
+
+static void expect_notify_mask_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t mask, unsigned int n, uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify_mask(_metadata, self, mask);
+}
+
+static void verify_mount_ids(struct __test_metadata *const _metadata,
+ const uint64_t list1[], const uint64_t list2[],
+ size_t num)
+{
+ unsigned int i, j;
+
+ // Check that neither list has any duplicates
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (i != j) {
+ ASSERT_NE(list1[i], list1[j]);
+ ASSERT_NE(list2[i], list2[j]);
+ }
+ }
+ }
+ // Check that all list1 memebers can be found in list2. Together with
+ // the above it means that the list1 and list2 represent the same sets.
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (list1[i] == list2[j])
+ break;
+ }
+ ASSERT_NE(j, num);
+ }
+}
+
+static void check_mounted(struct __test_metadata *const _metadata,
+ const uint64_t mnts[], size_t num)
+{
+ ssize_t ret;
+ uint64_t *list;
+
+ list = malloc((num + 1) * sizeof(list[0]));
+ ASSERT_NE(list, NULL);
+
+ ret = listmount(LSMT_ROOT, 0, 0, list, num + 1, 0);
+ ASSERT_EQ(ret, num);
+
+ verify_mount_ids(_metadata, mnts, list, num);
+
+ free(list);
+}
+
+static void setup_mount_tree(struct __test_metadata *const _metadata,
+ int log2_num)
+{
+ int ret, i;
+
+ ret = mount("", "/", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ for (i = 0; i < log2_num; i++) {
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+}
+
+TEST_F(fanotify, bind)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, move)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+ uint64_t move_id;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ ret = move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0);
+ ASSERT_EQ(ret, 0);
+
+ move_id = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ ASSERT_EQ(move_id, mnts[1]);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, propagate)
+{
+ const unsigned int log2_num = 4;
+ const unsigned int num = (1 << log2_num);
+ uint64_t mnts[num];
+
+ setup_mount_tree(_metadata, log2_num);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, num - 1, mnts + 1);
+
+ mnts[0] = self->root_id;
+ check_mounted(_metadata, mnts, num);
+
+ // Cleanup
+ int ret;
+ uint64_t mnts2[num];
+ ret = umount2("/", MNT_DETACH);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/", NULL, MS_PRIVATE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts2[0] = self->root_id;
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, num - 1, mnts2 + 1);
+ verify_mount_ids(_metadata, mnts, mnts2, num);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, fsmount)
+{
+ int ret, fs, mnt;
+ uint64_t mnts[2] = { self->root_id };
+
+ fs = fsopen("tmpfs", 0);
+ ASSERT_GE(fs, 0);
+
+ ret = fsconfig(fs, FSCONFIG_CMD_CREATE, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ mnt = fsmount(fs, 0, 0);
+ ASSERT_GE(mnt, 0);
+
+ close(fs);
+
+ ret = move_mount(mnt, "", AT_FDCWD, "/a", MOVE_MOUNT_F_EMPTY_PATH);
+ ASSERT_EQ(ret, 0);
+
+ close(mnt);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, reparent)
+{
+ uint64_t mnts[6] = { self->root_id };
+ uint64_t dmnts[3];
+ uint64_t masks[3];
+ unsigned int i;
+ int ret;
+
+ // Create setup with a[1] -> b[2] propagation
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/a", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/b", NULL, MS_SLAVE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ // Mount on a[3], which is propagated to b[4]
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 3);
+
+ check_mounted(_metadata, mnts, 5);
+
+ // Mount on b[5], not propagated
+ ret = mount("/", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[5] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ check_mounted(_metadata, mnts, 6);
+
+ // Umount a[3], which is propagated to b[4], but not b[5]
+ // This will result in b[5] "falling" on b[2]
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_n(_metadata, self, 3, masks, dmnts);
+ verify_mount_ids(_metadata, mnts + 3, dmnts, 3);
+
+ for (i = 0; i < 3; i++) {
+ if (dmnts[i] == mnts[5]) {
+ ASSERT_EQ(masks[i], FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ } else {
+ ASSERT_EQ(masks[i], FAN_MNT_DETACH);
+ }
+ }
+
+ mnts[3] = mnts[5];
+ check_mounted(_metadata, mnts, 4);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 3, dmnts);
+ verify_mount_ids(_metadata, mnts + 1, dmnts, 3);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, rmdir)
+{
+ uint64_t mnts[3] = { self->root_id };
+ int ret;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/", "/a/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ ret = chdir("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = fork();
+ ASSERT_GE(ret, 0);
+
+ if (ret == 0) {
+ chdir("/");
+ unshare(CLONE_NEWNS);
+ mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ umount2("/a", MNT_DETACH);
+ // This triggers a detach in the other namespace
+ rmdir("/a");
+ exit(0);
+ }
+ wait(NULL);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 2, mnts + 1);
+ check_mounted(_metadata, mnts, 1);
+
+ // Cleanup
+ ret = chdir("/");
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_F(fanotify, pivot_root)
+{
+ uint64_t mnts[3] = { self->root_id };
+ uint64_t mnts2[3];
+ int ret;
+
+ ret = mount("tmpfs", "/a", "tmpfs", 0, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[2] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ ret = mkdir("/a/new", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mkdir("/a/old", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/a/new", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ check_mounted(_metadata, mnts, 3);
+
+ ret = syscall(SYS_pivot_root, "/a/new", "/a/new/old");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH, 2, mnts2);
+ verify_mount_ids(_metadata, mnts, mnts2, 2);
+ check_mounted(_metadata, mnts, 3);
+
+ // Cleanup
+ ret = syscall(SYS_pivot_root, "/old", "/old/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/nsfs/.gitignore b/tools/testing/selftests/filesystems/nsfs/.gitignore
new file mode 100644
index 000000000000..92a8249006d1
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+owner
+pidns
+iterate_mntns
diff --git a/tools/testing/selftests/filesystems/nsfs/Makefile b/tools/testing/selftests/filesystems/nsfs/Makefile
new file mode 100644
index 000000000000..231aaa7dfd95
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+TEST_GEN_PROGS := owner pidns iterate_mntns
+
+CFLAGS := -Wall -Werror
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/nsfs/config b/tools/testing/selftests/filesystems/nsfs/config
new file mode 100644
index 000000000000..598d0a225fc9
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/config
@@ -0,0 +1,3 @@
+CONFIG_USER_NS=y
+CONFIG_UTS_NS=y
+CONFIG_PID_NS=y
diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
new file mode 100644
index 000000000000..a3d8015897e9
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/auto_dev-ioctl.h>
+#include <linux/errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+
+#define MNT_NS_COUNT 11
+#define MNT_NS_LAST_INDEX 10
+
+struct mnt_ns_info {
+ __u32 size;
+ __u32 nr_mounts;
+ __u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+
+FIXTURE(iterate_mount_namespaces) {
+ int fd_mnt_ns[MNT_NS_COUNT];
+ __u64 mnt_ns_id[MNT_NS_COUNT];
+};
+
+FIXTURE_SETUP(iterate_mount_namespaces)
+{
+ for (int i = 0; i < MNT_NS_COUNT; i++)
+ self->fd_mnt_ns[i] = -EBADF;
+
+ /*
+ * Creating a new user namespace let's us guarantee that we only see
+ * mount namespaces that we did actually create.
+ */
+ ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
+
+ for (int i = 0; i < MNT_NS_COUNT; i++) {
+ struct mnt_ns_info info = {};
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(self->fd_mnt_ns[i], 0);
+ ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0);
+ self->mnt_ns_id[i] = info.mnt_ns_id;
+ }
+}
+
+FIXTURE_TEARDOWN(iterate_mount_namespaces)
+{
+ for (int i = 0; i < MNT_NS_COUNT; i++) {
+ if (self->fd_mnt_ns[i] < 0)
+ continue;
+ ASSERT_EQ(close(self->fd_mnt_ns[i]), 0);
+ }
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_forward)
+{
+ int fd_mnt_ns_cur, count = 0;
+
+ fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC);
+ ASSERT_GE(fd_mnt_ns_cur, 0);
+
+ for (;; count++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_next;
+
+ fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+ if (fd_mnt_ns_next < 0 && errno == ENOENT)
+ break;
+ ASSERT_GE(fd_mnt_ns_next, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_next;
+ }
+ ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_backwards)
+{
+ int fd_mnt_ns_cur, count = 0;
+
+ fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC);
+ ASSERT_GE(fd_mnt_ns_cur, 0);
+
+ for (;; count++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_prev;
+
+ fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+ if (fd_mnt_ns_prev < 0 && errno == ENOENT)
+ break;
+ ASSERT_GE(fd_mnt_ns_prev, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_prev;
+ }
+ ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_forward)
+{
+ int fd_mnt_ns_cur;
+
+ ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0);
+
+ fd_mnt_ns_cur = self->fd_mnt_ns[0];
+ for (int i = 1; i < MNT_NS_COUNT; i++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_next;
+
+ fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+ ASSERT_GE(fd_mnt_ns_next, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_next;
+ ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+ }
+}
+
+TEST_F(iterate_mount_namespaces, iterate_backward)
+{
+ int fd_mnt_ns_cur;
+
+ ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0);
+
+ fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX];
+ for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_prev;
+
+ fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+ ASSERT_GE(fd_mnt_ns_prev, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_prev;
+ ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+ }
+}
+
+TEST_F(iterate_mount_namespaces, nfs_valid_ioctl)
+{
+ ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_OPENMOUNT, NULL), 0);
+ ASSERT_EQ(errno, ENOTTY);
+
+ ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_CLOSEMOUNT, NULL), 0);
+ ASSERT_EQ(errno, ENOTTY);
+
+ ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_READY, NULL), 0);
+ ASSERT_EQ(errno, ENOTTY);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/nsfs/owner.c b/tools/testing/selftests/filesystems/nsfs/owner.c
new file mode 100644
index 000000000000..96a976c74550
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/owner.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#define NSIO 0xb7
+#define NS_GET_USERNS _IO(NSIO, 0x1)
+
+#define pr_err(fmt, ...) \
+ ({ \
+ fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+ __func__, __LINE__, ##__VA_ARGS__); \
+ 1; \
+ })
+
+int main(int argc, char *argvp[])
+{
+ int pfd[2], ns, uns, init_uns;
+ struct stat st1, st2;
+ char path[128];
+ pid_t pid;
+ char c;
+
+ if (pipe(pfd))
+ return 1;
+
+ pid = fork();
+ if (pid < 0)
+ return pr_err("fork");
+ if (pid == 0) {
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+ if (unshare(CLONE_NEWUTS | CLONE_NEWUSER))
+ return pr_err("unshare");
+ close(pfd[0]);
+ close(pfd[1]);
+ while (1)
+ sleep(1);
+ return 0;
+ }
+ close(pfd[1]);
+ if (read(pfd[0], &c, 1) != 0)
+ return pr_err("Unable to read from pipe");
+ close(pfd[0]);
+
+ snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+ ns = open(path, O_RDONLY);
+ if (ns < 0)
+ return pr_err("Unable to open %s", path);
+
+ uns = ioctl(ns, NS_GET_USERNS);
+ if (uns < 0)
+ return pr_err("Unable to get an owning user namespace");
+
+ if (fstat(uns, &st1))
+ return pr_err("fstat");
+
+ snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+ if (stat(path, &st2))
+ return pr_err("stat");
+
+ if (st1.st_ino != st2.st_ino)
+ return pr_err("NS_GET_USERNS returned a wrong namespace");
+
+ init_uns = ioctl(uns, NS_GET_USERNS);
+ if (uns < 0)
+ return pr_err("Unable to get an owning user namespace");
+
+ if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+ return pr_err("Don't get EPERM");
+
+ if (unshare(CLONE_NEWUSER))
+ return pr_err("unshare");
+
+ if (ioctl(ns, NS_GET_USERNS) >= 0 || errno != EPERM)
+ return pr_err("Don't get EPERM");
+ if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+ return pr_err("Don't get EPERM");
+
+ kill(pid, SIGKILL);
+ wait(NULL);
+ return 0;
+}
diff --git a/tools/testing/selftests/filesystems/nsfs/pidns.c b/tools/testing/selftests/filesystems/nsfs/pidns.c
new file mode 100644
index 000000000000..e3c772c6a7c7
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/pidns.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#define pr_err(fmt, ...) \
+ ({ \
+ fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+ __func__, __LINE__, ##__VA_ARGS__); \
+ 1; \
+ })
+
+#define NSIO 0xb7
+#define NS_GET_USERNS _IO(NSIO, 0x1)
+#define NS_GET_PARENT _IO(NSIO, 0x2)
+
+#define __stack_aligned__ __attribute__((aligned(16)))
+struct cr_clone_arg {
+ char stack[128] __stack_aligned__;
+ char stack_ptr[];
+};
+
+static int child(void *args)
+{
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+ while (1)
+ sleep(1);
+ exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+ char *ns_strs[] = {"pid", "user"};
+ char path[] = "/proc/0123456789/ns/pid";
+ struct cr_clone_arg ca;
+ struct stat st1, st2;
+ int ns, pns, i;
+ pid_t pid;
+
+ pid = clone(child, ca.stack_ptr, CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
+ if (pid < 0)
+ return pr_err("clone");
+
+ for (i = 0; i < 2; i++) {
+ snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns_strs[i]);
+ ns = open(path, O_RDONLY);
+ if (ns < 0)
+ return pr_err("Unable to open %s", path);
+
+ pns = ioctl(ns, NS_GET_PARENT);
+ if (pns < 0)
+ return pr_err("Unable to get a parent pidns");
+
+ snprintf(path, sizeof(path), "/proc/self/ns/%s", ns_strs[i]);
+ if (stat(path, &st2))
+ return pr_err("Unable to stat %s", path);
+ if (fstat(pns, &st1))
+ return pr_err("Unable to stat the parent pidns");
+ if (st1.st_ino != st2.st_ino)
+ return pr_err("NS_GET_PARENT returned a wrong namespace");
+
+ if (ioctl(pns, NS_GET_PARENT) >= 0 || errno != EPERM)
+ return pr_err("Don't get EPERM");
+ }
+
+ kill(pid, SIGKILL);
+ wait(NULL);
+ return 0;
+}
diff --git a/tools/testing/selftests/filesystems/overlayfs/.gitignore b/tools/testing/selftests/filesystems/overlayfs/.gitignore
new file mode 100644
index 000000000000..e23a18c8b37f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+dev_in_maps
+set_layers_via_fds
diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
new file mode 100644
index 000000000000..d3ad4a77db9b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -Wall
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lcap
+
+LOCAL_HDRS += ../wrappers.h log.h
+
+TEST_GEN_PROGS := dev_in_maps
+TEST_GEN_PROGS += set_layers_via_fds
+
+include ../../lib.mk
+
+$(OUTPUT)/set_layers_via_fds: ../utils.c
diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
new file mode 100644
index 000000000000..31db54b00e64
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <fcntl.h>
+
+#include "../../kselftest.h"
+#include "log.h"
+#include "../wrappers.h"
+
+static long get_file_dev_and_inode(void *addr, struct statx *stx)
+{
+ char buf[4096];
+ FILE *mapf;
+
+ mapf = fopen("/proc/self/maps", "r");
+ if (mapf == NULL)
+ return pr_perror("fopen(/proc/self/maps)");
+
+ while (fgets(buf, sizeof(buf), mapf)) {
+ unsigned long start, end;
+ uint32_t maj, min;
+ __u64 ino;
+
+ if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu",
+ &start, &end, &maj, &min, &ino) != 5)
+ return pr_perror("unable to parse: %s", buf);
+ if (start == (unsigned long)addr) {
+ stx->stx_dev_major = maj;
+ stx->stx_dev_minor = min;
+ stx->stx_ino = ino;
+ return 0;
+ }
+ }
+
+ return pr_err("unable to find the mapping");
+}
+
+static int ovl_mount(void)
+{
+ int tmpfs, fsfd, ovl;
+
+ fsfd = sys_fsopen("tmpfs", 0);
+ if (fsfd == -1)
+ return pr_perror("fsopen(tmpfs)");
+
+ if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1)
+ return pr_perror("FSCONFIG_CMD_CREATE");
+
+ tmpfs = sys_fsmount(fsfd, 0, 0);
+ if (tmpfs == -1)
+ return pr_perror("fsmount");
+
+ close(fsfd);
+
+ /* overlayfs can't be constructed on top of a detached mount. */
+ if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH))
+ return pr_perror("move_mount");
+ close(tmpfs);
+
+ if (mkdir("/tmp/w", 0755) == -1 ||
+ mkdir("/tmp/u", 0755) == -1 ||
+ mkdir("/tmp/l", 0755) == -1)
+ return pr_perror("mkdir");
+
+ fsfd = sys_fsopen("overlay", 0);
+ if (fsfd == -1)
+ return pr_perror("fsopen(overlay)");
+ if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 ||
+ sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 ||
+ sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 ||
+ sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1)
+ return pr_perror("fsconfig");
+ if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1)
+ return pr_perror("fsconfig");
+ ovl = sys_fsmount(fsfd, 0, 0);
+ if (ovl == -1)
+ return pr_perror("fsmount");
+
+ return ovl;
+}
+
+/*
+ * Check that the file device and inode shown in /proc/pid/maps match values
+ * returned by stat(2).
+ */
+static int test(void)
+{
+ struct statx stx, mstx;
+ int ovl, fd;
+ void *addr;
+
+ ovl = ovl_mount();
+ if (ovl == -1)
+ return -1;
+
+ fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644);
+ if (fd == -1)
+ return pr_perror("openat");
+
+ addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED)
+ return pr_perror("mmap");
+
+ if (get_file_dev_and_inode(addr, &mstx))
+ return -1;
+ if (statx(fd, "", AT_EMPTY_PATH | AT_STATX_SYNC_AS_STAT, STATX_INO, &stx))
+ return pr_perror("statx");
+
+ if (stx.stx_dev_major != mstx.stx_dev_major ||
+ stx.stx_dev_minor != mstx.stx_dev_minor ||
+ stx.stx_ino != mstx.stx_ino)
+ return pr_fail("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n",
+ mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino,
+ stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino);
+
+ ksft_test_result_pass("devices are matched\n");
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int fsfd;
+
+ fsfd = sys_fsopen("overlay", 0);
+ if (fsfd == -1) {
+ ksft_test_result_skip("unable to create overlay mount\n");
+ return 1;
+ }
+ close(fsfd);
+
+ /* Create a new mount namespace to not care about cleaning test mounts. */
+ if (unshare(CLONE_NEWNS) == -1) {
+ ksft_test_result_skip("unable to create a new mount namespace\n");
+ return 1;
+ }
+ if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) {
+ pr_perror("mount");
+ return 1;
+ }
+
+ ksft_set_plan(1);
+
+ if (test())
+ return 1;
+
+ ksft_exit_pass();
+ return 0;
+}
diff --git a/tools/testing/selftests/filesystems/overlayfs/log.h b/tools/testing/selftests/filesystems/overlayfs/log.h
new file mode 100644
index 000000000000..db64df2a8483
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/log.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __SELFTEST_TIMENS_LOG_H__
+#define __SELFTEST_TIMENS_LOG_H__
+
+#define pr_msg(fmt, lvl, ...) \
+ ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \
+ lvl, __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) \
+ ({ \
+ ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_fail(fmt, ...) \
+ ({ \
+ ksft_test_result_fail(fmt, ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__)
+
+#endif
diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
new file mode 100644
index 000000000000..dc0449fa628f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+#include "../../pidfd/pidfd.h"
+#include "log.h"
+#include "../utils.h"
+#include "../wrappers.h"
+
+FIXTURE(set_layers_via_fds) {
+ int pidfd;
+};
+
+FIXTURE_SETUP(set_layers_via_fds)
+{
+ self->pidfd = -EBADF;
+ EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
+ EXPECT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0);
+}
+
+FIXTURE_TEARDOWN(set_layers_via_fds)
+{
+ if (self->pidfd >= 0) {
+ EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(close(self->pidfd), 0);
+ }
+ umount2("/set_layers_via_fds", 0);
+ EXPECT_EQ(rmdir("/set_layers_via_fds"), 0);
+
+ umount2("/set_layers_via_fds_tmpfs", 0);
+ EXPECT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0);
+}
+
+TEST_F(set_layers_via_fds, set_layers_via_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay;
+ int layer_fds[] = { [0 ... 8] = -EBADF };
+ bool layers_found[] = { [0 ... 8] = false };
+ size_t len = 0;
+ char *line = NULL;
+ FILE *f_mountinfo;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);
+
+ layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[3], 0);
+
+ layer_fds[4] = openat(fd_tmpfs, "l3", O_DIRECTORY);
+ ASSERT_GE(layer_fds[4], 0);
+
+ layer_fds[5] = openat(fd_tmpfs, "l4", O_DIRECTORY);
+ ASSERT_GE(layer_fds[5], 0);
+
+ layer_fds[6] = openat(fd_tmpfs, "d1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[6], 0);
+
+ layer_fds[7] = openat(fd_tmpfs, "d2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[7], 0);
+
+ layer_fds[8] = openat(fd_tmpfs, "d3", O_DIRECTORY);
+ ASSERT_GE(layer_fds[8], 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ f_mountinfo = fopen("/proc/self/mountinfo", "r");
+ ASSERT_NE(f_mountinfo, NULL);
+
+ while (getline(&line, &len, f_mountinfo) != -1) {
+ char *haystack = line;
+
+ if (strstr(haystack, "workdir=/tmp/w"))
+ layers_found[0] = true;
+ if (strstr(haystack, "upperdir=/tmp/u"))
+ layers_found[1] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l1"))
+ layers_found[2] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l2"))
+ layers_found[3] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l3"))
+ layers_found[4] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l4"))
+ layers_found[5] = true;
+ if (strstr(haystack, "datadir+=/tmp/d1"))
+ layers_found[6] = true;
+ if (strstr(haystack, "datadir+=/tmp/d2"))
+ layers_found[7] = true;
+ if (strstr(haystack, "datadir+=/tmp/d3"))
+ layers_found[8] = true;
+ }
+ free(line);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(layers_found[i], true);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+ ASSERT_EQ(fclose(f_mountinfo), 0);
+}
+
+TEST_F(set_layers_via_fds, set_500_layers_via_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower;
+ int layer_fds[500] = { [0 ... 499] = -EBADF };
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ char path[100];
+
+ sprintf(path, "l%d", i);
+ ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0);
+ layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY);
+ ASSERT_GE(layer_fds[i], 0);
+ }
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ fd_work = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(fd_work, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(fd_upper, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0);
+ fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY);
+ ASSERT_GE(fd_lower, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0);
+ ASSERT_EQ(close(fd_work), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0);
+ ASSERT_EQ(close(fd_upper), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0);
+ ASSERT_EQ(close(fd_lower), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_F(set_layers_via_fds, set_override_creds)
+{
+ int fd_context, fd_tmpfs, fd_overlay;
+ int layer_fds[] = { [0 ... 3] = -EBADF };
+ pid_t pid;
+ int pidfd;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+
+ layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[3], 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
+ TH_LOG("sys_fsconfig should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+ ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+ ASSERT_GE(close(pidfd), 0);
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) {
+ TH_LOG("sys_fsconfig should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+ ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+ ASSERT_GE(close(pidfd), 0);
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
+ TH_LOG("sys_fsconfig should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+ ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+ ASSERT_GE(close(pidfd), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_F(set_layers_via_fds, set_override_creds_invalid)
+{
+ int fd_context, fd_tmpfs, fd_overlay, ret;
+ int layer_fds[] = { [0 ... 3] = -EBADF };
+ pid_t pid;
+ int fd_userns1, fd_userns2;
+ int ipc_sockets[2];
+ char c;
+ const unsigned int predictable_fd_context_nr = 123;
+
+ fd_userns1 = get_userns_fd(0, 0, 10000);
+ ASSERT_GE(fd_userns1, 0);
+
+ fd_userns2 = get_userns_fd(0, 1234, 10000);
+ ASSERT_GE(fd_userns2, 0);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_GE(ret, 0);
+
+ pid = create_child(&self->pidfd, 0);
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ if (close(ipc_sockets[0])) {
+ TH_LOG("close should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!switch_userns(fd_userns2, 0, 0, false)) {
+ TH_LOG("switch_userns should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (read_nointr(ipc_sockets[1], &c, 1) != 1) {
+ TH_LOG("read_nointr should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (close(ipc_sockets[1])) {
+ TH_LOG("close should have succeeded");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
+ TH_LOG("sys_fsconfig should have failed");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ ASSERT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true);
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+
+ layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[3], 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+ ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr);
+ ASSERT_EQ(close(fd_context), 0);
+ fd_context = predictable_fd_context_nr;
+ ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1);
+ ASSERT_EQ(close(ipc_sockets[0]), 0);
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+ ASSERT_EQ(close(self->pidfd), 0);
+ self->pidfd = -EBADF;
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
+ ASSERT_EQ(close(layer_fds[i]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+ ASSERT_EQ(close(fd_userns1), 0);
+ ASSERT_EQ(close(fd_userns2), 0);
+}
+
+TEST_F(set_layers_via_fds, set_override_creds_nomknod)
+{
+ int fd_context, fd_tmpfs, fd_overlay;
+ int layer_fds[] = { [0 ... 3] = -EBADF };
+ pid_t pid;
+ int pidfd;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+
+ layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[3], 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ if (!cap_down(CAP_MKNOD))
+ _exit(EXIT_FAILURE);
+
+ if (!cap_down(CAP_SYS_ADMIN))
+ _exit(EXIT_FAILURE);
+
+ if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0))
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+ ASSERT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+ ASSERT_GE(close(pidfd), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(mknodat(fd_overlay, "dev-zero", S_IFCHR | 0644, makedev(1, 5)), -1);
+ ASSERT_EQ(errno, EPERM);
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower;
+ int layer_fds[500] = { [0 ... 499] = -EBADF };
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ char path[100];
+
+ sprintf(path, "l%d", i);
+ ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0);
+ layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY | O_PATH);
+ ASSERT_GE(layer_fds[i], 0);
+ }
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ fd_work = openat(fd_tmpfs, "w", O_DIRECTORY | O_PATH);
+ ASSERT_GE(fd_work, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY | O_PATH);
+ ASSERT_GE(fd_upper, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0);
+ fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY | O_PATH);
+ ASSERT_GE(fd_lower, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0);
+ ASSERT_EQ(close(fd_work), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0);
+ ASSERT_EQ(close(fd_upper), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0);
+ ASSERT_EQ(close(fd_lower), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay, fd_tmp;
+ int layer_fds[] = { [0 ... 8] = -EBADF };
+ bool layers_found[] = { [0 ... 8] = false };
+ size_t len = 0;
+ char *line = NULL;
+ FILE *f_mountinfo;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ fd_tmp = open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(fd_tmp, 0);
+
+ layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[3], 0);
+
+ layer_fds[4] = open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[4], 0);
+
+ layer_fds[5] = open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[5], 0);
+
+ layer_fds[6] = open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[6], 0);
+
+ layer_fds[7] = open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[7], 0);
+
+ layer_fds[8] = open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+ ASSERT_GE(layer_fds[8], 0);
+
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ f_mountinfo = fopen("/proc/self/mountinfo", "r");
+ ASSERT_NE(f_mountinfo, NULL);
+
+ while (getline(&line, &len, f_mountinfo) != -1) {
+ char *haystack = line;
+
+ if (strstr(haystack, "workdir=/tmp/w"))
+ layers_found[0] = true;
+ if (strstr(haystack, "upperdir=/tmp/u"))
+ layers_found[1] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l1"))
+ layers_found[2] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l2"))
+ layers_found[3] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l3"))
+ layers_found[4] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l4"))
+ layers_found[5] = true;
+ if (strstr(haystack, "datadir+=/tmp/d1"))
+ layers_found[6] = true;
+ if (strstr(haystack, "datadir+=/tmp/d2"))
+ layers_found[7] = true;
+ if (strstr(haystack, "datadir+=/tmp/d3"))
+ layers_found[8] = true;
+ }
+ free(line);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(layers_found[i], true);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+ ASSERT_EQ(fclose(f_mountinfo), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/statmount/.gitignore b/tools/testing/selftests/filesystems/statmount/.gitignore
new file mode 100644
index 000000000000..973363ad66a2
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+statmount_test_ns
+/*_test
diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile
new file mode 100644
index 000000000000..8e354fe99b44
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := statmount_test statmount_test_ns listmount_test
+
+include ../../lib.mk
+
+$(OUTPUT)/statmount_test_ns: ../utils.c
diff --git a/tools/testing/selftests/filesystems/statmount/listmount_test.c b/tools/testing/selftests/filesystems/statmount/listmount_test.c
new file mode 100644
index 000000000000..15f0834f7557
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/listmount_test.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "statmount.h"
+#include "../../kselftest_harness.h"
+
+#ifndef LISTMOUNT_REVERSE
+#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */
+#endif
+
+#define LISTMNT_BUFFER 10
+
+/* Check that all mount ids are in increasing order. */
+TEST(listmount_forward)
+{
+ uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+ for (;;) {
+ ssize_t nr_mounts;
+
+ nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+ list, LISTMNT_BUFFER, 0);
+ ASSERT_GE(nr_mounts, 0);
+ if (nr_mounts == 0)
+ break;
+
+ for (size_t cur = 0; cur < nr_mounts; cur++) {
+ if (cur < nr_mounts - 1)
+ ASSERT_LT(list[cur], list[cur + 1]);
+ last_mnt_id = list[cur];
+ }
+ }
+}
+
+/* Check that all mount ids are in decreasing order. */
+TEST(listmount_backward)
+{
+ uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+ for (;;) {
+ ssize_t nr_mounts;
+
+ nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+ list, LISTMNT_BUFFER, LISTMOUNT_REVERSE);
+ ASSERT_GE(nr_mounts, 0);
+ if (nr_mounts == 0)
+ break;
+
+ for (size_t cur = 0; cur < nr_mounts; cur++) {
+ if (cur < nr_mounts - 1)
+ ASSERT_GT(list[cur], list[cur + 1]);
+ last_mnt_id = list[cur];
+ }
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
new file mode 100644
index 000000000000..99e5ad082fb1
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __STATMOUNT_H
+#define __STATMOUNT_H
+
+#include <stdint.h>
+#include <linux/mount.h>
+#include <asm/unistd.h>
+
+#ifndef __NR_statmount
+ #if defined __alpha__
+ #define __NR_statmount 567
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_statmount 4457
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_statmount 6457
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_statmount 5457
+ #endif
+ #else
+ #define __NR_statmount 457
+ #endif
+#endif
+
+#ifndef __NR_listmount
+ #if defined __alpha__
+ #define __NR_listmount 568
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listmount 4458
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listmount 6458
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listmount 5458
+ #endif
+ #else
+ #define __NR_listmount 458
+ #endif
+#endif
+
+static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
+ struct statmount *buf, size_t bufsize,
+ unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER0,
+ .mnt_id = mnt_id,
+ .param = mask,
+ };
+
+ if (mnt_ns_id) {
+ req.size = MNT_ID_REQ_SIZE_VER1;
+ req.mnt_ns_id = mnt_ns_id;
+ }
+
+ return syscall(__NR_statmount, &req, buf, bufsize, flags);
+}
+
+static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
+ uint64_t last_mnt_id, uint64_t list[], size_t num,
+ unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER0,
+ .mnt_id = mnt_id,
+ .param = last_mnt_id,
+ };
+
+ if (mnt_ns_id) {
+ req.size = MNT_ID_REQ_SIZE_VER1;
+ req.mnt_ns_id = mnt_ns_id;
+ }
+
+ return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+#endif /* __STATMOUNT_H */
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
new file mode 100644
index 000000000000..f048042e53e9
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -0,0 +1,702 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <stddef.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <linux/stat.h>
+
+#include "statmount.h"
+#include "../../kselftest.h"
+
+static const char *const known_fs[] = {
+ "9p", "adfs", "affs", "afs", "aio", "anon_inodefs", "apparmorfs",
+ "autofs", "bcachefs", "bdev", "befs", "bfs", "binder", "binfmt_misc",
+ "bpf", "btrfs", "btrfs_test_fs", "ceph", "cgroup", "cgroup2", "cifs",
+ "coda", "configfs", "cpuset", "cramfs", "cxl", "dax", "debugfs",
+ "devpts", "devtmpfs", "dmabuf", "drm", "ecryptfs", "efivarfs", "efs",
+ "erofs", "exfat", "ext2", "ext3", "ext4", "f2fs", "functionfs",
+ "fuse", "fuseblk", "fusectl", "gadgetfs", "gfs2", "gfs2meta", "hfs",
+ "hfsplus", "hostfs", "hpfs", "hugetlbfs", "ibmasmfs", "iomem",
+ "ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos",
+ "nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2",
+ "ocfs2_dlmfs", "omfs", "openpromfs", "overlay", "pipefs", "proc",
+ "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "resctrl", "romfs",
+ "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem", "securityfs",
+ "selinuxfs", "smackfs", "smb3", "sockfs", "spufs", "squashfs", "sysfs",
+ "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
+ "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };
+
+static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags)
+{
+ size_t bufsize = 1 << 15;
+ struct statmount *buf = NULL, *tmp = alloca(bufsize);
+ int tofree = 0;
+ int ret;
+
+ for (;;) {
+ ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags);
+ if (ret != -1)
+ break;
+ if (tofree)
+ free(tmp);
+ if (errno != EOVERFLOW)
+ return NULL;
+ bufsize <<= 1;
+ tofree = 1;
+ tmp = malloc(bufsize);
+ if (!tmp)
+ return NULL;
+ }
+ buf = malloc(tmp->size);
+ if (buf)
+ memcpy(buf, tmp, tmp->size);
+ if (tofree)
+ free(tmp);
+
+ return buf;
+}
+
+static void write_file(const char *path, const char *val)
+{
+ int fd = open(path, O_WRONLY);
+ size_t len = strlen(val);
+ int ret;
+
+ if (fd == -1)
+ ksft_exit_fail_msg("opening %s for write: %s\n", path, strerror(errno));
+
+ ret = write(fd, val, len);
+ if (ret == -1)
+ ksft_exit_fail_msg("writing to %s: %s\n", path, strerror(errno));
+ if (ret != len)
+ ksft_exit_fail_msg("short write to %s\n", path);
+
+ ret = close(fd);
+ if (ret == -1)
+ ksft_exit_fail_msg("closing %s\n", path);
+}
+
+static uint64_t get_mnt_id(const char *name, const char *path, uint64_t mask)
+{
+ struct statx sx;
+ int ret;
+
+ ret = statx(AT_FDCWD, path, 0, mask, &sx);
+ if (ret == -1)
+ ksft_exit_fail_msg("retrieving %s mount ID for %s: %s\n",
+ mask & STATX_MNT_ID_UNIQUE ? "unique" : "old",
+ name, strerror(errno));
+ if (!(sx.stx_mask & mask))
+ ksft_exit_fail_msg("no %s mount ID available for %s\n",
+ mask & STATX_MNT_ID_UNIQUE ? "unique" : "old",
+ name);
+
+ return sx.stx_mnt_id;
+}
+
+
+static char root_mntpoint[] = "/tmp/statmount_test_root.XXXXXX";
+static int orig_root;
+static uint64_t root_id, parent_id;
+static uint32_t old_root_id, old_parent_id;
+static FILE *f_mountinfo;
+
+static void cleanup_namespace(void)
+{
+ int ret;
+
+ ret = fchdir(orig_root);
+ if (ret == -1)
+ ksft_perror("fchdir to original root");
+
+ ret = chroot(".");
+ if (ret == -1)
+ ksft_perror("chroot to original root");
+
+ umount2(root_mntpoint, MNT_DETACH);
+ rmdir(root_mntpoint);
+}
+
+static void setup_namespace(void)
+{
+ int ret;
+ char buf[32];
+ uid_t uid = getuid();
+ gid_t gid = getgid();
+
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ if (ret == -1)
+ ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
+ strerror(errno));
+
+ sprintf(buf, "0 %d 1", uid);
+ write_file("/proc/self/uid_map", buf);
+ write_file("/proc/self/setgroups", "deny");
+ sprintf(buf, "0 %d 1", gid);
+ write_file("/proc/self/gid_map", buf);
+
+ f_mountinfo = fopen("/proc/self/mountinfo", "re");
+ if (!f_mountinfo)
+ ksft_exit_fail_msg("failed to open mountinfo: %s\n",
+ strerror(errno));
+
+ ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ if (ret == -1)
+ ksft_exit_fail_msg("making mount tree private: %s\n",
+ strerror(errno));
+
+ if (!mkdtemp(root_mntpoint))
+ ksft_exit_fail_msg("creating temporary directory %s: %s\n",
+ root_mntpoint, strerror(errno));
+
+ old_parent_id = get_mnt_id("parent", root_mntpoint, STATX_MNT_ID);
+ parent_id = get_mnt_id("parent", root_mntpoint, STATX_MNT_ID_UNIQUE);
+
+ orig_root = open("/", O_PATH);
+ if (orig_root == -1)
+ ksft_exit_fail_msg("opening root directory: %s",
+ strerror(errno));
+
+ atexit(cleanup_namespace);
+
+ ret = mount(root_mntpoint, root_mntpoint, NULL, MS_BIND, NULL);
+ if (ret == -1)
+ ksft_exit_fail_msg("mounting temp root %s: %s\n",
+ root_mntpoint, strerror(errno));
+
+ ret = chroot(root_mntpoint);
+ if (ret == -1)
+ ksft_exit_fail_msg("chroot to temp root %s: %s\n",
+ root_mntpoint, strerror(errno));
+
+ ret = chdir("/");
+ if (ret == -1)
+ ksft_exit_fail_msg("chdir to root: %s\n", strerror(errno));
+
+ old_root_id = get_mnt_id("root", "/", STATX_MNT_ID);
+ root_id = get_mnt_id("root", "/", STATX_MNT_ID_UNIQUE);
+}
+
+static int setup_mount_tree(int log2_num)
+{
+ int ret, i;
+
+ ret = mount("", "/", NULL, MS_REC|MS_SHARED, NULL);
+ if (ret == -1) {
+ ksft_test_result_fail("making mount tree shared: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ for (i = 0; i < log2_num; i++) {
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ if (ret == -1) {
+ ksft_test_result_fail("mounting submount %s: %s\n",
+ root_mntpoint, strerror(errno));
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void test_listmount_empty_root(void)
+{
+ ssize_t res;
+ const unsigned int size = 32;
+ uint64_t list[size];
+
+ res = listmount(LSMT_ROOT, 0, 0, list, size, 0);
+ if (res == -1) {
+ ksft_test_result_fail("listmount: %s\n", strerror(errno));
+ return;
+ }
+ if (res != 1) {
+ ksft_test_result_fail("listmount result is %zi != 1\n", res);
+ return;
+ }
+
+ if (list[0] != root_id) {
+ ksft_test_result_fail("listmount ID doesn't match 0x%llx != 0x%llx\n",
+ (unsigned long long) list[0],
+ (unsigned long long) root_id);
+ return;
+ }
+
+ ksft_test_result_pass("listmount empty root\n");
+}
+
+static void test_statmount_zero_mask(void)
+{
+ struct statmount sm;
+ int ret;
+
+ ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0);
+ if (ret == -1) {
+ ksft_test_result_fail("statmount zero mask: %s\n",
+ strerror(errno));
+ return;
+ }
+ if (sm.size != sizeof(sm)) {
+ ksft_test_result_fail("unexpected size: %u != %u\n",
+ sm.size, (uint32_t) sizeof(sm));
+ return;
+ }
+ if (sm.mask != 0) {
+ ksft_test_result_fail("unexpected mask: 0x%llx != 0x0\n",
+ (unsigned long long) sm.mask);
+ return;
+ }
+
+ ksft_test_result_pass("statmount zero mask\n");
+}
+
+static void test_statmount_mnt_basic(void)
+{
+ struct statmount sm;
+ int ret;
+ uint64_t mask = STATMOUNT_MNT_BASIC;
+
+ ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+ if (ret == -1) {
+ ksft_test_result_fail("statmount mnt basic: %s\n",
+ strerror(errno));
+ return;
+ }
+ if (sm.size != sizeof(sm)) {
+ ksft_test_result_fail("unexpected size: %u != %u\n",
+ sm.size, (uint32_t) sizeof(sm));
+ return;
+ }
+ if (sm.mask != mask) {
+ ksft_test_result_skip("statmount mnt basic unavailable\n");
+ return;
+ }
+
+ if (sm.mnt_id != root_id) {
+ ksft_test_result_fail("unexpected root ID: 0x%llx != 0x%llx\n",
+ (unsigned long long) sm.mnt_id,
+ (unsigned long long) root_id);
+ return;
+ }
+
+ if (sm.mnt_id_old != old_root_id) {
+ ksft_test_result_fail("unexpected old root ID: %u != %u\n",
+ sm.mnt_id_old, old_root_id);
+ return;
+ }
+
+ if (sm.mnt_parent_id != parent_id) {
+ ksft_test_result_fail("unexpected parent ID: 0x%llx != 0x%llx\n",
+ (unsigned long long) sm.mnt_parent_id,
+ (unsigned long long) parent_id);
+ return;
+ }
+
+ if (sm.mnt_parent_id_old != old_parent_id) {
+ ksft_test_result_fail("unexpected old parent ID: %u != %u\n",
+ sm.mnt_parent_id_old, old_parent_id);
+ return;
+ }
+
+ if (sm.mnt_propagation != MS_PRIVATE) {
+ ksft_test_result_fail("unexpected propagation: 0x%llx\n",
+ (unsigned long long) sm.mnt_propagation);
+ return;
+ }
+
+ ksft_test_result_pass("statmount mnt basic\n");
+}
+
+
+static void test_statmount_sb_basic(void)
+{
+ struct statmount sm;
+ int ret;
+ uint64_t mask = STATMOUNT_SB_BASIC;
+ struct statx sx;
+ struct statfs sf;
+
+ ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+ if (ret == -1) {
+ ksft_test_result_fail("statmount sb basic: %s\n",
+ strerror(errno));
+ return;
+ }
+ if (sm.size != sizeof(sm)) {
+ ksft_test_result_fail("unexpected size: %u != %u\n",
+ sm.size, (uint32_t) sizeof(sm));
+ return;
+ }
+ if (sm.mask != mask) {
+ ksft_test_result_skip("statmount sb basic unavailable\n");
+ return;
+ }
+
+ ret = statx(AT_FDCWD, "/", 0, 0, &sx);
+ if (ret == -1) {
+ ksft_test_result_fail("stat root failed: %s\n",
+ strerror(errno));
+ return;
+ }
+
+ if (sm.sb_dev_major != sx.stx_dev_major ||
+ sm.sb_dev_minor != sx.stx_dev_minor) {
+ ksft_test_result_fail("unexpected sb dev %u:%u != %u:%u\n",
+ sm.sb_dev_major, sm.sb_dev_minor,
+ sx.stx_dev_major, sx.stx_dev_minor);
+ return;
+ }
+
+ ret = statfs("/", &sf);
+ if (ret == -1) {
+ ksft_test_result_fail("statfs root failed: %s\n",
+ strerror(errno));
+ return;
+ }
+
+ if (sm.sb_magic != sf.f_type) {
+ ksft_test_result_fail("unexpected sb magic: 0x%llx != 0x%lx\n",
+ (unsigned long long) sm.sb_magic,
+ sf.f_type);
+ return;
+ }
+
+ ksft_test_result_pass("statmount sb basic\n");
+}
+
+static void test_statmount_mnt_point(void)
+{
+ struct statmount *sm;
+
+ sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0);
+ if (!sm) {
+ ksft_test_result_fail("statmount mount point: %s\n",
+ strerror(errno));
+ return;
+ }
+
+ if (!(sm->mask & STATMOUNT_MNT_POINT)) {
+ ksft_test_result_fail("missing STATMOUNT_MNT_POINT in mask\n");
+ return;
+ }
+ if (strcmp(sm->str + sm->mnt_point, "/") != 0) {
+ ksft_test_result_fail("unexpected mount point: '%s' != '/'\n",
+ sm->str + sm->mnt_point);
+ goto out;
+ }
+ ksft_test_result_pass("statmount mount point\n");
+out:
+ free(sm);
+}
+
+static void test_statmount_mnt_root(void)
+{
+ struct statmount *sm;
+ const char *mnt_root, *last_dir, *last_root;
+
+ last_dir = strrchr(root_mntpoint, '/');
+ assert(last_dir);
+ last_dir++;
+
+ sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0);
+ if (!sm) {
+ ksft_test_result_fail("statmount mount root: %s\n",
+ strerror(errno));
+ return;
+ }
+ if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+ ksft_test_result_fail("missing STATMOUNT_MNT_ROOT in mask\n");
+ return;
+ }
+ mnt_root = sm->str + sm->mnt_root;
+ last_root = strrchr(mnt_root, '/');
+ if (last_root)
+ last_root++;
+ else
+ last_root = mnt_root;
+
+ if (strcmp(last_dir, last_root) != 0) {
+ ksft_test_result_fail("unexpected mount root last component: '%s' != '%s'\n",
+ last_root, last_dir);
+ goto out;
+ }
+ ksft_test_result_pass("statmount mount root\n");
+out:
+ free(sm);
+}
+
+static void test_statmount_fs_type(void)
+{
+ struct statmount *sm;
+ const char *fs_type;
+ const char *const *s;
+
+ sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0);
+ if (!sm) {
+ ksft_test_result_fail("statmount fs type: %s\n",
+ strerror(errno));
+ return;
+ }
+ if (!(sm->mask & STATMOUNT_FS_TYPE)) {
+ ksft_test_result_fail("missing STATMOUNT_FS_TYPE in mask\n");
+ return;
+ }
+ fs_type = sm->str + sm->fs_type;
+ for (s = known_fs; s != NULL; s++) {
+ if (strcmp(fs_type, *s) == 0)
+ break;
+ }
+ if (!s)
+ ksft_print_msg("unknown filesystem type: %s\n", fs_type);
+
+ ksft_test_result_pass("statmount fs type\n");
+ free(sm);
+}
+
+static void test_statmount_mnt_opts(void)
+{
+ struct statmount *sm;
+ const char *statmount_opts;
+ char *line = NULL;
+ size_t len = 0;
+
+ sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
+ 0);
+ if (!sm) {
+ ksft_test_result_fail("statmount mnt opts: %s\n",
+ strerror(errno));
+ return;
+ }
+
+ if (!(sm->mask & STATMOUNT_MNT_BASIC)) {
+ ksft_test_result_fail("missing STATMOUNT_MNT_BASIC in mask\n");
+ return;
+ }
+
+ while (getline(&line, &len, f_mountinfo) != -1) {
+ int i;
+ char *p, *p2;
+ unsigned int old_mnt_id;
+
+ old_mnt_id = atoi(line);
+ if (old_mnt_id != sm->mnt_id_old)
+ continue;
+
+ for (p = line, i = 0; p && i < 5; i++)
+ p = strchr(p + 1, ' ');
+ if (!p)
+ continue;
+
+ p2 = strchr(p + 1, ' ');
+ if (!p2)
+ continue;
+ *p2 = '\0';
+ p = strchr(p2 + 1, '-');
+ if (!p)
+ continue;
+ for (p++, i = 0; p && i < 2; i++)
+ p = strchr(p + 1, ' ');
+ if (!p)
+ continue;
+ p++;
+
+ /* skip generic superblock options */
+ if (strncmp(p, "ro", 2) == 0)
+ p += 2;
+ else if (strncmp(p, "rw", 2) == 0)
+ p += 2;
+ if (*p == ',')
+ p++;
+ if (strncmp(p, "sync", 4) == 0)
+ p += 4;
+ if (*p == ',')
+ p++;
+ if (strncmp(p, "dirsync", 7) == 0)
+ p += 7;
+ if (*p == ',')
+ p++;
+ if (strncmp(p, "lazytime", 8) == 0)
+ p += 8;
+ if (*p == ',')
+ p++;
+ p2 = strrchr(p, '\n');
+ if (p2)
+ *p2 = '\0';
+
+ if (sm->mask & STATMOUNT_MNT_OPTS)
+ statmount_opts = sm->str + sm->mnt_opts;
+ else
+ statmount_opts = "";
+ if (strcmp(statmount_opts, p) != 0)
+ ksft_test_result_fail(
+ "unexpected mount options: '%s' != '%s'\n",
+ statmount_opts, p);
+ else
+ ksft_test_result_pass("statmount mount options\n");
+ free(sm);
+ free(line);
+ return;
+ }
+
+ ksft_test_result_fail("didnt't find mount entry\n");
+ free(sm);
+ free(line);
+}
+
+static void test_statmount_string(uint64_t mask, size_t off, const char *name)
+{
+ struct statmount *sm;
+ size_t len, shortsize, exactsize;
+ uint32_t start, i;
+ int ret;
+
+ sm = statmount_alloc(root_id, mask, 0);
+ if (!sm) {
+ ksft_test_result_fail("statmount %s: %s\n", name,
+ strerror(errno));
+ goto out;
+ }
+ if (sm->size < sizeof(*sm)) {
+ ksft_test_result_fail("unexpected size: %u < %u\n",
+ sm->size, (uint32_t) sizeof(*sm));
+ goto out;
+ }
+ if (sm->mask != mask) {
+ ksft_test_result_skip("statmount %s unavailable\n", name);
+ goto out;
+ }
+ len = sm->size - sizeof(*sm);
+ start = ((uint32_t *) sm)[off];
+
+ for (i = start;; i++) {
+ if (i >= len) {
+ ksft_test_result_fail("string out of bounds\n");
+ goto out;
+ }
+ if (!sm->str[i])
+ break;
+ }
+ exactsize = sm->size;
+ shortsize = sizeof(*sm) + i;
+
+ ret = statmount(root_id, 0, mask, sm, exactsize, 0);
+ if (ret == -1) {
+ ksft_test_result_fail("statmount exact size: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ errno = 0;
+ ret = statmount(root_id, 0, mask, sm, shortsize, 0);
+ if (ret != -1 || errno != EOVERFLOW) {
+ ksft_test_result_fail("should have failed with EOVERFLOW: %s\n",
+ strerror(errno));
+ goto out;
+ }
+
+ ksft_test_result_pass("statmount string %s\n", name);
+out:
+ free(sm);
+}
+
+static void test_listmount_tree(void)
+{
+ ssize_t res;
+ const unsigned int log2_num = 4;
+ const unsigned int step = 3;
+ const unsigned int size = (1 << log2_num) + step + 1;
+ size_t num, expect = 1 << log2_num;
+ uint64_t list[size];
+ uint64_t list2[size];
+ size_t i;
+
+
+ res = setup_mount_tree(log2_num);
+ if (res == -1)
+ return;
+
+ num = res = listmount(LSMT_ROOT, 0, 0, list, size, 0);
+ if (res == -1) {
+ ksft_test_result_fail("listmount: %s\n", strerror(errno));
+ return;
+ }
+ if (num != expect) {
+ ksft_test_result_fail("listmount result is %zi != %zi\n",
+ res, expect);
+ return;
+ }
+
+ for (i = 0; i < size - step;) {
+ res = listmount(LSMT_ROOT, 0, i ? list2[i - 1] : 0, list2 + i, step, 0);
+ if (res == -1)
+ ksft_test_result_fail("short listmount: %s\n",
+ strerror(errno));
+ i += res;
+ if (res < step)
+ break;
+ }
+ if (i != num) {
+ ksft_test_result_fail("different number of entries: %zu != %zu\n",
+ i, num);
+ return;
+ }
+ for (i = 0; i < num; i++) {
+ if (list2[i] != list[i]) {
+ ksft_test_result_fail("different value for entry %zu: 0x%llx != 0x%llx\n",
+ i,
+ (unsigned long long) list2[i],
+ (unsigned long long) list[i]);
+ }
+ }
+
+ ksft_test_result_pass("listmount tree\n");
+}
+
+#define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t))
+
+int main(void)
+{
+ int ret;
+ uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |
+ STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT |
+ STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID;
+
+ ksft_print_header();
+
+ ret = statmount(0, 0, 0, NULL, 0, 0);
+ assert(ret == -1);
+ if (errno == ENOSYS)
+ ksft_exit_skip("statmount() syscall not supported\n");
+
+ setup_namespace();
+
+ ksft_set_plan(15);
+ test_listmount_empty_root();
+ test_statmount_zero_mask();
+ test_statmount_mnt_basic();
+ test_statmount_sb_basic();
+ test_statmount_mnt_root();
+ test_statmount_mnt_point();
+ test_statmount_fs_type();
+ test_statmount_mnt_opts();
+ test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root");
+ test_statmount_string(STATMOUNT_MNT_POINT, str_off(mnt_point), "mount point");
+ test_statmount_string(STATMOUNT_FS_TYPE, str_off(fs_type), "fs type");
+ test_statmount_string(all_mask, str_off(mnt_root), "mount root & all");
+ test_statmount_string(all_mask, str_off(mnt_point), "mount point & all");
+ test_statmount_string(all_mask, str_off(fs_type), "fs type & all");
+
+ test_listmount_tree();
+
+
+ if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+ ksft_exit_fail();
+ else
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
new file mode 100644
index 000000000000..605a3fa16bf7
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <linux/nsfs.h>
+#include <linux/stat.h>
+
+#include "statmount.h"
+#include "../utils.h"
+#include "../../kselftest.h"
+
+#define NSID_PASS 0
+#define NSID_FAIL 1
+#define NSID_SKIP 2
+#define NSID_ERROR 3
+
+static void handle_result(int ret, const char *testname)
+{
+ if (ret == NSID_PASS)
+ ksft_test_result_pass("%s\n", testname);
+ else if (ret == NSID_FAIL)
+ ksft_test_result_fail("%s\n", testname);
+ else if (ret == NSID_ERROR)
+ ksft_exit_fail_msg("%s\n", testname);
+ else
+ ksft_test_result_skip("%s\n", testname);
+}
+
+static inline int wait_for_pid(pid_t pid)
+{
+ int status, ret;
+
+again:
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+
+ ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
+ return -1;
+ }
+
+ if (!WIFEXITED(status)) {
+ ksft_print_msg(
+ "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
+ WIFSIGNALED(status), WTERMSIG(status));
+ return -1;
+ }
+
+ ret = WEXITSTATUS(status);
+ return ret;
+}
+
+static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id)
+{
+ int fd = open(mnt_ns, O_RDONLY);
+
+ if (fd < 0) {
+ ksft_print_msg("failed to open for ns %s: %s\n",
+ mnt_ns, strerror(errno));
+ sleep(60);
+ return NSID_ERROR;
+ }
+
+ if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) {
+ ksft_print_msg("failed to get the nsid for ns %s: %s\n",
+ mnt_ns, strerror(errno));
+ return NSID_ERROR;
+ }
+ close(fd);
+ return NSID_PASS;
+}
+
+static int setup_namespace(void)
+{
+ if (setup_userns() != 0)
+ return NSID_ERROR;
+
+ return NSID_PASS;
+}
+
+static int _test_statmount_mnt_ns_id(void)
+{
+ struct statmount sm;
+ uint64_t mnt_ns_id;
+ uint64_t root_id;
+ int ret;
+
+ ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id);
+ if (ret != NSID_PASS)
+ return ret;
+
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ return NSID_ERROR;
+
+ ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
+ if (ret == -1) {
+ ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
+ return NSID_ERROR;
+ }
+
+ if (sm.size != sizeof(sm)) {
+ ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+ (uint32_t)sizeof(sm));
+ return NSID_FAIL;
+ }
+ if (sm.mask != STATMOUNT_MNT_NS_ID) {
+ ksft_print_msg("statmount mnt ns id unavailable\n");
+ return NSID_SKIP;
+ }
+
+ if (sm.mnt_ns_id != mnt_ns_id) {
+ ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n",
+ (unsigned long long)sm.mnt_ns_id,
+ (unsigned long long)mnt_ns_id);
+ return NSID_FAIL;
+ }
+
+ return NSID_PASS;
+}
+
+static void test_statmount_mnt_ns_id(void)
+{
+ pid_t pid;
+ int ret;
+
+ pid = fork();
+ if (pid < 0)
+ ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno));
+
+ /* We're the original pid, wait for the result. */
+ if (pid != 0) {
+ ret = wait_for_pid(pid);
+ handle_result(ret, "test statmount ns id");
+ return;
+ }
+
+ ret = setup_namespace();
+ if (ret != NSID_PASS)
+ exit(ret);
+ ret = _test_statmount_mnt_ns_id();
+ exit(ret);
+}
+
+static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts)
+{
+ uint64_t list[256];
+ uint64_t mnt_ns_id;
+ uint64_t nr_mounts;
+ char buf[256];
+ int ret;
+
+ /* Get the mount ns id for our child. */
+ snprintf(buf, sizeof(buf), "/proc/%lu/ns/mnt", (unsigned long)pid);
+ ret = get_mnt_ns_id(buf, &mnt_ns_id);
+
+ nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0);
+ if (nr_mounts == (uint64_t)-1) {
+ ksft_print_msg("listmount: %s\n", strerror(errno));
+ return NSID_ERROR;
+ }
+
+ if (nr_mounts != child_nr_mounts) {
+ ksft_print_msg("listmount results is %zi != %zi\n", nr_mounts,
+ child_nr_mounts);
+ return NSID_FAIL;
+ }
+
+ /* Validate that all of our entries match our mnt_ns_id. */
+ for (int i = 0; i < nr_mounts; i++) {
+ struct statmount sm;
+
+ ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm,
+ sizeof(sm), 0);
+ if (ret < 0) {
+ ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
+ return NSID_ERROR;
+ }
+
+ if (sm.mask != STATMOUNT_MNT_NS_ID) {
+ ksft_print_msg("statmount mnt ns id unavailable\n");
+ return NSID_SKIP;
+ }
+
+ if (sm.mnt_ns_id != mnt_ns_id) {
+ ksft_print_msg("listmount gave us the wrong ns id: 0x%llx != 0x%llx\n",
+ (unsigned long long)sm.mnt_ns_id,
+ (unsigned long long)mnt_ns_id);
+ return NSID_FAIL;
+ }
+ }
+
+ return NSID_PASS;
+}
+
+static void test_listmount_ns(void)
+{
+ uint64_t nr_mounts;
+ char pval;
+ int child_ready_pipe[2];
+ int parent_ready_pipe[2];
+ pid_t pid;
+ int ret, child_ret;
+
+ if (pipe(child_ready_pipe) < 0)
+ ksft_exit_fail_msg("failed to create the child pipe: %s\n",
+ strerror(errno));
+ if (pipe(parent_ready_pipe) < 0)
+ ksft_exit_fail_msg("failed to create the parent pipe: %s\n",
+ strerror(errno));
+
+ pid = fork();
+ if (pid < 0)
+ ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno));
+
+ if (pid == 0) {
+ char cval;
+ uint64_t list[256];
+
+ close(child_ready_pipe[0]);
+ close(parent_ready_pipe[1]);
+
+ ret = setup_namespace();
+ if (ret != NSID_PASS)
+ exit(ret);
+
+ nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 256, 0);
+ if (nr_mounts == (uint64_t)-1) {
+ ksft_print_msg("listmount: %s\n", strerror(errno));
+ exit(NSID_FAIL);
+ }
+
+ /*
+ * Tell our parent how many mounts we have, and then wait for it
+ * to tell us we're done.
+ */
+ if (write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)) !=
+ sizeof(nr_mounts))
+ ret = NSID_ERROR;
+ if (read(parent_ready_pipe[0], &cval, sizeof(cval)) != sizeof(cval))
+ ret = NSID_ERROR;
+ exit(NSID_PASS);
+ }
+
+ close(child_ready_pipe[1]);
+ close(parent_ready_pipe[0]);
+
+ /* Wait until the child has created everything. */
+ if (read(child_ready_pipe[0], &nr_mounts, sizeof(nr_mounts)) !=
+ sizeof(nr_mounts))
+ ret = NSID_ERROR;
+
+ ret = validate_external_listmount(pid, nr_mounts);
+
+ if (write(parent_ready_pipe[1], &pval, sizeof(pval)) != sizeof(pval))
+ ret = NSID_ERROR;
+
+ child_ret = wait_for_pid(pid);
+ if (child_ret != NSID_PASS)
+ ret = child_ret;
+ handle_result(ret, "test listmount ns id");
+}
+
+int main(void)
+{
+ int ret;
+
+ ksft_print_header();
+ ret = statmount(0, 0, 0, NULL, 0, 0);
+ assert(ret == -1);
+ if (errno == ENOSYS)
+ ksft_exit_skip("statmount() syscall not supported\n");
+
+ ksft_set_plan(2);
+ test_statmount_mnt_ns_id();
+ test_listmount_ns();
+
+ if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+ ksft_exit_fail();
+ else
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
new file mode 100644
index 000000000000..c43a69dffd83
--- /dev/null
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -0,0 +1,589 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fcntl.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <grp.h>
+#include <linux/limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/fsuid.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
+#include <sys/mount.h>
+
+#include "../kselftest.h"
+#include "wrappers.h"
+#include "utils.h"
+
+#define MAX_USERNS_LEVEL 32
+
+#define syserror(format, ...) \
+ ({ \
+ fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \
+ (-errno); \
+ })
+
+#define syserror_set(__ret__, format, ...) \
+ ({ \
+ typeof(__ret__) __internal_ret__ = (__ret__); \
+ errno = labs(__ret__); \
+ fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \
+ __internal_ret__; \
+ })
+
+#define STRLITERALLEN(x) (sizeof(""x"") - 1)
+
+#define INTTYPE_TO_STRLEN(type) \
+ (2 + (sizeof(type) <= 1 \
+ ? 3 \
+ : sizeof(type) <= 2 \
+ ? 5 \
+ : sizeof(type) <= 4 \
+ ? 10 \
+ : sizeof(type) <= 8 ? 20 : sizeof(int[-2 * (sizeof(type) > 8)])))
+
+#define list_for_each(__iterator, __list) \
+ for (__iterator = (__list)->next; __iterator != __list; __iterator = __iterator->next)
+
+typedef enum idmap_type_t {
+ ID_TYPE_UID,
+ ID_TYPE_GID
+} idmap_type_t;
+
+struct id_map {
+ idmap_type_t map_type;
+ __u32 nsid;
+ __u32 hostid;
+ __u32 range;
+};
+
+struct list {
+ void *elem;
+ struct list *next;
+ struct list *prev;
+};
+
+struct userns_hierarchy {
+ int fd_userns;
+ int fd_event;
+ unsigned int level;
+ struct list id_map;
+};
+
+static inline void list_init(struct list *list)
+{
+ list->elem = NULL;
+ list->next = list->prev = list;
+}
+
+static inline int list_empty(const struct list *list)
+{
+ return list == list->next;
+}
+
+static inline void __list_add(struct list *new, struct list *prev, struct list *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add_tail(struct list *head, struct list *list)
+{
+ __list_add(list, head->prev, head);
+}
+
+static inline void list_del(struct list *list)
+{
+ struct list *next, *prev;
+
+ next = list->next;
+ prev = list->prev;
+ next->prev = prev;
+ prev->next = next;
+}
+
+static ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = read(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = write(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+#define __STACK_SIZE (8 * 1024 * 1024)
+static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
+{
+ void *stack;
+
+ stack = malloc(__STACK_SIZE);
+ if (!stack)
+ return -ENOMEM;
+
+#ifdef __ia64__
+ return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#else
+ return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#endif
+}
+
+static int get_userns_fd_cb(void *data)
+{
+ for (;;)
+ pause();
+ _exit(0);
+}
+
+static int wait_for_pid(pid_t pid)
+{
+ int status, ret;
+
+again:
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+
+ return -1;
+ }
+
+ if (!WIFEXITED(status))
+ return -1;
+
+ return WEXITSTATUS(status);
+}
+
+static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size)
+{
+ int fd = -EBADF, setgroups_fd = -EBADF;
+ int fret = -1;
+ int ret;
+ char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
+ STRLITERALLEN("/setgroups") + 1];
+
+ if (geteuid() != 0 && map_type == ID_TYPE_GID) {
+ ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
+ if (ret < 0 || ret >= sizeof(path))
+ goto out;
+
+ setgroups_fd = open(path, O_WRONLY | O_CLOEXEC);
+ if (setgroups_fd < 0 && errno != ENOENT) {
+ syserror("Failed to open \"%s\"", path);
+ goto out;
+ }
+
+ if (setgroups_fd >= 0) {
+ ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n"));
+ if (ret != STRLITERALLEN("deny\n")) {
+ syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
+ goto out;
+ }
+ }
+ }
+
+ ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, map_type == ID_TYPE_UID ? 'u' : 'g');
+ if (ret < 0 || ret >= sizeof(path))
+ goto out;
+
+ fd = open(path, O_WRONLY | O_CLOEXEC);
+ if (fd < 0) {
+ syserror("Failed to open \"%s\"", path);
+ goto out;
+ }
+
+ ret = write_nointr(fd, buf, buf_size);
+ if (ret != buf_size) {
+ syserror("Failed to write %cid mapping to \"%s\"",
+ map_type == ID_TYPE_UID ? 'u' : 'g', path);
+ goto out;
+ }
+
+ fret = 0;
+out:
+ close(fd);
+ close(setgroups_fd);
+
+ return fret;
+}
+
+static int map_ids_from_idmap(struct list *idmap, pid_t pid)
+{
+ int fill, left;
+ char mapbuf[4096] = {};
+ bool had_entry = false;
+ idmap_type_t map_type, u_or_g;
+
+ if (list_empty(idmap))
+ return 0;
+
+ for (map_type = ID_TYPE_UID, u_or_g = 'u';
+ map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') {
+ char *pos = mapbuf;
+ int ret;
+ struct list *iterator;
+
+
+ list_for_each(iterator, idmap) {
+ struct id_map *map = iterator->elem;
+ if (map->map_type != map_type)
+ continue;
+
+ had_entry = true;
+
+ left = 4096 - (pos - mapbuf);
+ fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range);
+ /*
+ * The kernel only takes <= 4k for writes to
+ * /proc/<pid>/{g,u}id_map
+ */
+ if (fill <= 0 || fill >= left)
+ return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g);
+
+ pos += fill;
+ }
+ if (!had_entry)
+ continue;
+
+ ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf);
+ if (ret < 0)
+ return syserror("Failed to write mapping: %s", mapbuf);
+
+ memset(mapbuf, 0, sizeof(mapbuf));
+ }
+
+ return 0;
+}
+
+static int get_userns_fd_from_idmap(struct list *idmap)
+{
+ int ret;
+ pid_t pid;
+ char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
+ STRLITERALLEN("/ns/user") + 1];
+
+ pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS);
+ if (pid < 0)
+ return -errno;
+
+ ret = map_ids_from_idmap(idmap, pid);
+ if (ret < 0)
+ return ret;
+
+ ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid);
+ if (ret < 0 || (size_t)ret >= sizeof(path_ns))
+ ret = -EIO;
+ else
+ ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY);
+
+ (void)kill(pid, SIGKILL);
+ (void)wait_for_pid(pid);
+ return ret;
+}
+
+int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range)
+{
+ struct list head, uid_mapl, gid_mapl;
+ struct id_map uid_map = {
+ .map_type = ID_TYPE_UID,
+ .nsid = nsid,
+ .hostid = hostid,
+ .range = range,
+ };
+ struct id_map gid_map = {
+ .map_type = ID_TYPE_GID,
+ .nsid = nsid,
+ .hostid = hostid,
+ .range = range,
+ };
+
+ list_init(&head);
+ uid_mapl.elem = &uid_map;
+ gid_mapl.elem = &gid_map;
+ list_add_tail(&head, &uid_mapl);
+ list_add_tail(&head, &gid_mapl);
+
+ return get_userns_fd_from_idmap(&head);
+}
+
+bool switch_ids(uid_t uid, gid_t gid)
+{
+ if (setgroups(0, NULL))
+ return syserror("failure: setgroups");
+
+ if (setresgid(gid, gid, gid))
+ return syserror("failure: setresgid");
+
+ if (setresuid(uid, uid, uid))
+ return syserror("failure: setresuid");
+
+ /* Ensure we can access proc files from processes we can ptrace. */
+ if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
+ return syserror("failure: make dumpable");
+
+ return true;
+}
+
+static int create_userns_hierarchy(struct userns_hierarchy *h);
+
+static int userns_fd_cb(void *data)
+{
+ struct userns_hierarchy *h = data;
+ char c;
+ int ret;
+
+ ret = read_nointr(h->fd_event, &c, 1);
+ if (ret < 0)
+ return syserror("failure: read from socketpair");
+
+ /* Only switch ids if someone actually wrote a mapping for us. */
+ if (c == '1') {
+ if (!switch_ids(0, 0))
+ return syserror("failure: switch ids to 0");
+ }
+
+ ret = write_nointr(h->fd_event, "1", 1);
+ if (ret < 0)
+ return syserror("failure: write to socketpair");
+
+ ret = create_userns_hierarchy(++h);
+ if (ret < 0)
+ return syserror("failure: userns level %d", h->level);
+
+ return 0;
+}
+
+static int create_userns_hierarchy(struct userns_hierarchy *h)
+{
+ int fret = -1;
+ char c;
+ int fd_socket[2];
+ int fd_userns = -EBADF, ret = -1;
+ ssize_t bytes;
+ pid_t pid;
+ char path[256];
+
+ if (h->level == MAX_USERNS_LEVEL)
+ return 0;
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket);
+ if (ret < 0)
+ return syserror("failure: create socketpair");
+
+ /* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */
+ h->fd_event = fd_socket[1];
+ pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM);
+ if (pid < 0) {
+ syserror("failure: userns level %d", h->level);
+ goto out_close;
+ }
+
+ ret = map_ids_from_idmap(&h->id_map, pid);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ syserror("failure: writing id mapping for userns level %d for %d", h->level, pid);
+ goto out_wait;
+ }
+
+ if (!list_empty(&h->id_map))
+ bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */
+ else
+ bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */
+ if (bytes < 0) {
+ kill(pid, SIGKILL);
+ syserror("failure: write to socketpair");
+ goto out_wait;
+ }
+
+ /* Wait for child to set*id() and become dumpable. */
+ bytes = read_nointr(fd_socket[0], &c, 1);
+ if (bytes < 0) {
+ kill(pid, SIGKILL);
+ syserror("failure: read from socketpair");
+ goto out_wait;
+ }
+
+ snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+ fd_userns = open(path, O_RDONLY | O_CLOEXEC);
+ if (fd_userns < 0) {
+ kill(pid, SIGKILL);
+ syserror("failure: open userns level %d for %d", h->level, pid);
+ goto out_wait;
+ }
+
+ fret = 0;
+
+out_wait:
+ if (!wait_for_pid(pid) && !fret) {
+ h->fd_userns = fd_userns;
+ fd_userns = -EBADF;
+ }
+
+out_close:
+ if (fd_userns >= 0)
+ close(fd_userns);
+ close(fd_socket[0]);
+ close(fd_socket[1]);
+ return fret;
+}
+
+static int write_file(const char *path, const char *val)
+{
+ int fd = open(path, O_WRONLY);
+ size_t len = strlen(val);
+ int ret;
+
+ if (fd == -1) {
+ ksft_print_msg("opening %s for write: %s\n", path, strerror(errno));
+ return -1;
+ }
+
+ ret = write(fd, val, len);
+ if (ret == -1) {
+ ksft_print_msg("writing to %s: %s\n", path, strerror(errno));
+ return -1;
+ }
+ if (ret != len) {
+ ksft_print_msg("short write to %s\n", path);
+ return -1;
+ }
+
+ ret = close(fd);
+ if (ret == -1) {
+ ksft_print_msg("closing %s\n", path);
+ return -1;
+ }
+
+ return 0;
+}
+
+int setup_userns(void)
+{
+ int ret;
+ char buf[32];
+ uid_t uid = getuid();
+ gid_t gid = getgid();
+
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ if (ret) {
+ ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
+ strerror(errno));
+ return ret;
+ }
+
+ sprintf(buf, "0 %d 1", uid);
+ ret = write_file("/proc/self/uid_map", buf);
+ if (ret)
+ return ret;
+ ret = write_file("/proc/self/setgroups", "deny");
+ if (ret)
+ return ret;
+ sprintf(buf, "0 %d 1", gid);
+ ret = write_file("/proc/self/gid_map", buf);
+ if (ret)
+ return ret;
+
+ ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ if (ret) {
+ ksft_print_msg("making mount tree private: %s\n", strerror(errno));
+ return ret;
+ }
+
+ return 0;
+}
+
+/* caps_down - lower all effective caps */
+int caps_down(void)
+{
+ bool fret = false;
+ cap_t caps = NULL;
+ int ret = -1;
+
+ caps = cap_get_proc();
+ if (!caps)
+ goto out;
+
+ ret = cap_clear_flag(caps, CAP_EFFECTIVE);
+ if (ret)
+ goto out;
+
+ ret = cap_set_proc(caps);
+ if (ret)
+ goto out;
+
+ fret = true;
+
+out:
+ cap_free(caps);
+ return fret;
+}
+
+/* cap_down - lower an effective cap */
+int cap_down(cap_value_t down)
+{
+ bool fret = false;
+ cap_t caps = NULL;
+ cap_value_t cap = down;
+ int ret = -1;
+
+ caps = cap_get_proc();
+ if (!caps)
+ goto out;
+
+ ret = cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap, 0);
+ if (ret)
+ goto out;
+
+ ret = cap_set_proc(caps);
+ if (ret)
+ goto out;
+
+ fret = true;
+
+out:
+ cap_free(caps);
+ return fret;
+}
+
+uint64_t get_unique_mnt_id(const char *path)
+{
+ struct statx sx;
+ int ret;
+
+ ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx);
+ if (ret == -1) {
+ ksft_print_msg("retrieving unique mount ID for %s: %s\n", path,
+ strerror(errno));
+ return 0;
+ }
+
+ if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) {
+ ksft_print_msg("no unique mount ID available for %s\n", path);
+ return 0;
+ }
+
+ return sx.stx_mnt_id;
+}
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
new file mode 100644
index 000000000000..70f7ccc607f4
--- /dev/null
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IDMAP_UTILS_H
+#define __IDMAP_UTILS_H
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <errno.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/fsuid.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern int get_userns_fd(unsigned long nsid, unsigned long hostid,
+ unsigned long range);
+
+extern int caps_down(void);
+extern int cap_down(cap_value_t down);
+
+extern bool switch_ids(uid_t uid, gid_t gid);
+extern int setup_userns(void);
+
+static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
+{
+ if (setns(fd, CLONE_NEWUSER))
+ return false;
+
+ if (!switch_ids(uid, gid))
+ return false;
+
+ if (drop_caps && !caps_down())
+ return false;
+
+ return true;
+}
+
+extern uint64_t get_unique_mnt_id(const char *path);
+
+#endif /* __IDMAP_UTILS_H */
diff --git a/tools/testing/selftests/filesystems/wrappers.h b/tools/testing/selftests/filesystems/wrappers.h
new file mode 100644
index 000000000000..420ae4f908cf
--- /dev/null
+++ b/tools/testing/selftests/filesystems/wrappers.h
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+#ifndef __SELFTEST_OVERLAYFS_WRAPPERS_H__
+#define __SELFTEST_OVERLAYFS_WRAPPERS_H__
+
+#define _GNU_SOURCE
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+#ifndef STATX_MNT_ID_UNIQUE
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
+#endif
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+ const char *value, int aux)
+{
+ return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+ unsigned int attr_flags)
+{
+ return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+static inline int sys_mount(const char *src, const char *tgt, const char *fst,
+ unsigned long flags, const void *data)
+{
+ return syscall(__NR_mount, src, tgt, fst, flags, data);
+}
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+#ifndef MOVE_MOUNT_T_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#endif
+
+#ifndef __NR_move_mount
+ #if defined __alpha__
+ #define __NR_move_mount 539
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_move_mount 4429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_move_mount 6429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_move_mount 5429
+ #endif
+ #else
+ #define __NR_move_mount 429
+ #endif
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+#ifndef __NR_open_tree
+ #if defined __alpha__
+ #define __NR_open_tree 538
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_open_tree 4428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_open_tree 6428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_open_tree 5428
+ #endif
+ #else
+ #define __NR_open_tree 428
+ #endif
+#endif
+
+static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
+{
+ return syscall(__NR_open_tree, dfd, filename, flags);
+}
+
+#endif