aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/namespace.c15
-rw-r--r--fs/nsfs.c5
-rw-r--r--include/linux/mnt_namespace.h2
-rw-r--r--include/linux/nsproxy.h24
-rw-r--r--include/linux/proc_fs.h2
-rw-r--r--include/linux/proc_ns.h4
-rw-r--r--ipc/namespace.c7
-rw-r--r--kernel/cgroup/namespace.c5
-rw-r--r--kernel/nsproxy.c305
-rw-r--r--kernel/pid_namespace.c5
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/user_namespace.c8
-rw-r--r--kernel/utsname.c5
-rw-r--r--net/core/net_namespace.c5
-rw-r--r--tools/testing/selftests/pidfd/.gitignore1
-rw-r--r--tools/testing/selftests/pidfd/Makefile3
-rw-r--r--tools/testing/selftests/pidfd/config6
-rw-r--r--tools/testing/selftests/pidfd/pidfd_setns_test.c473
18 files changed, 833 insertions, 47 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index a6baee3c7904..6d499ab254b7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1786,6 +1786,11 @@ static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
return container_of(ns, struct mnt_namespace, ns);
}
+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
+{
+ return &mnt->ns;
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
@@ -4013,16 +4018,18 @@ static void mntns_put(struct ns_common *ns)
put_mnt_ns(to_mnt_ns(ns));
}
-static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
- struct fs_struct *fs = current->fs;
+ struct nsproxy *nsproxy = nsset->nsproxy;
+ struct fs_struct *fs = nsset->fs;
struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
+ struct user_namespace *user_ns = nsset->cred->user_ns;
struct path root;
int err;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(user_ns, CAP_SYS_CHROOT) ||
+ !ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (is_anon_ns(mnt_ns))
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 4f1205725cfe..800c1d0eb0d0 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
return res;
}
+bool proc_ns_file(const struct file *file)
+{
+ return file->f_op == &ns_file_operations;
+}
+
struct file *proc_ns_fget(int fd)
{
struct file *file;
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 35942084cd40..8f882f5881e8 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -6,10 +6,12 @@
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
+struct ns_common;
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
+extern struct ns_common *from_mnt_ns(struct mnt_namespace *);
extern const struct file_operations proc_mounts_operations;
extern const struct file_operations proc_mountinfo_operations;
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 074f395b9ad2..cdb171efc7cb 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -42,6 +42,30 @@ struct nsproxy {
extern struct nsproxy init_nsproxy;
/*
+ * A structure to encompass all bits needed to install
+ * a partial or complete new set of namespaces.
+ *
+ * If a new user namespace is requested cred will
+ * point to a modifiable set of credentials. If a pointer
+ * to a modifiable set is needed nsset_cred() must be
+ * used and tested.
+ */
+struct nsset {
+ unsigned flags;
+ struct nsproxy *nsproxy;
+ struct fs_struct *fs;
+ const struct cred *cred;
+};
+
+static inline struct cred *nsset_cred(struct nsset *set)
+{
+ if (set->flags & CLONE_NEWUSER)
+ return (struct cred *)set->cred;
+
+ return NULL;
+}
+
+/*
* the namespaces access rules are:
*
* 1. only current task is allowed to change tsk->nsproxy pointer or
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 45c05fd9c99d..0cfc44d7ac74 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -179,4 +179,6 @@ static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
return inode->i_sb->s_fs_info;
}
+bool proc_ns_file(const struct file *file);
+
#endif /* _LINUX_PROC_FS_H */
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 6abe85c34681..75807ecef880 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -8,7 +8,7 @@
#include <linux/ns_common.h>
struct pid_namespace;
-struct nsproxy;
+struct nsset;
struct path;
struct task_struct;
struct inode;
@@ -19,7 +19,7 @@ struct proc_ns_operations {
int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
- int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+ int (*install)(struct nsset *nsset, struct ns_common *ns);
struct user_namespace *(*owner)(struct ns_common *ns);
struct ns_common *(*get_parent)(struct ns_common *ns);
} __randomize_layout;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca1476ca51..fdc3b5f3f53a 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -177,15 +177,14 @@ static void ipcns_put(struct ns_common *ns)
return put_ipc_ns(to_ipc_ns(ns));
}
-static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int ipcns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct ipc_namespace *ns = to_ipc_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- /* Ditch state from the old ipc namespace */
- exit_sem(current);
put_ipc_ns(nsproxy->ipc_ns);
nsproxy->ipc_ns = get_ipc_ns(ns);
return 0;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..812a61afd538 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
return container_of(ns, struct cgroup_namespace, ns);
}
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ed9882108cd2..b03df67621d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,8 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -257,37 +259,296 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
-SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+static int check_setns_flags(unsigned long flags)
{
- struct task_struct *tsk = current;
- struct nsproxy *new_nsproxy;
- struct file *file;
- struct ns_common *ns;
- int err;
+ if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
+ CLONE_NEWCGROUP)))
+ return -EINVAL;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
+#ifndef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET)
+ return -EINVAL;
+#endif
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ return 0;
+}
+
+static void put_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+
+ if (flags & CLONE_NEWUSER)
+ put_cred(nsset_cred(nsset));
+ /*
+ * We only created a temporary copy if we attached to more than just
+ * the mount namespace.
+ */
+ if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
+ free_fs_struct(nsset->fs);
+ if (nsset->nsproxy)
+ free_nsproxy(nsset->nsproxy);
+}
+
+static int prepare_nsset(unsigned flags, struct nsset *nsset)
+{
+ struct task_struct *me = current;
+
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ if (IS_ERR(nsset->nsproxy))
+ return PTR_ERR(nsset->nsproxy);
- new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
- if (IS_ERR(new_nsproxy)) {
- err = PTR_ERR(new_nsproxy);
+ if (flags & CLONE_NEWUSER)
+ nsset->cred = prepare_creds();
+ else
+ nsset->cred = current_cred();
+ if (!nsset->cred)
goto out;
+
+ /* Only create a temporary copy of fs_struct if we really need to. */
+ if (flags == CLONE_NEWNS) {
+ nsset->fs = me->fs;
+ } else if (flags & CLONE_NEWNS) {
+ nsset->fs = copy_fs_struct(me->fs);
+ if (!nsset->fs)
+ goto out;
}
- err = ns->ops->install(new_nsproxy, ns);
- if (err) {
- free_nsproxy(new_nsproxy);
- goto out;
+ nsset->flags = flags;
+ return 0;
+
+out:
+ put_nsset(nsset);
+ return -ENOMEM;
+}
+
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
+/*
+ * This is the inverse operation to unshare().
+ * Ordering is equivalent to the standard ordering used everywhere else
+ * during unshare and process creation. The switch to the new set of
+ * namespaces occurs at the point of no return after installation of
+ * all requested namespaces was successful in commit_nsset().
+ */
+static int validate_nsset(struct nsset *nsset, struct pid *pid)
+{
+ int ret = 0;
+ unsigned flags = nsset->flags;
+ struct user_namespace *user_ns = NULL;
+ struct pid_namespace *pid_ns = NULL;
+ struct nsproxy *nsp;
+ struct task_struct *tsk;
+
+ /* Take a "snapshot" of the target task's namespaces. */
+ rcu_read_lock();
+ tsk = pid_task(pid, PIDTYPE_PID);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+
+ task_lock(tsk);
+ nsp = tsk->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ task_unlock(tsk);
+ if (!nsp) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ pid_ns = task_active_pid_ns(tsk);
+ if (unlikely(!pid_ns)) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_pid_ns(pid_ns);
+ }
+#endif
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ user_ns = get_user_ns(__task_cred(tsk)->user_ns);
+#endif
+ rcu_read_unlock();
+
+ /*
+ * Install requested namespaces. The caller will have
+ * verified earlier that the requested namespaces are
+ * supported on this kernel. We don't report errors here
+ * if a namespace is requested that isn't supported.
+ */
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ ret = validate_ns(nsset, &user_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+ if (flags & CLONE_NEWNS) {
+ ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS) {
+ ret = validate_ns(nsset, &nsp->uts_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC) {
+ ret = validate_ns(nsset, &nsp->ipc_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ ret = validate_ns(nsset, &pid_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP) {
+ ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET) {
+ ret = validate_ns(nsset, &nsp->net_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+out:
+ if (pid_ns)
+ put_pid_ns(pid_ns);
+ if (nsp)
+ put_nsproxy(nsp);
+ put_user_ns(user_ns);
+
+ return ret;
+}
+
+/*
+ * This is the point of no return. There are just a few namespaces
+ * that do some actual work here and it's sufficiently minimal that
+ * a separate ns_common operation seems unnecessary for now.
+ * Unshare is doing the same thing. If we'll end up needing to do
+ * more in a given namespace or a helper here is ultimately not
+ * exported anymore a simple commit handler for each namespace
+ * should be added to ns_common.
+ */
+static void commit_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+ struct task_struct *me = current;
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ /* transfer ownership */
+ commit_creds(nsset_cred(nsset));
+ nsset->cred = NULL;
+ }
+#endif
+
+ /* We only need to commit if we have used a temporary fs_struct. */
+ if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
+ set_fs_root(me->fs, &nsset->fs->root);
+ set_fs_pwd(me->fs, &nsset->fs->pwd);
}
- switch_task_namespaces(tsk, new_nsproxy);
- perf_event_namespaces(tsk);
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ exit_sem(me);
+#endif
+
+ /* transfer ownership */
+ switch_task_namespaces(me, nsset->nsproxy);
+ nsset->nsproxy = NULL;
+}
+
+SYSCALL_DEFINE2(setns, int, fd, int, flags)
+{
+ struct file *file;
+ struct ns_common *ns = NULL;
+ struct nsset nsset = {};
+ int err = 0;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (proc_ns_file(file)) {
+ ns = get_proc_ns(file_inode(file));
+ if (flags && (ns->ops->type != flags))
+ err = -EINVAL;
+ flags = ns->ops->type;
+ } else if (!IS_ERR(pidfd_pid(file))) {
+ err = check_setns_flags(flags);
+ } else {
+ err = -EBADF;
+ }
+ if (err)
+ goto out;
+
+ err = prepare_nsset(flags, &nsset);
+ if (err)
+ goto out;
+
+ if (proc_ns_file(file))
+ err = validate_ns(&nsset, ns);
+ else
+ err = validate_nsset(&nsset, file->private_data);
+ if (!err) {
+ commit_nsset(&nsset);
+ perf_event_namespaces(current);
+ }
+ put_nsset(&nsset);
out:
fput(file);
return err;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..11db2bdbb41e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 53bce347cd50..5d9fc22d836a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int timens_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
int err;
@@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
return -EUSERS;
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
timens_set_vvar_page(current, ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8eadadc478f9..87804e0371fe 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns)
put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
@@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- cred = prepare_creds();
+ cred = nsset_cred(nsset);
if (!cred)
- return -ENOMEM;
+ return -EINVAL;
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
- return commit_creds(cred);
+ return 0;
}
struct ns_common *ns_get_owner(struct ns_common *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f0e491193009..e488d0e2ab45 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns)
put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
get_uts_ns(ns);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 190ca66a383b..dcd61aca343e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1353,12 +1353,13 @@ static void netns_put(struct ns_common *ns)
put_net(to_net_ns(ns));
}
-static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 2d4db5afb142..973198a3ec3d 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -5,3 +5,4 @@ pidfd_test
pidfd_wait
pidfd_fdinfo_test
pidfd_getfd_test
+pidfd_setns_test
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index 75a545861375..f4a2f28f926b 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g -I../../../../usr/include/ -pthread
-TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
+ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
include ../lib.mk
diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config
new file mode 100644
index 000000000000..bb11de90c0c9
--- /dev/null
+++ b/tools/testing/selftests/pidfd/config
@@ -0,0 +1,6 @@
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
new file mode 100644
index 000000000000..133ec5b6cda8
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <linux/kcmp.h>
+
+#include "pidfd.h"
+#include "../clone3/clone3_selftests.h"
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+enum {
+ PIDFD_NS_USER,
+ PIDFD_NS_MNT,
+ PIDFD_NS_PID,
+ PIDFD_NS_UTS,
+ PIDFD_NS_IPC,
+ PIDFD_NS_NET,
+ PIDFD_NS_CGROUP,
+ PIDFD_NS_PIDCLD,
+ PIDFD_NS_MAX
+};
+
+const struct ns_info {
+ const char *name;
+ int flag;
+} ns_info[] = {
+ [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, },
+ [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, },
+ [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, },
+ [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, },
+ [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, },
+ [PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
+ [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
+ [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
+};
+
+FIXTURE(current_nsset)
+{
+ pid_t pid;
+ int pidfd;
+ int nsfds[PIDFD_NS_MAX];
+
+ pid_t child_pid_exited;
+ int child_pidfd_exited;
+
+ pid_t child_pid1;
+ int child_pidfd1;
+ int child_nsfds1[PIDFD_NS_MAX];
+
+ pid_t child_pid2;
+ int child_pidfd2;
+ int child_nsfds2[PIDFD_NS_MAX];
+};
+
+static int sys_waitid(int which, pid_t pid, int options)
+{
+ return syscall(__NR_waitid, which, pid, NULL, options, NULL);
+}
+
+pid_t create_child(int *pidfd, unsigned flags)
+{
+ struct clone_args args = {
+ .flags = CLONE_PIDFD | flags,
+ .exit_signal = SIGCHLD,
+ .pidfd = ptr_to_u64(pidfd),
+ };
+
+ return sys_clone3(&args, sizeof(struct clone_args));
+}
+
+FIXTURE_SETUP(current_nsset)
+{
+ int i, proc_fd, ret;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ self->nsfds[i] = -EBADF;
+ self->child_nsfds1[i] = -EBADF;
+ self->child_nsfds2[i] = -EBADF;
+ }
+
+ proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
+ ASSERT_GE(proc_fd, 0) {
+ TH_LOG("%m - Failed to open /proc/self/ns");
+ }
+
+ self->pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
+ if (self->nsfds[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->name, self->pid);
+ }
+ }
+ }
+
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ EXPECT_GT(self->pidfd, 0) {
+ TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ }
+
+ /* Create task that exits right away. */
+ self->child_pid_exited = create_child(&self->child_pidfd_exited,
+ CLONE_NEWUSER | CLONE_NEWNET);
+ EXPECT_GT(self->child_pid_exited, 0);
+
+ if (self->child_pid_exited == 0)
+ _exit(EXIT_SUCCESS);
+
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ EXPECT_GE(self->pidfd, 0) {
+ TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ }
+
+ /* Create tasks that will be stopped. */
+ self->child_pid1 = create_child(&self->child_pidfd1,
+ CLONE_NEWUSER | CLONE_NEWNS |
+ CLONE_NEWCGROUP | CLONE_NEWIPC |
+ CLONE_NEWUTS | CLONE_NEWPID |
+ CLONE_NEWNET);
+ EXPECT_GE(self->child_pid1, 0);
+
+ if (self->child_pid1 == 0) {
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ self->child_pid2 = create_child(&self->child_pidfd2,
+ CLONE_NEWUSER | CLONE_NEWNS |
+ CLONE_NEWCGROUP | CLONE_NEWIPC |
+ CLONE_NEWUTS | CLONE_NEWPID |
+ CLONE_NEWNET);
+ EXPECT_GE(self->child_pid2, 0);
+
+ if (self->child_pid2 == 0) {
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ char p[100];
+
+ const struct ns_info *info = &ns_info[i];
+
+ self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
+ if (self->nsfds[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->name, self->pid);
+ }
+ }
+
+ ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+ self->child_pid1, info->name);
+ EXPECT_GT(ret, 0);
+ EXPECT_LT(ret, sizeof(p));
+
+ self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
+ if (self->child_nsfds1[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->name, self->child_pid1);
+ }
+ }
+
+ ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+ self->child_pid2, info->name);
+ EXPECT_GT(ret, 0);
+ EXPECT_LT(ret, sizeof(p));
+
+ self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
+ if (self->child_nsfds2[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->name, self->child_pid1);
+ }
+ }
+ }
+
+ close(proc_fd);
+}
+
+FIXTURE_TEARDOWN(current_nsset)
+{
+ int i;
+
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
+ SIGKILL, NULL, 0), 0);
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
+ SIGKILL, NULL, 0), 0);
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ if (self->nsfds[i] >= 0)
+ close(self->nsfds[i]);
+ if (self->child_nsfds1[i] >= 0)
+ close(self->child_nsfds1[i]);
+ if (self->child_nsfds2[i] >= 0)
+ close(self->child_nsfds2[i]);
+ }
+
+ if (self->child_pidfd1 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd1));
+ if (self->child_pidfd2 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd2));
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+}
+
+static int preserve_ns(const int pid, const char *ns)
+{
+ int ret;
+ char path[50];
+
+ ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns);
+ if (ret < 0 || (size_t)ret >= sizeof(path))
+ return -EIO;
+
+ return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
+{
+ int ns_fd2 = -EBADF;
+ int ret = -1;
+ struct stat ns_st1, ns_st2;
+
+ ret = fstat(ns_fd1, &ns_st1);
+ if (ret < 0)
+ return -1;
+
+ ns_fd2 = preserve_ns(pid2, ns);
+ if (ns_fd2 < 0)
+ return -1;
+
+ ret = fstat(ns_fd2, &ns_st2);
+ close(ns_fd2);
+ if (ret < 0)
+ return -1;
+
+ /* processes are in the same namespace */
+ if ((ns_st1.st_dev == ns_st2.st_dev) &&
+ (ns_st1.st_ino == ns_st2.st_ino))
+ return 1;
+
+ /* processes are in different namespaces */
+ return 0;
+}
+
+/* Test that we can't pass garbage to the kernel. */
+TEST_F(current_nsset, invalid_flags)
+{
+ ASSERT_NE(setns(self->pidfd, 0), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, -1), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+/* Test that we can't attach to a task that has already exited. */
+TEST_F(current_nsset, pidfd_exited_child)
+{
+ int i;
+ pid_t pid;
+
+ ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
+ 0);
+ EXPECT_EQ(errno, ESRCH);
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ /* Verify that we haven't changed any namespaces. */
+ if (self->nsfds[i] >= 0)
+ ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
+ }
+}
+
+TEST_F(current_nsset, pidfd_incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ if (info->flag) {
+ ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd1);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1, self->child_pidfd1);
+ }
+}
+
+TEST_F(current_nsset, nsfd_incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ if (info->flag) {
+ ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_nsfds1[i]);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_nsfds1[i]);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1, self->child_nsfds1[i]);
+ }
+}
+
+TEST_F(current_nsset, pidfd_one_shot_setns)
+{
+ unsigned flags = 0;
+ int i;
+ pid_t pid;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ flags |= info->flag;
+ TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+ info->name, self->child_pid1);
+ }
+
+ ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+ TH_LOG("%m - Failed to setns to namespaces of %d",
+ self->child_pid1);
+ }
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ /* Verify that we have changed to the correct namespaces. */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d",
+ info->name, self->child_pid1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d",
+ info->name, self->child_pid1);
+ }
+}
+
+TEST_F(current_nsset, no_foul_play)
+{
+ unsigned flags = 0;
+ int i;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ flags |= info->flag;
+ if (info->flag) /* No use logging pid_for_children. */
+ TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+ info->name, self->child_pid1);
+ }
+
+ ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+ TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d",
+ self->child_pid1, self->child_pidfd1);
+ }
+
+ /*
+ * Can't setns to a user namespace outside of our hierarchy since we
+ * don't have caps in there and didn't create it. That means that under
+ * no circumstances should we be able to setns to any of the other
+ * ones since they aren't owned by our user namespace.
+ */
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_nsfds2[i] < 0 || !info->flag)
+ continue;
+
+ ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
+ TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid2,
+ self->child_pidfd2);
+ }
+ TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid2,
+ self->child_pidfd2);
+
+ ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
+ TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid2,
+ self->child_nsfds2[i]);
+ }
+ TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid2,
+ self->child_nsfds2[i]);
+ }
+}
+
+TEST_HARNESS_MAIN