aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/pid.h2
-rw-r--r--include/uapi/linux/sched.h1
-rw-r--r--kernel/fork.c107
3 files changed, 106 insertions, 4 deletions
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b6f4ba16065a..3c8ef5a199ca 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -66,6 +66,8 @@ struct pid
extern struct pid init_struct_pid;
+extern const struct file_operations pidfd_fops;
+
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 22627f80063e..ed4ee170bee2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -10,6 +10,7 @@
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
+#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..e45f0acaf451 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,6 +11,7 @@
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
+#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
@@ -1662,6 +1664,58 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = file->private_data;
+
+ file->private_data = NULL;
+ put_pid(pid);
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
+ struct pid *pid = f->private_data;
+
+ seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+ seq_putc(m, '\n');
+}
+#endif
+
+const struct file_operations pidfd_fops = {
+ .release = pidfd_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid: struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+ int fd;
+
+ fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+ O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ put_pid(pid);
+
+ return fd;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1674,13 +1728,14 @@ static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
+ int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
- int retval;
+ int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
@@ -1730,6 +1785,31 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD) {
+ int reserved;
+
+ /*
+ * - CLONE_PARENT_SETTID is useless for pidfds and also
+ * parent_tidptr is used to return pidfds.
+ * - CLONE_DETACHED is blocked so that we can potentially
+ * reuse it later for CLONE_PIDFD.
+ * - CLONE_THREAD is blocked until someone really needs it.
+ */
+ if (clone_flags &
+ (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Verify that parent_tidptr is sane so we can potentially
+ * reuse it later.
+ */
+ if (get_user(reserved, parent_tidptr))
+ return ERR_PTR(-EFAULT);
+
+ if (reserved != 0)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -1936,6 +2016,22 @@ static __latent_entropy struct task_struct *copy_process(
}
}
+ /*
+ * This has to happen after we've potentially unshared the file
+ * descriptor table (so that the pidfd doesn't leak into the child
+ * if the fd table isn't shared).
+ */
+ if (clone_flags & CLONE_PIDFD) {
+ retval = pidfd_create(pid);
+ if (retval < 0)
+ goto bad_fork_free_pid;
+
+ pidfd = retval;
+ retval = put_user(pidfd, parent_tidptr);
+ if (retval)
+ goto bad_fork_put_pidfd;
+ }
+
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1996,7 +2092,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
retval = cgroup_can_fork(p);
if (retval)
- goto bad_fork_free_pid;
+ goto bad_fork_put_pidfd;
/*
* From this point on we must avoid any synchronous user-space
@@ -2111,6 +2207,9 @@ bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
+bad_fork_put_pidfd:
+ if (clone_flags & CLONE_PIDFD)
+ ksys_close(pidfd);
bad_fork_free_pid:
cgroup_threadgroup_change_end(current);
if (pid != &init_struct_pid)
@@ -2176,7 +2275,7 @@ static inline void init_idle_pids(struct task_struct *idle)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+ task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
cpu_to_node(cpu));
if (!IS_ERR(task)) {
init_idle_pids(task);
@@ -2223,7 +2322,7 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size,
+ p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
add_latent_entropy();