aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2020-07-21 08:25:22 -0500
committerEric W. Biederman <ebiederm@xmission.com>2020-07-21 08:26:44 -0500
commit7fce69dff8db30cb93aace0bbebda09972027af7 (patch)
treebe0f257b8da82e9e2eb9bdaa2574a48221e7b516
parentMake the user mode driver code a better citizen (diff)
parentexec: Implement kernel_execve (diff)
downloadwireguard-linux-7fce69dff8db30cb93aace0bbebda09972027af7.tar.xz
wireguard-linux-7fce69dff8db30cb93aace0bbebda09972027af7.zip
Implement kernel_execve
This set of changes implements kernel_execve to remove the need for kernel threads to pass in pointers to in-kernel data structures to functions that take __user pointers. Which is part of the greater removal of set_fs work. This set of changes makes do_execve static and so I have updated the comments. This affects the comments in the x86 entry point code and the comments in tomoyo. I believe I have updated them correctly. If not please let me know. I have moved the calls of copy_strings before the call of security_bprm_creds_for_exec. Which might be of interest to the security folks. I can't see that it matters but I have copied the security folks just to be certain. By moving the initialization of the new stack that copy_strings does earlier it becomes possible to copy all of the parameters to exec before anything else is done which makes it possible to have one function kernel_execve that uncondtionally handles copying parameters from kernel space, and another function do_execveat_common which handles copying parameters from userspace. This work was inspired by Christoph Hellwig's similar patchset, which my earlier work to remove the file parameter to do_execveat_common conflicted with. https://lore.kernel.org/linux-fsdevel/20200627072704.2447163-1-hch@lst.de/ I figured that after causing all of that trouble for the set_fs work the least I could do is implement the change myself. The big practical change from Christoph's work is that he did not separate out the copying of parameters from the rest of the work of exec, which did not help the maintainability of the code. Eric W. Biederman (7): exec: Remove unnecessary spaces from binfmts.h exec: Factor out alloc_bprm exec: Move initialization of bprm->filename into alloc_bprm exec: Move bprm_mm_init into alloc_bprm exec: Factor bprm_execve out of do_execve_common exec: Factor bprm_stack_limits out of prepare_arg_pages exec: Implement kernel_execve arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/unwind_frame.c | 2 +- fs/exec.c | 301 ++++++++++++++++++++++++++++------------- include/linux/binfmts.h | 20 ++- init/main.c | 4 +- kernel/umh.c | 6 +- security/tomoyo/common.h | 2 +- security/tomoyo/domain.c | 4 +- security/tomoyo/tomoyo.c | 4 +- 10 files changed, 224 insertions(+), 123 deletions(-) Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Link: https://lkml.kernel.org/r/871rle8bw2.fsf@x220.int.ebiederm.org Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
-rw-r--r--arch/x86/entry/entry_32.S2
-rw-r--r--arch/x86/entry/entry_64.S2
-rw-r--r--arch/x86/kernel/unwind_frame.c2
-rw-r--r--fs/exec.c301
-rw-r--r--include/linux/binfmts.h20
-rw-r--r--init/main.c4
-rw-r--r--kernel/umh.c6
-rw-r--r--security/tomoyo/common.h2
-rw-r--r--security/tomoyo/domain.c4
-rw-r--r--security/tomoyo/tomoyo.c4
10 files changed, 224 insertions, 123 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 024d7d276cd4..8f4e085ee06d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -854,7 +854,7 @@ SYM_CODE_START(ret_from_fork)
CALL_NOSPEC ebx
/*
* A kernel thread is allowed to return here after successfully
- * calling do_execve(). Exit to userspace to complete the execve()
+ * calling kernel_execve(). Exit to userspace to complete the execve()
* syscall.
*/
movl $0, PT_EAX(%esp)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2a00c97e53f..73c7e255256b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -293,7 +293,7 @@ SYM_CODE_START(ret_from_fork)
CALL_NOSPEC rbx
/*
* A kernel thread is allowed to return here after successfully
- * calling do_execve(). Exit to userspace to complete the execve()
+ * calling kernel_execve(). Exit to userspace to complete the execve()
* syscall.
*/
movq $0, RAX(%rsp)
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 722a85f3b2dd..e40b4942157f 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -275,7 +275,7 @@ bool unwind_next_frame(struct unwind_state *state)
* This user_mode() check is slightly broader than a PF_KTHREAD
* check because it also catches the awkward situation where a
* newly forked kthread transitions into a user task by calling
- * do_execve(), which eventually clears PF_KTHREAD.
+ * kernel_execve(), which eventually clears PF_KTHREAD.
*/
if (!user_mode(regs))
goto the_end;
diff --git a/fs/exec.c b/fs/exec.c
index 23dfbb820626..3698252719a3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -448,18 +448,26 @@ static int count(struct user_arg_ptr argv, int max)
return i;
}
-static int prepare_arg_pages(struct linux_binprm *bprm,
- struct user_arg_ptr argv, struct user_arg_ptr envp)
+static int count_strings_kernel(const char *const *argv)
{
- unsigned long limit, ptr_size;
+ int i;
+
+ if (!argv)
+ return 0;
- bprm->argc = count(argv, MAX_ARG_STRINGS);
- if (bprm->argc < 0)
- return bprm->argc;
+ for (i = 0; argv[i]; ++i) {
+ if (i >= MAX_ARG_STRINGS)
+ return -E2BIG;
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
+ cond_resched();
+ }
+ return i;
+}
- bprm->envc = count(envp, MAX_ARG_STRINGS);
- if (bprm->envc < 0)
- return bprm->envc;
+static int bprm_stack_limits(struct linux_binprm *bprm)
+{
+ unsigned long limit, ptr_size;
/*
* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
@@ -633,6 +641,20 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
}
EXPORT_SYMBOL(copy_string_kernel);
+static int copy_strings_kernel(int argc, const char *const *argv,
+ struct linux_binprm *bprm)
+{
+ while (argc-- > 0) {
+ int ret = copy_string_kernel(argv[argc], bprm);
+ if (ret < 0)
+ return ret;
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
+ cond_resched();
+ }
+ return 0;
+}
+
#ifdef CONFIG_MMU
/*
@@ -1543,6 +1565,10 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
static void free_bprm(struct linux_binprm *bprm)
{
+ if (bprm->mm) {
+ acct_arg_size(bprm, 0);
+ mmput(bprm->mm);
+ }
free_arg_pages(bprm);
if (bprm->cred) {
mutex_unlock(&current->signal->cred_guard_mutex);
@@ -1557,9 +1583,43 @@ static void free_bprm(struct linux_binprm *bprm)
/* If a binfmt changed the interp, free it. */
if (bprm->interp != bprm->filename)
kfree(bprm->interp);
+ kfree(bprm->fdpath);
kfree(bprm);
}
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+{
+ struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ int retval = -ENOMEM;
+ if (!bprm)
+ goto out;
+
+ if (fd == AT_FDCWD || filename->name[0] == '/') {
+ bprm->filename = filename->name;
+ } else {
+ if (filename->name[0] == '\0')
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
+ else
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
+ fd, filename->name);
+ if (!bprm->fdpath)
+ goto out_free;
+
+ bprm->filename = bprm->fdpath;
+ }
+ bprm->interp = bprm->filename;
+
+ retval = bprm_mm_init(bprm);
+ if (retval)
+ goto out_free;
+ return bprm;
+
+out_free:
+ free_bprm(bprm);
+out:
+ return ERR_PTR(retval);
+}
+
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
/* If a binfmt changed the interp, free it first. */
@@ -1818,48 +1878,20 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
-static int do_execveat_common(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags)
+static int bprm_execve(struct linux_binprm *bprm,
+ int fd, struct filename *filename, int flags)
{
- char *pathbuf = NULL;
- struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
int retval;
- if (IS_ERR(filename))
- return PTR_ERR(filename);
-
- /*
- * We move the actual failure in case of RLIMIT_NPROC excess from
- * set*uid() to execve() because too many poorly written programs
- * don't check setuid() return code. Here we additionally recheck
- * whether NPROC limit is still exceeded.
- */
- if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
- retval = -EAGAIN;
- goto out_ret;
- }
-
- /* We're below the limit (still or again), so we don't want to make
- * further execve() calls fail. */
- current->flags &= ~PF_NPROC_EXCEEDED;
-
retval = unshare_files(&displaced);
if (retval)
- goto out_ret;
-
- retval = -ENOMEM;
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- goto out_files;
+ return retval;
retval = prepare_bprm_creds(bprm);
if (retval)
- goto out_free;
+ goto out_files;
check_unsafe_exec(bprm);
current->in_execve = 1;
@@ -1872,55 +1904,20 @@ static int do_execveat_common(int fd, struct filename *filename,
sched_exec();
bprm->file = file;
- if (fd == AT_FDCWD || filename->name[0] == '/') {
- bprm->filename = filename->name;
- } else {
- if (filename->name[0] == '\0')
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
- else
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
- fd, filename->name);
- if (!pathbuf) {
- retval = -ENOMEM;
- goto out_unmark;
- }
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. Relies on having exclusive access to
- * current->files (due to unshare_files above).
- */
- if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
- bprm->filename = pathbuf;
- }
- bprm->interp = bprm->filename;
-
- retval = bprm_mm_init(bprm);
- if (retval)
- goto out_unmark;
-
- retval = prepare_arg_pages(bprm, argv, envp);
- if (retval < 0)
- goto out;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. Relies on having exclusive access to
+ * current->files (due to unshare_files above).
+ */
+ if (bprm->fdpath &&
+ close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
goto out;
- retval = copy_string_kernel(bprm->filename, bprm);
- if (retval < 0)
- goto out;
-
- bprm->exec = bprm->p;
- retval = copy_strings(bprm->envc, envp, bprm);
- if (retval < 0)
- goto out;
-
- retval = copy_strings(bprm->argc, argv, bprm);
- if (retval < 0)
- goto out;
-
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
@@ -1931,9 +1928,6 @@ static int do_execveat_common(int fd, struct filename *filename,
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
- free_bprm(bprm);
- kfree(pathbuf);
- putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
@@ -1947,28 +1941,141 @@ out:
*/
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_sigsegv(SIGSEGV);
- if (bprm->mm) {
- acct_arg_size(bprm, 0);
- mmput(bprm->mm);
- }
out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;
-out_free:
- free_bprm(bprm);
- kfree(pathbuf);
-
out_files:
if (displaced)
reset_files_struct(displaced);
+
+ return retval;
+}
+
+static int do_execveat_common(int fd, struct filename *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ int flags)
+{
+ struct linux_binprm *bprm;
+ int retval;
+
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
+ /*
+ * We move the actual failure in case of RLIMIT_NPROC excess from
+ * set*uid() to execve() because too many poorly written programs
+ * don't check setuid() return code. Here we additionally recheck
+ * whether NPROC limit is still exceeded.
+ */
+ if ((current->flags & PF_NPROC_EXCEEDED) &&
+ atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+ retval = -EAGAIN;
+ goto out_ret;
+ }
+
+ /* We're below the limit (still or again), so we don't want to make
+ * further execve() calls fail. */
+ current->flags &= ~PF_NPROC_EXCEEDED;
+
+ bprm = alloc_bprm(fd, filename);
+ if (IS_ERR(bprm)) {
+ retval = PTR_ERR(bprm);
+ goto out_ret;
+ }
+
+ retval = count(argv, MAX_ARG_STRINGS);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = retval;
+
+ retval = count(envp, MAX_ARG_STRINGS);
+ if (retval < 0)
+ goto out_free;
+ bprm->envc = retval;
+
+ retval = bprm_stack_limits(bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = copy_string_kernel(bprm->filename, bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->exec = bprm->p;
+
+ retval = copy_strings(bprm->envc, envp, bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = copy_strings(bprm->argc, argv, bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = bprm_execve(bprm, fd, filename, flags);
+out_free:
+ free_bprm(bprm);
+
+out_ret:
+ putname(filename);
+ return retval;
+}
+
+int kernel_execve(const char *kernel_filename,
+ const char *const *argv, const char *const *envp)
+{
+ struct filename *filename;
+ struct linux_binprm *bprm;
+ int fd = AT_FDCWD;
+ int retval;
+
+ filename = getname_kernel(kernel_filename);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
+ bprm = alloc_bprm(fd, filename);
+ if (IS_ERR(bprm)) {
+ retval = PTR_ERR(bprm);
+ goto out_ret;
+ }
+
+ retval = count_strings_kernel(argv);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = retval;
+
+ retval = count_strings_kernel(envp);
+ if (retval < 0)
+ goto out_free;
+ bprm->envc = retval;
+
+ retval = bprm_stack_limits(bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = copy_string_kernel(bprm->filename, bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->exec = bprm->p;
+
+ retval = copy_strings_kernel(bprm->envc, envp, bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = copy_strings_kernel(bprm->argc, argv, bprm);
+ if (retval < 0)
+ goto out_free;
+
+ retval = bprm_execve(bprm, fd, filename, 0);
+out_free:
+ free_bprm(bprm);
out_ret:
putname(filename);
return retval;
}
-int do_execve(struct filename *filename,
+static int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
@@ -1977,7 +2084,7 @@ int do_execve(struct filename *filename,
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}
-int do_execveat(int fd, struct filename *filename,
+static int do_execveat(int fd, struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp,
int flags)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7c27d7b57871..0571701ab1c5 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -45,17 +45,18 @@ struct linux_binprm {
#ifdef __alpha__
unsigned int taso:1;
#endif
- struct file * executable; /* Executable to pass to the interpreter */
- struct file * interpreter;
- struct file * file;
+ struct file *executable; /* Executable to pass to the interpreter */
+ struct file *interpreter;
+ struct file *file;
struct cred *cred; /* new credentials */
int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
unsigned int per_clear; /* bits to clear in current->personality */
int argc, envc;
- const char * filename; /* Name of binary as seen by procps */
- const char * interp; /* Name of the binary really executed. Most
+ const char *filename; /* Name of binary as seen by procps */
+ const char *interp; /* Name of the binary really executed. Most
of the time same as filename, but could be
different for binfmt_{misc,script} */
+ const char *fdpath; /* generated filename for execveat */
unsigned interp_flags;
int execfd; /* File descriptor of the executable */
unsigned long loader, exec;
@@ -134,12 +135,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
-extern int do_execve(struct filename *,
- const char __user * const __user *,
- const char __user * const __user *);
-extern int do_execveat(int, struct filename *,
- const char __user * const __user *,
- const char __user * const __user *,
- int);
+int kernel_execve(const char *filename,
+ const char *const *argv, const char *const *envp);
#endif /* _LINUX_BINFMTS_H */
diff --git a/init/main.c b/init/main.c
index 0ead83e86b5a..78ccec5c28f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1329,9 +1329,7 @@ static int run_init_process(const char *init_filename)
pr_debug(" with environment:\n");
for (p = envp_init; *p; p++)
pr_debug(" %s\n", *p);
- return do_execve(getname_kernel(init_filename),
- (const char __user *const __user *)argv_init,
- (const char __user *const __user *)envp_init);
+ return kernel_execve(init_filename, argv_init, envp_init);
}
static int try_to_run_init_process(const char *init_filename)
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ca2096298b9..a25433f9cd9a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,9 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ retval = kernel_execve(sub_info->path,
+ (const char *const *)sub_info->argv,
+ (const char *const *)sub_info->envp);
out:
sub_info->retval = retval;
/*
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 050473df5809..85246b9df7ca 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -425,7 +425,7 @@ struct tomoyo_request_info {
struct tomoyo_obj_info *obj;
/*
* For holding parameters specific to execve() request.
- * NULL if not dealing do_execve().
+ * NULL if not dealing execve().
*/
struct tomoyo_execve *ee;
struct tomoyo_domain_info *domain;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 7869d6a9980b..53b3e1f5f227 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -767,7 +767,7 @@ retry:
/*
* Check for domain transition preference if "file execute" matched.
- * If preference is given, make do_execve() fail if domain transition
+ * If preference is given, make execve() fail if domain transition
* has failed, for domain transition preference should be used with
* destination domain defined.
*/
@@ -810,7 +810,7 @@ force_reset_domain:
snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>",
candidate->name);
/*
- * Make do_execve() fail if domain transition across namespaces
+ * Make execve() fail if domain transition across namespaces
* has failed.
*/
reject_on_transition_failure = true;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index f9adddc42ac8..1f3cd432d830 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -93,7 +93,7 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
struct tomoyo_task *s = tomoyo_task(current);
/*
- * Execute permission is checked against pathname passed to do_execve()
+ * Execute permission is checked against pathname passed to execve()
* using current domain.
*/
if (!s->old_domain_info) {
@@ -307,7 +307,7 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
*/
static int tomoyo_file_open(struct file *f)
{
- /* Don't check read permission here if called from do_execve(). */
+ /* Don't check read permission here if called from execve(). */
if (current->in_execve)
return 0;
return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path,