From 5905429ad85657c28d93ec3d826ddeea1f44c3ce Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Wed, 16 Aug 2017 13:00:58 -0700
Subject: fork: Provide usercopy whitelisting for task_struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While the blocked and saved_sigmask fields of task_struct are copied to
userspace (via sigmask_to_save() and setup_rt_frame()), they are always
copied with a static length (i.e. sizeof(sigset_t)).

The only portion of task_struct that is potentially dynamically sized and
may be copied to userspace is in the architecture-specific thread_struct
at the end of task_struct.

cache object allocation:
    kernel/fork.c:
        alloc_task_struct_node(...):
            return kmem_cache_alloc_node(task_struct_cachep, ...);

        dup_task_struct(...):
            ...
            tsk = alloc_task_struct_node(node);

        copy_process(...):
            ...
            dup_task_struct(...)

        _do_fork(...):
            ...
            copy_process(...)

example usage trace:

    arch/x86/kernel/fpu/signal.c:
        __fpu__restore_sig(...):
            ...
            struct task_struct *tsk = current;
            struct fpu *fpu = &tsk->thread.fpu;
            ...
            __copy_from_user(&fpu->state.xsave, ..., state_size);

        fpu__restore_sig(...):
            ...
            return __fpu__restore_sig(...);

    arch/x86/kernel/signal.c:
        restore_sigcontext(...):
            ...
            fpu__restore_sig(...)

This introduces arch_thread_struct_whitelist() to let an architecture
declare specifically where the whitelist should be within thread_struct.
If undefined, the entire thread_struct field is left whitelisted.

Cc: Andrew Morton
Cc: Nicholas Piggin
Cc: Laura Abbott
Cc: "Mickaël Salaün"
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Kees Cook
Acked-by: Rik van Riel
---
 include/linux/sched/task.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 05b8650f06f5..5be31eb7b266 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -104,6 +104,20 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
+/*
+ * If an architecture has not declared a thread_struct whitelist we
+ * must assume something there may need to be copied to userspace.
+ */
+static inline void arch_thread_struct_whitelist(unsigned long *offset,
+						unsigned long *size)
+{
+	*offset = 0;
+	/* Handle dynamically sized thread_struct. */
+	*size = arch_task_struct_size - offsetof(struct task_struct, thread);
+}
+#endif
+
 #ifdef CONFIG_VMAP_STACK
 static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 {
-- cgit v1.2.3-59-g8ed1b


From f8ec66014ffd95a783b1f9f3b62d7cadb96b78d5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Thu, 18 Jan 2018 14:54:54 -0600
Subject: signal: Add send_sig_fault and force_sig_fault

The vast majority of signals sent from architecture specific code are
simple faults. Encapsulate this reality with two helper functions so
that the nit-picky implementation of preparing a siginfo does not need
to be repeated many times on each architecture.

As only some architectures support the trapno field, make the trapno
argument only present on those architectures. Similarly, as ia64 has
three fields (imm, flags, and isr) that are specific to it, have those
arguments always present on ia64 and nowhere else. This ensures the
architecture specific code always remembers which fields it needs to
pass into the siginfo structure.

Signed-off-by: "Eric W. Biederman"
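For illustration only (not part of this patch): a minimal sketch of how an
architecture fault handler might call the new helper. The handler name and
the SIGSEGV/SEGV_MAPERR choice are hypothetical; on an architecture without
__ARCH_SI_TRAPNO, and other than ia64, the argument list reduces to the
four shown.

	/* Hypothetical arch fault handler reporting a bad user access. */
	static void example_do_page_fault(struct pt_regs *regs,
					  unsigned long addr)
	{
		/* The helper clears siginfo and fills in
		 * si_signo/si_errno/si_code/si_addr consistently.
		 */
		force_sig_fault(SIGSEGV, SEGV_MAPERR,
				(void __user *)addr, current);
	}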
Biederman" --- include/linux/sched/signal.h | 20 +++++++++++++++++++ kernel/signal.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/linux/sched') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0aa4548fb492..375f31eb3b6b 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -285,6 +285,26 @@ static inline void kernel_signal_stop(void) schedule(); } +#ifdef __ARCH_SI_TRAPNO +# define ___ARCH_SI_TRAPNO(_a1) , _a1 +#else +# define ___ARCH_SI_TRAPNO(_a1) +#endif +#ifdef __ia64__ +# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 +#else +# define ___ARCH_SI_IA64(_a1, _a2, _a3) +#endif + +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); + extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); extern int force_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index f14492ff976f..15ec7b3cbe69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1491,6 +1491,53 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return send_sig_info(info.si_signo, &info, t); +} + + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; -- cgit v1.2.3-59-g8ed1b From 382467358ac9675b1b6814400a9a9e36dc7da14f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Jan 2018 18:54:31 -0600 Subject: signal: Helpers for faults with specialized siginfo layouts The helpers added are: send_sig_mceerr force_sig_mceerr force_sig_bnderr force_sig_pkuerr Filling out siginfo properly can ge tricky. Especially for these specialized cases where the temptation is to share code with other cases which use a different subset of siginfo fields. Unfortunately that code sharing frequently results in bugs with the wrong siginfo fields filled in, and makes it harder to verify that the siginfo structure was properly initialized. Provide these helpers instead that get all of the details right, and guarantee that siginfo is properly initialized. 
---
 include/linux/sched/signal.h |  6 +++++
 kernel/signal.c              | 61 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 375f31eb3b6b..944fe6356f4a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -305,6 +305,12 @@ int send_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 	, struct task_struct *t);
 
+int force_sig_mceerr(int code, void __user *, short, struct task_struct *);
+int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
+
+int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
+int force_sig_pkuerr(void __user *addr, u32 pkey);
+
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
diff --git a/kernel/signal.c b/kernel/signal.c
index 15ec7b3cbe69..4f6300ef8062 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1537,6 +1537,67 @@ int send_sig_fault(int sig, int code, void __user *addr
 	return send_sig_info(info.si_signo, &info, t);
 }
 
+#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
+int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+	struct siginfo info;
+
+	WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+	clear_siginfo(&info);
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = code;
+	info.si_addr = addr;
+	info.si_addr_lsb = lsb;
+	return force_sig_info(info.si_signo, &info, t);
+}
+
+int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+	struct siginfo info;
+
+	WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+	clear_siginfo(&info);
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = code;
+	info.si_addr = addr;
+	info.si_addr_lsb = lsb;
+	return send_sig_info(info.si_signo, &info, t);
+}
+EXPORT_SYMBOL(send_sig_mceerr);
+#endif
+
+#ifdef SEGV_BNDERR
+int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
+{
+	struct siginfo info;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_BNDERR;
+	info.si_addr = addr;
+	info.si_lower = lower;
+	info.si_upper = upper;
+	return force_sig_info(info.si_signo, &info, current);
+}
+#endif
+
+#ifdef SEGV_PKUERR
+int force_sig_pkuerr(void __user *addr, u32 pkey)
+{
+	struct siginfo info;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_PKUERR;
+	info.si_addr = addr;
+	info.si_pkey = pkey;
+	return force_sig_info(info.si_signo, &info, current);
+}
+#endif
 
 int kill_pgrp(struct pid *pid, int sig, int priv)
 {
 	int ret;
-- cgit v1.2.3-59-g8ed1b


From f71dd7dc2dc989dc712b246a74d243e4b2c5f8a7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Mon, 22 Jan 2018 14:37:25 -0600
Subject: signal/ptrace: Add force_sig_ptrace_errno_trap and use it where needed

There are so many places that build struct siginfo by hand that at
least one of them is bound to get it wrong. A handful of cases in the
kernel arguably did just that when using the errno field of siginfo to
pass non-errno values to userspace.

The usage is limited to a single si_code, so it at least does not mess
up anything else. Encapsulate this questionable pattern in a helper
function so that the userspace ABI is preserved.

Update all of the places that use this pattern to use the new helper
function.

Signed-off-by: "Eric W. Biederman"
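Purely illustrative, not part of the patch: the shape of a call site after
conversion. The handler name and slot numbering are hypothetical; the point
is that the breakpoint index rides in si_errno, as the existing ptrace ABI
expects.

	/* Hypothetical debug-trap handler reporting which slot fired. */
	static void example_hbp_triggered(int slot, unsigned long trigger)
	{
		/* One call instead of hand-filling a siginfo. */
		force_sig_ptrace_errno_trap(slot, (void __user *)trigger);
	}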
---
 arch/arm/kernel/ptrace.c      |  8 +-------
 arch/arm64/kernel/ptrace.c    |  6 ++++--
 arch/powerpc/kernel/process.c |  9 ++-------
 arch/xtensa/kernel/ptrace.c   |  8 +-------
 include/linux/sched/signal.h  |  2 ++
 kernel/signal.c               | 15 +++++++++++++++
 6 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 58e3771e4c5b..7724b0f661b3 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -390,7 +390,6 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 	struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp);
 	long num;
 	int i;
-	siginfo_t info;
 
 	for (i = 0; i < ARM_MAX_HBP_SLOTS; ++i)
 		if (current->thread.debug.hbp[i] == bp)
@@ -398,12 +397,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 
 	num = (i == ARM_MAX_HBP_SLOTS) ? 0 : ptrace_hbp_idx_to_num(i);
 
-	info.si_signo = SIGTRAP;
-	info.si_errno = (int)num;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)(bkpt->trigger);
-
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_ptrace_errno_trap((int)num, (void __user *)(bkpt->trigger));
 }
 
 /*
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 95daa1478a7c..6618036ae6d4 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -190,21 +190,23 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 
 #ifdef CONFIG_COMPAT
 	if (is_compat_task()) {
+		int si_errno = 0;
 		int i;
 
 		for (i = 0; i < ARM_MAX_BRP; ++i) {
 			if (current->thread.debug.hbp_break[i] == bp) {
-				info.si_errno = (i << 1) + 1;
+				si_errno = (i << 1) + 1;
 				break;
 			}
 		}
 
 		for (i = 0; i < ARM_MAX_WRP; ++i) {
 			if (current->thread.debug.hbp_watch[i] == bp) {
-				info.si_errno = -((i << 1) + 1);
+				si_errno = -((i << 1) + 1);
 				break;
 			}
 		}
+		force_sig_ptrace_errno_trap(si_errno, (void __user *)bkpt->trigger);
 	}
 #endif
 	force_sig_info(SIGTRAP, &info, current);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index bfb48cf56bc3..4208cbe2fb7f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -603,19 +603,14 @@ EXPORT_SYMBOL(flush_all_to_thread);
 void do_send_trap(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code, int breakpt)
 {
-	siginfo_t info;
-
 	current->thread.trap_nr = TRAP_HWBKPT;
 	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
 			11, SIGSEGV) == NOTIFY_STOP)
 		return;
 
 	/* Deliver the signal to userspace */
-	info.si_signo = SIGTRAP;
-	info.si_errno = breakpt;	/* breakpoint or watchpoint id */
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_ptrace_errno_trap(breakpt, /* breakpoint or watchpoint id */
+				    (void __user *)address);
 }
 #else	/* !CONFIG_PPC_ADV_DEBUG_REGS */
 void do_break (struct pt_regs *regs, unsigned long address,
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index e2461968efb2..c0845cb1cbb9 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -278,7 +278,6 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 				struct pt_regs *regs)
 {
 	int i;
-	siginfo_t info;
 	struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp);
 
 	if (bp->attr.bp_type & HW_BREAKPOINT_X) {
@@ -293,12 +292,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 		i = (i << 1) | 1;
 	}
 
-	info.si_signo = SIGTRAP;
-	info.si_errno = i;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)bkpt->address;
-
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_ptrace_errno_trap(i, (void __user *)bkpt->address);
 }
 
 static struct perf_event *ptrace_hbp_create(struct task_struct *tsk, int type)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 944fe6356f4a..23b4f9cb82db 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -311,6 +311,8 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
 int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
 int force_sig_pkuerr(void __user *addr, u32 pkey);
 
+int force_sig_ptrace_errno_trap(int errno, void __user *addr);
+
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4f6300ef8062..e549174c0831 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1599,6 +1599,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
 }
 #endif
 
+/* For the crazy architectures that include trap information in
+ * the errno field, instead of an actual errno value.
+ */
+int force_sig_ptrace_errno_trap(int errno, void __user *addr)
+{
+	struct siginfo info;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGTRAP;
+	info.si_errno = errno;
+	info.si_code = TRAP_HWBKPT;
+	info.si_addr = addr;
+	return force_sig_info(info.si_signo, &info, current);
+}
+
 int kill_pgrp(struct pid *pid, int sig, int priv)
 {
 	int ret;
-- cgit v1.2.3-59-g8ed1b


From d70f2a14b72a4bc094cf3a92e4794644a7adc590 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 31 Jan 2018 16:15:51 -0800
Subject: include/linux/sched/mm.h: uninline mmdrop_async(), etc

mmdrop_async() is only used in fork.c. Move that and its support
functions into fork.c and uninline it all. Quite a lot of code gets
moved around to avoid forward declarations.

Cc: Ingo Molnar
Cc: Michal Hocko
Cc: Peter Zijlstra
Cc: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
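For illustration only (not part of this patch): the caller-visible contract
is unchanged by the uninlining. mmgrab() pins the mm_struct itself (the
mm_count reference, not the mm_users address-space reference), and the now
out-of-line mmdrop() releases it. A hypothetical user, with the locking
needed to safely read tsk->mm deliberately elided:

	/* Hypothetical: keep an mm_struct alive while peeking at it. */
	static void example_inspect_mm(struct task_struct *tsk)
	{
		struct mm_struct *mm = tsk->mm;	/* locking elided */

		if (!mm)
			return;
		mmgrab(mm);	/* pin the struct; page tables may still go */
		/* ... read fields of *mm ... */
		mmdrop(mm);	/* last reference frees the mm_struct */
	}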
---
 include/linux/sched/mm.h |  24 +--
 kernel/fork.c            | 448 +++++++++++++++++++++++++----------------------
 2 files changed, 238 insertions(+), 234 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3d49b91b674d..bd422561a75e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -11,7 +11,7 @@
 /*
  * Routines for handling mm_structs
  */
-extern struct mm_struct * mm_alloc(void);
+extern struct mm_struct *mm_alloc(void);
 
 /**
  * mmgrab() - Pin a &struct mm_struct.
@@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm)
 	atomic_inc(&mm->mm_count);
 }
 
-/* mmdrop drops the mm and the page tables */
-extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
-		__mmdrop(mm);
-}
-
-static inline void mmdrop_async_fn(struct work_struct *work)
-{
-	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-	__mmdrop(mm);
-}
-
-static inline void mmdrop_async(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
-		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
-		schedule_work(&mm->async_put_work);
-	}
-}
+extern void mmdrop(struct mm_struct *mm);
 
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f..5e6cf0dd031c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+					struct mm_struct *oldmm)
+{
+	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	LIST_HEAD(uf);
+
+	uprobe_start_dup_mmap();
+	if (down_write_killable(&oldmm->mmap_sem)) {
+		retval = -EINTR;
+		goto fail_uprobe_end;
+	}
+	flush_cache_dup_mm(oldmm);
+	uprobe_dup_mmap(oldmm, mm);
+	/*
+	 * Not linked in yet - no deadlock potential:
+	 */
+	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	/* No ordering required: file already has been exposed. */
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+	mm->total_vm = oldmm->total_vm;
+	mm->data_vm = oldmm->data_vm;
+	mm->exec_vm = oldmm->exec_vm;
+	mm->stack_vm = oldmm->stack_vm;
+
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
+
+	prev = NULL;
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			unsigned long len = vma_pages(mpnt);
+
+			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		INIT_LIST_HEAD(&tmp->anon_vma_chain);
+		retval = vma_dup_policy(mpnt, tmp);
+		if (retval)
+			goto fail_nomem_policy;
+		tmp->vm_mm = mm;
+		retval = dup_userfaultfd(tmp, &uf);
+		if (retval)
+			goto fail_nomem_anon_vma_fork;
+		if (tmp->vm_flags & VM_WIPEONFORK) {
+			/* VM_WIPEONFORK gets a clean slate in the child. */
+			tmp->anon_vma = NULL;
+			if (anon_vma_prepare(tmp))
+				goto fail_nomem_anon_vma_fork;
+		} else if (anon_vma_fork(tmp, mpnt))
+			goto fail_nomem_anon_vma_fork;
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+		tmp->vm_next = tmp->vm_prev = NULL;
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file_inode(file);
+			struct address_space *mapping = file->f_mapping;
+
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+			i_mmap_lock_write(mapping);
+			if (tmp->vm_flags & VM_SHARED)
+				atomic_inc(&mapping->i_mmap_writable);
+			flush_dcache_mmap_lock(mapping);
+			/* insert tmp into the share list, just after mpnt */
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
+			flush_dcache_mmap_unlock(mapping);
+			i_mmap_unlock_write(mapping);
+		}
+
+		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
+		 * Link in the new vma and copy the page table entries.
+		 */
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+		tmp->vm_prev = prev;
+		prev = tmp;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		if (!(tmp->vm_flags & VM_WIPEONFORK))
+			retval = copy_page_range(mm, oldmm, mpnt);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
+	retval = 0;
+out:
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
+	up_write(&oldmm->mmap_sem);
+	dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+	uprobe_end_dup_mmap();
+	return retval;
+fail_nomem_anon_vma_fork:
+	mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+	pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	down_write(&oldmm->mmap_sem);
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	up_write(&oldmm->mmap_sem);
+	return 0;
+}
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+		if (unlikely(x))
+			printk(KERN_ALERT "BUG: Bad rss-counter state "
+			       "mm:%p idx:%d val:%ld\n", mm, i, x);
+	}
+
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	hmm_mm_destroy(mm);
+	mmu_notifier_mm_destroy(mm);
+	check_mm(mm);
+	put_user_ns(mm->user_ns);
+	free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm;
+
+	mm = container_of(work, struct mm_struct, async_put_work);
+	__mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
@@ -594,181 +830,8 @@ free_tsk:
 	return NULL;
 }
 
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-					struct mm_struct *oldmm)
-{
-	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-	struct rb_node **rb_link, *rb_parent;
-	int retval;
-	unsigned long charge;
-	LIST_HEAD(uf);
-
-	uprobe_start_dup_mmap();
-	if (down_write_killable(&oldmm->mmap_sem)) {
-		retval = -EINTR;
-		goto fail_uprobe_end;
-	}
-	flush_cache_dup_mm(oldmm);
-	uprobe_dup_mmap(oldmm, mm);
-	/*
-	 * Not linked in yet - no deadlock potential:
-	 */
-	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
-	/* No ordering required: file already has been exposed. */
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
-	mm->total_vm = oldmm->total_vm;
-	mm->data_vm = oldmm->data_vm;
-	mm->exec_vm = oldmm->exec_vm;
-	mm->stack_vm = oldmm->stack_vm;
-
-	rb_link = &mm->mm_rb.rb_node;
-	rb_parent = NULL;
-	pprev = &mm->mmap;
-	retval = ksm_fork(mm, oldmm);
-	if (retval)
-		goto out;
-	retval = khugepaged_fork(mm, oldmm);
-	if (retval)
-		goto out;
-
-	prev = NULL;
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-		struct file *file;
-
-		if (mpnt->vm_flags & VM_DONTCOPY) {
-			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-			continue;
-		}
-		charge = 0;
-		if (mpnt->vm_flags & VM_ACCOUNT) {
-			unsigned long len = vma_pages(mpnt);
-
-			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-				goto fail_nomem;
-			charge = len;
-		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-		if (!tmp)
-			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
-		retval = vma_dup_policy(mpnt, tmp);
-		if (retval)
-			goto fail_nomem_policy;
-		tmp->vm_mm = mm;
-		retval = dup_userfaultfd(tmp, &uf);
-		if (retval)
-			goto fail_nomem_anon_vma_fork;
-		if (tmp->vm_flags & VM_WIPEONFORK) {
-			/* VM_WIPEONFORK gets a clean slate in the child. */
-			tmp->anon_vma = NULL;
-			if (anon_vma_prepare(tmp))
-				goto fail_nomem_anon_vma_fork;
-		} else if (anon_vma_fork(tmp, mpnt))
-			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-		tmp->vm_next = tmp->vm_prev = NULL;
-		file = tmp->vm_file;
-		if (file) {
-			struct inode *inode = file_inode(file);
-			struct address_space *mapping = file->f_mapping;
-
-			get_file(file);
-			if (tmp->vm_flags & VM_DENYWRITE)
-				atomic_dec(&inode->i_writecount);
-			i_mmap_lock_write(mapping);
-			if (tmp->vm_flags & VM_SHARED)
-				atomic_inc(&mapping->i_mmap_writable);
-			flush_dcache_mmap_lock(mapping);
-			/* insert tmp into the share list, just after mpnt */
-			vma_interval_tree_insert_after(tmp, mpnt,
-					&mapping->i_mmap);
-			flush_dcache_mmap_unlock(mapping);
-			i_mmap_unlock_write(mapping);
-		}
-
-		/*
-		 * Clear hugetlb-related page reserves for children. This only
-		 * affects MAP_PRIVATE mappings. Faults generated by the child
-		 * are not guaranteed to succeed, even if read-only
-		 */
-		if (is_vm_hugetlb_page(tmp))
-			reset_vma_resv_huge_pages(tmp);
-
-		/*
-		 * Link in the new vma and copy the page table entries.
-		 */
-		*pprev = tmp;
-		pprev = &tmp->vm_next;
-		tmp->vm_prev = prev;
-		prev = tmp;
-
-		__vma_link_rb(mm, tmp, rb_link, rb_parent);
-		rb_link = &tmp->vm_rb.rb_right;
-		rb_parent = &tmp->vm_rb;
-
-		mm->map_count++;
-		if (!(tmp->vm_flags & VM_WIPEONFORK))
-			retval = copy_page_range(mm, oldmm, mpnt);
-
-		if (tmp->vm_ops && tmp->vm_ops->open)
-			tmp->vm_ops->open(tmp);
-
-		if (retval)
-			goto out;
-	}
-	/* a new mm has just been created */
-	retval = arch_dup_mmap(oldmm, mm);
-out:
-	up_write(&mm->mmap_sem);
-	flush_tlb_mm(oldmm);
-	up_write(&oldmm->mmap_sem);
-	dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
-	uprobe_end_dup_mmap();
-	return retval;
-fail_nomem_anon_vma_fork:
-	mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
-	retval = -ENOMEM;
-	vm_unacct_memory(charge);
-	goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
-	mm->pgd = pgd_alloc(mm);
-	if (unlikely(!mm->pgd))
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
-	pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-	down_write(&oldmm->mmap_sem);
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-	up_write(&oldmm->mmap_sem);
-	return 0;
-}
-#define mm_alloc_pgd(mm)	(0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
-
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 
 static int __init coredump_filter_setup(char *s)
@@ -858,27 +921,6 @@ fail_nopgd:
 	return NULL;
 }
 
-static void check_mm(struct mm_struct *mm)
-{
-	int i;
-
-	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = atomic_long_read(&mm->rss_stat.count[i]);
-
-		if (unlikely(x))
-			printk(KERN_ALERT "BUG: Bad rss-counter state "
-			       "mm:%p idx:%d val:%ld\n", mm, i, x);
-	}
-
-	if (mm_pgtables_bytes(mm))
-		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-				mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
 /*
  * Allocate and initialize an mm_struct.
 */
@@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void)
 	return mm_init(mm, current, current_user_ns());
 }
 
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
-	BUG_ON(mm == &init_mm);
-	mm_free_pgd(mm);
-	destroy_context(mm);
-	hmm_mm_destroy(mm);
-	mmu_notifier_mm_destroy(mm);
-	check_mm(mm);
-	put_user_ns(mm->user_ns);
-	free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
 static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));
-- cgit v1.2.3-59-g8ed1b
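As an aside (illustrative only, not from this series): the mmdrop_async()
helper moved above is an instance of a generic pattern. When the last
reference may be dropped from a context that must not block, the real
teardown is punted to a workqueue. A sketch with hypothetical names:

	/* Hypothetical object using deferred release via a workqueue. */
	struct example_obj {
		atomic_t refcount;
		struct work_struct free_work;
	};

	static void example_obj_free_fn(struct work_struct *work)
	{
		struct example_obj *o =
			container_of(work, struct example_obj, free_work);

		kfree(o);	/* the potentially blocking teardown */
	}

	static void example_obj_put_async(struct example_obj *o)
	{
		if (atomic_dec_and_test(&o->refcount)) {
			INIT_WORK(&o->free_work, example_obj_free_fn);
			schedule_work(&o->free_work);
		}
	}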