diff options
Diffstat (limited to 'kernel/exit.c')
-rw-r--r-- | kernel/exit.c | 407 |
1 files changed, 268 insertions, 139 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index 0b81b26a872a..35e0a31a0315 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,9 +48,9 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> -#include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> @@ -60,13 +60,16 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> +#include <linux/io_uring.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/uaccess.h> #include <asm/unistd.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> static void __unhash_process(struct task_struct *p, bool group_dead) @@ -94,7 +97,7 @@ static void __exit_signal(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, @@ -103,17 +106,8 @@ static void __exit_signal(struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); - if (group_dead) { + if (group_dead) posix_cpu_timers_exit_group(tsk); - } else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - } #endif if (group_dead) { @@ -125,7 +119,7 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); @@ -177,6 +171,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); @@ -188,22 +184,27 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; + struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); - proc_flush_task(p); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); + thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* @@ -226,6 +227,9 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); + proc_flush_pid(thread_pid); + put_pid(thread_pid); release_thread(p); put_task_struct_rcu_user(p); @@ -234,8 +238,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -243,7 +248,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE @@ -255,9 +260,12 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX @@ -340,6 +348,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } +static void coredump_task_exit(struct task_struct *tsk) +{ + struct core_state *core_state; + + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + spin_lock_irq(&tsk->sighand->siglock); + tsk->flags |= PF_POSTCOREDUMP; + core_state = tsk->signal->core_state; + spin_unlock_irq(&tsk->sighand->siglock); + if (core_state) { + struct core_thread self; + + self.task = current; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + } +} + #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. @@ -423,6 +471,7 @@ assign_new_owner: goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } @@ -435,52 +484,34 @@ assign_new_owner: static void exit_mm(void) { struct mm_struct *mm = current->mm; - struct core_state *core_state; exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); - /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - - up_read(&mm->mmap_sem); - - self.task = current; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - freezable_schedule(); - } - __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); - } + mmap_read_lock(mm); mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. + */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - up_read(&mm->mmap_sem); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -675,7 +706,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { @@ -708,54 +739,42 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif +static void synchronize_group_exit(struct task_struct *tsk, long code) +{ + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } + spin_unlock_irq(&sighand->siglock); +} + void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + synchronize_group_exit(tsk, code); - WARN_ON(blk_needs_flush_plug(tsk)); + WARN_ON(tsk->plug); - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - set_fs(USER_DS); + kcov_task_exit(tsk); + kmsan_task_exit(tsk); + coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); @@ -772,7 +791,7 @@ void __noreturn do_exit(long code) #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); @@ -800,7 +819,6 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent @@ -841,6 +859,7 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @@ -852,16 +871,46 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); -void complete_and_exit(struct completion *comp, long code) +void __noreturn make_task_dead(int signr) { - if (comp) - complete(comp); + /* + * Take the task off the cpu after something catastrophic has + * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ + struct task_struct *tsk = current; - do_exit(code); + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } + + do_exit(signr); } -EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { @@ -872,22 +921,24 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { + else if (sig->group_exec_task) + exit_code = 0; + else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -982,7 +1033,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); @@ -1024,7 +1076,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * p->signal fields because the whole thread group is dead * and nobody can change them. * - * psig->stats_lock also protects us from our sub-theads + * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. @@ -1416,9 +1468,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent) TASK_INTERRUPTIBLE, p); } +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) +{ + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) +{ + bool ptrace; + struct task_struct *target; + int retval; + + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + static long do_wait(struct wait_opts *wo) { - struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); @@ -1440,19 +1531,27 @@ repeat: set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - retval = ptrace_do_wait(wo, tsk); + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) goto end; + } else { + struct task_struct *tsk = current; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + do { + retval = do_wait_thread(wo, tsk); + if (retval) + goto end; + + retval = ptrace_do_wait(wo, tsk); + if (retval) + goto end; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: @@ -1470,23 +1569,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) - get_pid(pid); - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { @@ -1494,6 +1576,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, struct pid *pid = NULL; enum pid_type type; long ret; + unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1527,9 +1610,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, if (upid < 0) return -EINVAL; - pid = pidfd_get_pid(upid); + pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); + break; default: return -EINVAL; @@ -1540,7 +1624,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo.wo_flags |= WNOHANG; + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + ret = -EAGAIN; put_pid(pid); return ret; @@ -1563,7 +1652,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1572,10 +1661,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1622,6 +1711,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { @@ -1690,7 +1795,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1699,14 +1804,38 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif +/** + * thread_group_exited - check that a thread group has exited + * @pid: tgid of thread group to be checked. + * + * Test if the thread group represented by tgid has exited (all + * threads are zombies, dead or completely gone). + * + * Return: true if the thread group has exited. false otherwise. + */ +bool thread_group_exited(struct pid *pid) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && thread_group_empty(task)); + rcu_read_unlock(); + + return exited; +} +EXPORT_SYMBOL(thread_group_exited); + __weak void abort(void) { BUG(); |