From a51e91981870d013fcfcc08b0117997edbcbc7a7 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 24 Mar 2011 14:00:18 +0100 Subject: sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks sched_setscheduler() (in sched.c) is called in order to change the scheduling policy and/or the real-time priority of a task. Thus, if we find out that neither of those is actually being modified, it is possible to return earlier and save the overhead of a full deactivate+activate cycle of the task in question. Besides that, if we have more than one SCHED_FIFO task with the same priority on the same rq (which means they share the same priority queue) having one of them changing its position in the priority queue because of a sched_setscheduler (as it happens by means of the deactivate+activate) that does not actually change the priority violates POSIX which states, for SCHED_FIFO: "If a thread whose policy or priority has been modified by pthread_setschedprio() is a running thread or is runnable, the effect on its position in the thread list depends on the direction of the modification, as follows: a. <...> b. If the priority is unchanged, the thread does not change position in the thread list. c. <...>" http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html (ed: And the POSIX specification here does, briefly and somewhat unexpectedly, match what common sense tells us as well. 
) Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra LKML-Reference: <1300971618.3960.82.camel@Palantir> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..a8845516ace6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5011,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* -- cgit v1.3-14-g43fede From 3436ae1298cb22d722a6520fc97f112dd767a9e1 Mon Sep 17 00:00:00 2001 From: Sisir Koppaka Date: Sat, 26 Mar 2011 18:22:55 +0530 Subject: sched: Fix rebalance interval calculation The interval for checking scheduling domains if they are due to be balanced currently depends on boot state NR_CPUS, which may not accurately reflect the number of online CPUs at the time of check. Thus replace NR_CPUS with num_online_cpus(). 
(ed: Should only affect those who set NR_CPUS really high, such as 4096 or so :-) Signed-off-by: Sisir Koppaka Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..c7ec5c8e7b44 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/cpumask.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -3850,8 +3851,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + if (interval > HZ*num_online_cpus()/10) + interval = HZ*num_online_cpus()/10; need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.3-14-g43fede From 20443384fe090c5f8aeb016e7e85659c5bbdd69f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 03:33:29 +0200 Subject: perf: Rebase max unprivileged mlock threshold on top of page size Ensure we allow 512 kiB + 1 page for user control without assuming a 4096 bytes page size. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: LKML-Reference: <1301535209-9679-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c4d1e2..261690923ffb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,8 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -/* Minimum for 128 pages + 1 for the user control page */ -int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate -- cgit v1.3-14-g43fede From fd1edb3aa2c1d92618d8f0c6d15d44ea41fcac6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Mar 2011 13:13:56 +0200 Subject: perf: Fix task_struct reference leak sys_perf_event_open() had an imbalance in the number of task refs it took causing memory leakage Cc: Jiri Olsa Cc: Oleg Nesterov Cc: stable@kernel.org # .37+ Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 261690923ffb..27960f114efd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6531,6 +6531,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ -- cgit v1.3-14-g43fede From 4352d9d44b935e4d000be6ec89ddb55c2bf35f24 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Mon, 4 Apr 2011 08:31:23 -0700 Subject: ntp: fix non privileged system 
time shifting The ADJ_SETOFFSET bit added in commit 094aa188 ("ntp: Add ADJ_SETOFFSET mode bit") also introduced a way for any user to change the system time. Sneaky or buggy calls to adjtimex() could set ADJ_OFFSET_SS_READ | ADJ_SETOFFSET which would result in a successful call to timekeeping_inject_offset(). This patch fixes the issue by adding the capability check. Signed-off-by: Richard Cochran Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5f1bb8e2008f..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc) struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); -- cgit v1.3-14-g43fede From 5aba085ededa6c5a1ff465e2aebc3e8eb00a7567 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 14:59:31 -0700 Subject: kernel/signal.c: fix typos and coding style General coding style and comment fixes; no code changes: - Use multi-line-comment coding style. - Put some function signatures completely on one line. - Hyphenate some words. - Spell Posix as POSIX. - Correct typos & spellos in some comments. - Drop trailing whitespace. - End sentences with periods. 
Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 90 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1186cf7fac77..3ab90e8b6ecf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +434,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. 
So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -923,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. It is implementation + * defined whether kill() does so. We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. 
+ */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1201,8 +1203,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1299,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1368,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1553,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1611,7 +1611,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. + * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1735,7 +1735,7 @@ void ptrace_notify(int exit_code) /* * This performs the stopping for SIGSTOP and other stop signals. 
* We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) @@ -1823,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -2034,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to + /* + * It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. */ @@ -2183,7 +2186,7 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { @@ -2233,9 +2236,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. 
*/ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2280,7 +2283,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - + /* * Invert the set of allowed signals to get those we * want to block. @@ -2305,9 +2308,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + (ts.tv_sec || ts.tv_nsec)); if (timeout) { - /* None ready -- temporarily unblock those we're + /* + * None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ + * be awakened when they arrive. + */ current->real_blocked = current->blocked; sigandsets(&current->blocked, &current->blocked, &these); recalc_sigpending(); @@ -2553,12 +2558,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. */ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2600,8 +2604,10 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/* + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. 
+ */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, old_sigset_t __user *, oset) -- cgit v1.3-14-g43fede From 41c57892a2895865afc89ff1a21f91a0f1506f66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 15:00:26 -0700 Subject: kernel/signal.c: add kernel-doc notation to syscalls Add kernel-doc to syscalls in signal.c. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3ab90e8b6ecf..dc17929ab78a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2075,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. */ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current_thread_info()->restart_block; @@ -2128,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, sigset_t __user *, oset, size_t, sigsetsize) { @@ -2188,6 +2198,12 @@ out: return error; } +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2267,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if 
non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) @@ -2344,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2419,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2431,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2596,6 +2635,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2604,7 +2647,12 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @set: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. 
*/ @@ -2660,6 +2708,13 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: the thread group ID of the thread + * @oact: the PID of the thread + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2746,6 +2801,12 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.3-14-g43fede