From 38498a67aa2cf8c80754b8d304bfacc10bc582b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Apr 2012 13:05:44 +0000 Subject: smp: Add generic smpboot facility Start a new file, which will hold SMP and CPU hotplug related generic infrastructure. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul E. McKenney Cc: Srivatsa S. Bhat Cc: Matt Turner Cc: Russell King Cc: Mike Frysinger Cc: Jesper Nilsson Cc: Richard Kuo Cc: Tony Luck Cc: Hirokazu Takata Cc: Ralf Baechle Cc: David Howells Cc: James E.J. Bottomley Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: David S. Miller Cc: Chris Metcalf Cc: Richard Weinberger Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20120420124557.035417523@linutronix.de --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index cb41b9547c9f..6c07f30fa9b7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -- cgit v1.3-8-gc7d7 From e73f8959af0439d114847eab5a8a5ce48f1217c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:07 +1000 Subject: task_work_add: generic process-context callbacks Provide a simple mechanism that allows running code in the (nonatomic) context of the arbitrary task. The caller does task_work_add(task, task_work) and this task executes task_work->func() either from do_notify_resume() or from do_exit(). The callback can rely on PF_EXITING to detect the latter case. "struct task_work" can be embedded in another struct, still it has "void *data" to handle the most common/simple case. This allows us to kill the ->replacement_session_keyring hack, and potentially this can have more users. Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into tracehook_notify_resume() and do_exit(). But at the same time we can remove the "replacement_session_keyring != NULL" checks from arch/*/signal.c and exit_creds(). Note: task_work_add/task_work_run abuses ->pi_lock. This is only because this lock is already used by lookup_pi_state() to synchronize with do_exit() setting PF_EXITING. Fortunately the scope of this lock in task_work.c is really tiny, and the code is unlikely anyway. Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Thomas Gleixner Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/sched.h | 2 ++ include/linux/task_work.h | 33 +++++++++++++++++++ include/linux/tracehook.h | 11 +++++++ kernel/Makefile | 2 +- kernel/exit.c | 5 ++- kernel/fork.c | 1 + kernel/task_work.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 include/linux/task_work.h create mode 100644 kernel/task_work.c (limited to 'kernel/Makefile') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ea8baea9387..7930131abc1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,6 +1400,8 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; + struct hlist_head task_works; + struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; diff --git a/include/linux/task_work.h b/include/linux/task_work.h new file mode 100644 index 000000000000..294d5d5e90b1 --- /dev/null +++ b/include/linux/task_work.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_TASK_WORK_H +#define _LINUX_TASK_WORK_H + +#include +#include + +struct task_work; +typedef void (*task_work_func_t)(struct task_work *); + +struct task_work { + struct hlist_node hlist; + task_work_func_t func; + void *data; +}; + +static inline void +init_task_work(struct task_work *twork, task_work_func_t func, void *data) +{ + twork->func = func; + twork->data = data; +} + +int task_work_add(struct task_struct *task, struct task_work *twork, bool); +struct task_work *task_work_cancel(struct task_struct *, task_work_func_t); +void task_work_run(void); + +static inline void exit_task_work(struct task_struct *task) +{ + if (unlikely(!hlist_empty(&task->task_works))) + task_work_run(); +} + +#endif /* _LINUX_TASK_WORK_H */ diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b9ca903bb553..b2dd0917ca0d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -49,6 +49,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -164,8 +165,10 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, */ static inline void set_notify_resume(struct task_struct *task) { +#ifdef TIF_NOTIFY_RESUME if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); +#endif } /** @@ -185,6 +188,14 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) { if (current->replacement_session_keyring) key_replace_session_keyring(); + /* + * The caller just cleared TIF_NOTIFY_RESUME. This barrier + * pairs with task_work_add()->set_notify_resume() after + * hlist_add_head(task->task_works); + */ + smp_mb__after_clear_bit(); + if (unlikely(!hlist_empty(¤t->task_works))) + task_work_run(); } #endif /* */ diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..bf1034008aca 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17a..3d93325e0b1a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -946,11 +946,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_task_work(tsk); + exit_irq_thread(); if (unlikely(in_atomic())) diff --git a/kernel/fork.c b/kernel/fork.c index 05c813dc9ecc..a46db217a589 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1411,6 +1411,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include +#include +#include + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} -- cgit v1.3-8-gc7d7 From eea62f831b8030b0eeea8314eed73b6132d1de26 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 8 May 2012 13:32:24 +0930 Subject: brlocks/lglocks: turn into functions lglocks and brlocks are currently generated with some complicated macros in lglock.h. But there's no reason to not just use common utility functions and put all the data into a common data structure. Since there are at least two users it makes sense to share this code in a library. This is also easier maintainable than a macro forest. This will also make it later possible to dynamically allocate lglocks and also use them in modules (this would both still need some additional, but now straightforward, code) [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Al Viro Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell Signed-off-by: Al Viro --- fs/file_table.c | 1 - fs/internal.h | 2 +- include/linux/lglock.h | 125 ++++++++++--------------------------------------- kernel/Makefile | 2 +- kernel/lglock.c | 89 +++++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 102 deletions(-) create mode 100644 kernel/lglock.c (limited to 'kernel/Makefile') diff --git a/fs/file_table.c b/fs/file_table.c index 70f2a0fd6aec..f5c67c59ec10 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -34,7 +34,6 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -DECLARE_LGLOCK(files_lglock); DEFINE_LGLOCK(files_lglock); /* SLAB cache for file structures */ diff --git a/fs/internal.h b/fs/internal.h index 9962c59ba280..8040af489c78 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -DECLARE_BRLOCK(vfsmount_lock); +extern struct lglock vfsmount_lock; /* diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0fdd821e77b7..f01e5f6d1f07 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -23,26 +23,17 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ -#define br_lock_init(name) name##_lock_init() -#define br_read_lock(name) name##_local_lock() -#define br_read_unlock(name) name##_local_unlock() -#define br_write_lock(name) name##_global_lock() -#define br_write_unlock(name) name##_global_unlock() +#define br_lock_init(name) lg_lock_init(name, #name) +#define br_read_lock(name) lg_local_lock(name) +#define br_read_unlock(name) lg_local_unlock(name) +#define br_write_lock(name) lg_global_lock(name) +#define br_write_unlock(name) lg_global_unlock(name) -#define DECLARE_BRLOCK(name) DECLARE_LGLOCK(name) #define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name) - -#define lg_lock_init(name) name##_lock_init() -#define lg_local_lock(name) name##_local_lock() -#define lg_local_unlock(name) name##_local_unlock() -#define lg_local_lock_cpu(name, cpu) name##_local_lock_cpu(cpu) -#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu) -#define lg_global_lock(name) name##_global_lock() -#define lg_global_unlock(name) name##_global_unlock() - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define LOCKDEP_INIT_MAP lockdep_init_map @@ -57,90 +48,26 @@ #define DEFINE_LGLOCK_LOCKDEP(name) #endif - -#define DECLARE_LGLOCK(name) \ - extern void name##_lock_init(void); \ - extern void name##_local_lock(void); \ - extern void name##_local_unlock(void); \ - extern void name##_local_lock_cpu(int cpu); \ - extern void name##_local_unlock_cpu(int cpu); \ - extern void name##_global_lock(void); \ - extern void name##_global_unlock(void); \ +struct lglock { + arch_spinlock_t __percpu *lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lock_key; + struct lockdep_map lock_dep_map; +#endif +}; #define DEFINE_LGLOCK(name) \ - \ - DEFINE_SPINLOCK(name##_cpu_lock); \ - DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ - DEFINE_LGLOCK_LOCKDEP(name); \ - \ - void name##_lock_init(void) { \ - int i; \ - LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ - } \ - } \ - EXPORT_SYMBOL(name##_lock_init); \ - \ - void name##_local_lock(void) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock); \ - \ - void name##_local_unlock(void) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock); \ - \ - void name##_local_lock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock_cpu); \ - \ - void name##_local_unlock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock_cpu); \ - \ - void name##_global_lock(void) { \ - int i; \ - preempt_disable(); \ - rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_lock(lock); \ - } \ - } \ - EXPORT_SYMBOL(name##_global_lock); \ - \ - void name##_global_unlock(void) { \ - int i; \ - rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_unlock(lock); \ - } \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_global_unlock); + DEFINE_LGLOCK_LOCKDEP(name); \ + DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ + = __ARCH_SPIN_LOCK_UNLOCKED; \ + struct lglock name = { .lock = &name ## _lock } + +void lg_lock_init(struct lglock *lg, char *name); +void lg_local_lock(struct lglock *lg); +void lg_local_unlock(struct lglock *lg); +void lg_local_lock_cpu(struct lglock *lg, int cpu); +void lg_local_unlock_cpu(struct lglock *lg, int cpu); +void lg_global_lock(struct lglock *lg); +void lg_global_unlock(struct lglock *lg); + #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..296132c19a57 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + async.o range.o groups.o lglock.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include +#include +#include +#include + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.3-8-gc7d7 From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: syscalls, x86: add __NR_kcmp syscall While doing the checkpoint-restore in the user space one need to determine whether various kernel objects (like mm_struct-s of file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there's currently no ways for solving the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info considered to be not that good for security reasons. Thus after some debates we end up in conclusion that using that named 'comparison' syscall might be the best candidate. So here is it -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (which characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Glauber Costa Cc: Andi Kleen Cc: Tejun Heo Cc: Matt Helsley Cc: Pekka Enberg Cc: Eric Dumazet Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 + include/linux/kcmp.h | 17 +++ include/linux/syscalls.h | 2 + kernel/Makefile | 3 + kernel/kcmp.c | 196 +++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kcmp/Makefile | 29 +++++ tools/testing/selftests/kcmp/kcmp_test.c | 94 +++++++++++++++ 10 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 include/linux/kcmp.h create mode 100644 kernel/kcmp.c create mode 100644 tools/testing/selftests/kcmp/Makefile create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c (limited to 'kernel/Makefile') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 29f9f0554f7d..7a35a6e71d44 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -355,3 +355,4 @@ 346 i386 setns sys_setns 347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index dd29a9ea27c5..51171aeff0dc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -318,6 +318,8 @@ 309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +312 64 kcmp sys_kcmp + # # x32-specific system call numbers start at 512 to avoid cache impact # for native 64-bit operation. diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h new file mode 100644 index 000000000000..2dcd1b3aafc8 --- /dev/null +++ b/include/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_KCMP_H +#define _LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _LINUX_KCMP_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3de3acb84a95..19439c75c5b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, unsigned long riovcnt, unsigned long flags); +asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, + unsigned long idx1, unsigned long idx2); #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..80be6ca0cc75 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointers values and compare the production instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor production with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique ever after the high bits are truncated, since any odd number is + * relative prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. + */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 14972017a43e..a4162e15c25f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints mqueue vm +TARGETS = breakpoints kcmp mqueue vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile new file mode 100644 index 000000000000..dc79b86ea65c --- /dev/null +++ b/tools/testing/selftests/kcmp/Makefile @@ -0,0 +1,29 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../arch/x86/include/generated/ +CFLAGS += -I../../../../include/ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../arch/x86/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) kcmp_test.c -o run_test +else + echo "Not an x86 target, can't build kcmp selftest" +endif + +run-tests: all + ./kcmp_test + +clean: + rm -fr ./run_test + rm -fr ./test-file diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c new file mode 100644 index 000000000000..358cc6bfa35d --- /dev/null +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -0,0 +1,94 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2) +{ + return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2); +} + +int main(int argc, char **argv) +{ + const char kpath[] = "kcmp-test-file"; + int pid1, pid2; + int fd1, fd2; + int status; + + fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); + pid1 = getpid(); + + if (fd1 < 0) { + perror("Can't create file"); + exit(1); + } + + pid2 = fork(); + if (pid2 < 0) { + perror("fork failed"); + exit(1); + } + + if (!pid2) { + int pid2 = getpid(); + int ret; + + fd2 = open(kpath, O_RDWR, 0644); + if (fd2 < 0) { + perror("Can't open file"); + exit(1); + } + + /* An example of output and arguments */ + printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld " + "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld " + "INV: %2ld\n", + pid1, pid2, + sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2), + sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0), + sys_kcmp(pid1, pid2, KCMP_VM, 0, 0), + sys_kcmp(pid1, pid2, KCMP_FS, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0), + sys_kcmp(pid1, pid2, KCMP_IO, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0), + + /* This one should fail */ + sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0)); + + /* This one should return same fd */ + ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); + if (ret) { + printf("FAIL: 0 expected but %d returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + /* Compare with self */ + ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); + if (ret) { + printf("FAIL: 0 expected but %li returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + exit(ret); + } + + waitpid(pid2, &status, P_ALL); + + return 0; +} -- cgit v1.3-8-gc7d7