From 38498a67aa2cf8c80754b8d304bfacc10bc582b5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 20 Apr 2012 13:05:44 +0000
Subject: smp: Add generic smpboot facility

Start a new file, which will hold SMP and CPU hotplug related generic
infrastructure.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: James E.J. Bottomley <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20120420124557.035417523@linutronix.de
---
 kernel/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/Makefile')

diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..6c07f30fa9b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
-- 
cgit v1.3-8-gc7d7


From e73f8959af0439d114847eab5a8a5ce48f1217c4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 11 May 2012 10:59:07 +1000
Subject: task_work_add: generic process-context callbacks

Provide a simple mechanism that allows running code in the (nonatomic)
context of the arbitrary task.

The caller does task_work_add(task, task_work) and this task executes
task_work->func() either from do_notify_resume() or from do_exit().  The
callback can rely on PF_EXITING to detect the latter case.

"struct task_work" can be embedded in another struct, still it has "void
*data" to handle the most common/simple case.

This allows us to kill the ->replacement_session_keyring hack, and
potentially this can have more users.

Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into
tracehook_notify_resume() and do_exit().  But at the same time we can
remove the "replacement_session_keyring != NULL" checks from
arch/*/signal.c and exit_creds().

Note: task_work_add/task_work_run abuses ->pi_lock.  This is only because
this lock is already used by lookup_pi_state() to synchronize with
do_exit() setting PF_EXITING.  Fortunately the scope of this lock in
task_work.c is really tiny, and the code is unlikely anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Gordeev <agordeev@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Smith <dsmith@redhat.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/sched.h     |  2 ++
 include/linux/task_work.h | 33 +++++++++++++++++++
 include/linux/tracehook.h | 11 +++++++
 kernel/Makefile           |  2 +-
 kernel/exit.c             |  5 ++-
 kernel/fork.c             |  1 +
 kernel/task_work.c        | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/task_work.h
 create mode 100644 kernel/task_work.c

(limited to 'kernel/Makefile')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ea8baea9387..7930131abc1a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1400,6 +1400,8 @@ struct task_struct {
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
+	struct hlist_head task_works;
+
 	struct audit_context *audit_context;
 #ifdef CONFIG_AUDITSYSCALL
 	uid_t loginuid;
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
new file mode 100644
index 000000000000..294d5d5e90b1
--- /dev/null
+++ b/include/linux/task_work.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_TASK_WORK_H
+#define _LINUX_TASK_WORK_H
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct task_work;
+typedef void (*task_work_func_t)(struct task_work *);
+
+struct task_work {
+	struct hlist_node hlist;
+	task_work_func_t func;
+	void *data;
+};
+
+static inline void
+init_task_work(struct task_work *twork, task_work_func_t func, void *data)
+{
+	twork->func = func;
+	twork->data = data;
+}
+
+int task_work_add(struct task_struct *task, struct task_work *twork, bool);
+struct task_work *task_work_cancel(struct task_struct *, task_work_func_t);
+void task_work_run(void);
+
+static inline void exit_task_work(struct task_struct *task)
+{
+	if (unlikely(!hlist_empty(&task->task_works)))
+		task_work_run();
+}
+
+#endif	/* _LINUX_TASK_WORK_H */
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index b9ca903bb553..b2dd0917ca0d 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -49,6 +49,7 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
+#include <linux/task_work.h>
 struct linux_binprm;
 
 /*
@@ -164,8 +165,10 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
  */
 static inline void set_notify_resume(struct task_struct *task)
 {
+#ifdef TIF_NOTIFY_RESUME
 	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
 		kick_process(task);
+#endif
 }
 
 /**
@@ -185,6 +188,14 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 {
 	if (current->replacement_session_keyring)
 		key_replace_session_keyring();
+	/*
+	 * The caller just cleared TIF_NOTIFY_RESUME. This barrier
+	 * pairs with task_work_add()->set_notify_resume() after
+	 * hlist_add_head(task->task_works);
+	 */
+	smp_mb__after_clear_bit();
+	if (unlikely(!hlist_empty(&current->task_works)))
+		task_work_run();
 }
 
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..bf1034008aca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,7 +5,7 @@
 obj-y     = fork.o exec_domain.o panic.o printk.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
-	    signal.o sys.o kmod.o workqueue.o pid.o \
+	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
diff --git a/kernel/exit.c b/kernel/exit.c
index 910a0716e17a..3d93325e0b1a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -946,11 +946,14 @@ void do_exit(long code)
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
-	 * an exiting task cleaning up the robust pi futexes.
+	 * an exiting task cleaning up the robust pi futexes, and in
+	 * task_work_add() to avoid the race with exit_task_work().
 	 */
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 
+	exit_task_work(tsk);
+
 	exit_irq_thread();
 
 	if (unlikely(in_atomic()))
diff --git a/kernel/fork.c b/kernel/fork.c
index 05c813dc9ecc..a46db217a589 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1411,6 +1411,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
+	INIT_HLIST_HEAD(&p->task_works);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
diff --git a/kernel/task_work.c b/kernel/task_work.c
new file mode 100644
index 000000000000..82d1c794066d
--- /dev/null
+++ b/kernel/task_work.c
@@ -0,0 +1,84 @@
+#include <linux/spinlock.h>
+#include <linux/task_work.h>
+#include <linux/tracehook.h>
+
+int
+task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
+{
+	unsigned long flags;
+	int err = -ESRCH;
+
+#ifndef TIF_NOTIFY_RESUME
+	if (notify)
+		return -ENOTSUPP;
+#endif
+	/*
+	 * We must not insert the new work if the task has already passed
+	 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
+	 * and check PF_EXITING under pi_lock.
+	 */
+	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	if (likely(!(task->flags & PF_EXITING))) {
+		hlist_add_head(&twork->hlist, &task->task_works);
+		err = 0;
+	}
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	/* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
+	if (likely(!err) && notify)
+		set_notify_resume(task);
+	return err;
+}
+
+struct task_work *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+	unsigned long flags;
+	struct task_work *twork;
+	struct hlist_node *pos;
+
+	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
+		if (twork->func == func) {
+			hlist_del(&twork->hlist);
+			goto found;
+		}
+	}
+	twork = NULL;
+ found:
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	return twork;
+}
+
+void task_work_run(void)
+{
+	struct task_struct *task = current;
+	struct hlist_head task_works;
+	struct hlist_node *pos;
+
+	raw_spin_lock_irq(&task->pi_lock);
+	hlist_move_list(&task->task_works, &task_works);
+	raw_spin_unlock_irq(&task->pi_lock);
+
+	if (unlikely(hlist_empty(&task_works)))
+		return;
+	/*
+	 * We use hlist to save the space in task_struct, but we want fifo.
+	 * Find the last entry, the list should be short, then process them
+	 * in reverse order.
+	 */
+	for (pos = task_works.first; pos->next; pos = pos->next)
+		;
+
+	for (;;) {
+		struct hlist_node **pprev = pos->pprev;
+		struct task_work *twork = container_of(pos, struct task_work,
+							hlist);
+		twork->func(twork);
+
+		if (pprev == &task_works.first)
+			break;
+		pos = container_of(pprev, struct hlist_node, next);
+	}
+}
-- 
cgit v1.3-8-gc7d7


From eea62f831b8030b0eeea8314eed73b6132d1de26 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 8 May 2012 13:32:24 +0930
Subject: brlocks/lglocks: turn into functions

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason to not just use common utility
functions and put all the data into a common data structure.

Since there are at least two users it makes sense to share this code in a
library.  This is also easier maintainable than a macro forest.

This will also make it later possible to dynamically allocate lglocks and
also use them in modules (this would both still need some additional, but
now straightforward, code)

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c        |   1 -
 fs/internal.h          |   2 +-
 include/linux/lglock.h | 125 ++++++++++---------------------------------------
 kernel/Makefile        |   2 +-
 kernel/lglock.c        |  89 +++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 102 deletions(-)
 create mode 100644 kernel/lglock.c

(limited to 'kernel/Makefile')

diff --git a/fs/file_table.c b/fs/file_table.c
index 70f2a0fd6aec..f5c67c59ec10 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
diff --git a/fs/internal.h b/fs/internal.h
index 9962c59ba280..8040af489c78 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
 
 
 /*
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0fdd821e77b7..f01e5f6d1f07 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -23,26 +23,17 @@
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
-#define br_read_lock(name)	name##_local_lock()
-#define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock()
-#define br_write_unlock(name)	name##_global_unlock()
+#define br_lock_init(name)	lg_lock_init(name, #name)
+#define br_read_lock(name)	lg_local_lock(name)
+#define br_read_unlock(name)	lg_local_unlock(name)
+#define br_write_lock(name)	lg_global_lock(name)
+#define br_write_unlock(name)	lg_global_unlock(name)
 
-#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)	name##_lock_init()
-#define lg_local_lock(name)	name##_local_lock()
-#define lg_local_unlock(name)	name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)	name##_global_lock()
-#define lg_global_unlock(name)	name##_global_unlock()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
@@ -57,90 +48,26 @@
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
-
-#define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
- extern void name##_local_lock(void);					\
- extern void name##_local_unlock(void);					\
- extern void name##_local_lock_cpu(int cpu);				\
- extern void name##_local_unlock_cpu(int cpu);				\
- extern void name##_global_lock(void);					\
- extern void name##_global_unlock(void);				\
+struct lglock {
+	arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lock_key;
+	struct lockdep_map    lock_dep_map;
+#endif
+};
 
 #define DEFINE_LGLOCK(name)						\
-									\
- DEFINE_SPINLOCK(name##_cpu_lock);					\
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
- DEFINE_LGLOCK_LOCKDEP(name);						\
-									\
- void name##_lock_init(void) {						\
-	int i;								\
-	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_lock_init);					\
-									\
- void name##_local_lock(void) {						\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock);					\
-									\
- void name##_local_unlock(void) {					\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock);					\
-									\
- void name##_local_lock_cpu(int cpu) {					\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock_cpu);					\
-									\
- void name##_local_unlock_cpu(int cpu) {				\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock_cpu);				\
-									\
- void name##_global_lock(void) {					\
-	int i;								\
-	preempt_disable();						\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock);					\
-									\
- void name##_global_unlock(void) {					\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_global_unlock);
+	DEFINE_LGLOCK_LOCKDEP(name);					\
+	DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)			\
+	= __ARCH_SPIN_LOCK_UNLOCKED;					\
+	struct lglock name = { .lock = &name ## _lock }
+
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..296132c19a57 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
index 000000000000..6535a667a5a7
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+
+	preempt_disable();
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
-- 
cgit v1.3-8-gc7d7


From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 31 May 2012 16:26:44 -0700
Subject: syscalls, x86: add __NR_kcmp syscall

While doing the checkpoint-restore in the user space one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are
shared between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the
unshare syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g.  mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparison' syscall might be the best candidate.  So here is it --
__NR_kcmp.

It takes up to 5 arguments - the pids of the two tasks (which
characteristics should be compared), the comparison type and (in case of
comparison of files) two file descriptors.

Lookups for pids are done in the caller's PID namespace only.

At moment only x86 is supported and tested.

[akpm@linux-foundation.org: fix up selftests, warnings]
[akpm@linux-foundation.org: include errno.h]
[akpm@linux-foundation.org: tweak comment text]
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Valdis.Kletnieks@vt.edu
Cc: Michal Marek <mmarek@suse.cz>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/syscalls/syscall_32.tbl         |   1 +
 arch/x86/syscalls/syscall_64.tbl         |   2 +
 include/linux/kcmp.h                     |  17 +++
 include/linux/syscalls.h                 |   2 +
 kernel/Makefile                          |   3 +
 kernel/kcmp.c                            | 196 +++++++++++++++++++++++++++++++
 kernel/sys_ni.c                          |   3 +
 tools/testing/selftests/Makefile         |   2 +-
 tools/testing/selftests/kcmp/Makefile    |  29 +++++
 tools/testing/selftests/kcmp/kcmp_test.c |  94 +++++++++++++++
 10 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kcmp.h
 create mode 100644 kernel/kcmp.c
 create mode 100644 tools/testing/selftests/kcmp/Makefile
 create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c

(limited to 'kernel/Makefile')

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f0554f7d..7a35a6e71d44 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
 346	i386	setns			sys_setns
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+349	i386	kcmp			sys_kcmp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9ea27c5..51171aeff0dc 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
 309	common	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+312	64	kcmp			sys_kcmp
+
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h
new file mode 100644
index 000000000000..2dcd1b3aafc8
--- /dev/null
+++ b/include/linux/kcmp.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparison type */
+enum kcmp_type {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#endif /* _LINUX_KCMP_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3de3acb84a95..19439c75c5b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 				      unsigned long riovcnt,
 				      unsigned long flags);
 
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..80be6ca0cc75 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,9 @@ endif
 obj-y += sched/
 obj-y += power/
 
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/kcmp.h>
+
+#include <asm/unistd.h>
+
+/*
+ * We don't expose the real in-memory order of objects for security reasons.
+ * But still the comparison results should be suitable for sorting. So we
+ * obfuscate kernel pointers values and compare the production instead.
+ *
+ * The obfuscation is done in two steps. First we xor the kernel pointer with
+ * a random value, which puts pointer into a new position in a reordered space.
+ * Secondly we multiply the xor production with a large odd random number to
+ * permute its bits even more (the odd multiplier guarantees that the product
+ * is unique ever after the high bits are truncated, since any odd number is
+ * relative prime to 2^n).
+ *
+ * Note also that the obfuscation itself is invisible to userspace and if needed
+ * it can be changed to an alternate scheme.
+ */
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+	return (v ^ cookies[type][0]) * cookies[type][1];
+}
+
+/*
+ * 0 - equal, i.e. v1 = v2
+ * 1 - less than, i.e. v1 < v2
+ * 2 - greater than, i.e. v1 > v2
+ * 3 - not equal but ordering unavailable (reserved for future)
+ */
+static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+{
+	long ret;
+
+	ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+
+	return (ret < 0) | ((ret > 0) << 1);
+}
+
+/* The caller must have pinned the task */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct file *file = NULL;
+
+	task_lock(task);
+	rcu_read_lock();
+
+	if (task->files)
+		file = fcheck_files(task->files, idx);
+
+	rcu_read_unlock();
+	task_unlock(task);
+
+	return file;
+}
+
+static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+{
+	if (likely(m2 != m1))
+		mutex_unlock(m2);
+	mutex_unlock(m1);
+}
+
+static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+{
+	int err;
+
+	if (m2 > m1)
+		swap(m1, m2);
+
+	err = mutex_lock_killable(m1);
+	if (!err && likely(m1 != m2)) {
+		err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+		if (err)
+			mutex_unlock(m1);
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1, *task2;
+	int ret;
+
+	rcu_read_lock();
+
+	/*
+	 * Tasks are looked up in caller's PID namespace only.
+	 */
+	task1 = find_task_by_vpid(pid1);
+	task2 = find_task_by_vpid(pid2);
+	if (!task1 || !task2)
+		goto err_no_task;
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	/*
+	 * One should have enough rights to inspect task details.
+	 */
+	ret = kcmp_lock(&task1->signal->cred_guard_mutex,
+			&task2->signal->cred_guard_mutex);
+	if (ret)
+		goto err;
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EPERM;
+		goto err_unlock;
+	}
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
+		else
+			ret = -EBADF;
+		break;
+	}
+	case KCMP_VM:
+		ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
+		break;
+	case KCMP_FILES:
+		ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
+		break;
+	case KCMP_FS:
+		ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
+		break;
+	case KCMP_SIGHAND:
+		ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
+		break;
+	case KCMP_IO:
+		ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = kcmp_ptr(task1->sysvsem.undo_list,
+			       task2->sysvsem.undo_list,
+			       KCMP_SYSVSEM);
+#else
+		ret = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+err_unlock:
+	kcmp_unlock(&task1->signal->cred_guard_mutex,
+		    &task2->signal->cred_guard_mutex);
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+
+err_no_task:
+	rcu_read_unlock();
+	return -ESRCH;
+}
+
+static __init int kcmp_cookies_init(void)
+{
+	int i;
+
+	get_random_bytes(cookies, sizeof(cookies));
+
+	for (i = 0; i < KCMP_TYPES; i++)
+		cookies[i][1] |= (~(~0UL >>  1) | 1);
+
+	return 0;
+}
+arch_initcall(kcmp_cookies_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
 cond_syscall(sys_name_to_handle_at);
 cond_syscall(sys_open_by_handle_at);
 cond_syscall(compat_sys_open_by_handle_at);
+
+/* compare kernel pointers */
+cond_syscall(sys_kcmp);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 14972017a43e..a4162e15c25f 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
-TARGETS = breakpoints mqueue vm
+TARGETS = breakpoints kcmp mqueue vm
 
 all:
 	for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile
new file mode 100644
index 000000000000..dc79b86ea65c
--- /dev/null
+++ b/tools/testing/selftests/kcmp/Makefile
@@ -0,0 +1,29 @@
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+        ARCH := X86
+	CFLAGS := -DCONFIG_X86_32 -D__i386__
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH := X86
+	CFLAGS := -DCONFIG_X86_64 -D__x86_64__
+endif
+
+CFLAGS += -I../../../../arch/x86/include/generated/
+CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../arch/x86/include/
+
+all:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) kcmp_test.c -o run_test
+else
+	echo "Not an x86 target, can't build kcmp selftest"
+endif
+
+run-tests: all
+	./kcmp_test
+
+clean:
+	rm -fr ./run_test
+	rm -fr ./test-file
diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c
new file mode 100644
index 000000000000..358cc6bfa35d
--- /dev/null
+++ b/tools/testing/selftests/kcmp/kcmp_test.c
@@ -0,0 +1,94 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <linux/unistd.h>
+#include <linux/kcmp.h>
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2)
+{
+	return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2);
+}
+
+int main(int argc, char **argv)
+{
+	const char kpath[] = "kcmp-test-file";
+	int pid1, pid2;
+	int fd1, fd2;
+	int status;
+
+	fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644);
+	pid1 = getpid();
+
+	if (fd1 < 0) {
+		perror("Can't create file");
+		exit(1);
+	}
+
+	pid2 = fork();
+	if (pid2 < 0) {
+		perror("fork failed");
+		exit(1);
+	}
+
+	if (!pid2) {
+		int pid2 = getpid();
+		int ret;
+
+		fd2 = open(kpath, O_RDWR, 0644);
+		if (fd2 < 0) {
+			perror("Can't open file");
+			exit(1);
+		}
+
+		/* An example of output and arguments */
+		printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld "
+		       "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld "
+		       "INV: %2ld\n",
+		       pid1, pid2,
+		       sys_kcmp(pid1, pid2, KCMP_FILE,		fd1, fd2),
+		       sys_kcmp(pid1, pid2, KCMP_FILES,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_VM,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_FS,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SIGHAND,	0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_IO,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SYSVSEM,	0, 0),
+
+			/* This one should fail */
+		       sys_kcmp(pid1, pid2, KCMP_TYPES + 1,	0, 0));
+
+		/* This one should return same fd */
+		ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1);
+		if (ret) {
+			printf("FAIL: 0 expected but %d returned\n", ret);
+			ret = -1;
+		} else
+			printf("PASS: 0 returned as expected\n");
+
+		/* Compare with self */
+		ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0);
+		if (ret) {
+			printf("FAIL: 0 expected but %li returned\n", ret);
+			ret = -1;
+		} else
+			printf("PASS: 0 returned as expected\n");
+
+		exit(ret);
+	}
+
+	waitpid(pid2, &status, P_ALL);
+
+	return 0;
+}
-- 
cgit v1.3-8-gc7d7