From 74ba508f60c3650595297b33a4cdfd02e443194e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 3 Mar 2012 18:58:11 -0800
Subject: userns: Remove unnecessary cast to struct user_struct when copying
 cred->user.

In struct cred the user member is and has always been declared struct user_struct *user.
At most a constant struct cred will have a constant pointer to non-constant user_struct
so remove this unnecessary cast.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..f7a43514ac65 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -209,7 +209,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = (struct user_struct *) cred->user;
+			user = cred->user;
 			if (!who)
 				who = cred->uid;
 			else if ((who != cred->uid) &&
@@ -274,7 +274,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = (struct user_struct *) cred->user;
+			user = cred->user;
 			if (!who)
 				who = cred->uid;
 			else if ((who != cred->uid) &&
-- 
cgit v1.2.3-59-g8ed1b


From c4a4d603796c727b9555867571f89483be9c565e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 16 Nov 2011 23:15:31 -0800
Subject: userns: Use cred->user_ns instead of cred->user->user_ns

Optimize performance and prepare for the removal of the user_ns reference
from user_struct.  Remove the slow long walk through cred->user->user_ns and
instead go straight to cred->user_ns.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/ecryptfs/messaging.c      |  2 +-
 ipc/namespace.c              |  2 +-
 kernel/ptrace.c              |  4 ++--
 kernel/sched/core.c          |  2 +-
 kernel/signal.c              |  4 ++--
 kernel/sys.c                 |  8 ++++----
 kernel/user_namespace.c      |  4 ++--
 kernel/utsname.c             |  2 +-
 security/commoncap.c         | 14 +++++++-------
 security/keys/key.c          |  2 +-
 security/keys/permission.c   |  2 +-
 security/keys/process_keys.c |  2 +-
 12 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index ab2248090515..a750f957b145 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -303,7 +303,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 		mutex_unlock(&ecryptfs_daemon_hash_mux);
 		goto wake_up;
 	}
-	tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns;
+	tsk_user_ns = __task_cred(msg_ctx->task)->user_ns;
 	ctx_euid = task_euid(msg_ctx->task);
 	rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
 	rcu_read_unlock();
diff --git a/ipc/namespace.c b/ipc/namespace.c
index ce0a647869b1..f362298c5ce4 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -46,7 +46,7 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
 	ipcns_notify(IPCNS_CREATED);
 	register_ipcns_notifier(ns);
 
-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
+	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
 
 	return ns;
 }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee8d49b9c309..24e0a5a94824 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -198,7 +198,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if (cred->user->user_ns == tcred->user->user_ns &&
+	if (cred->user_ns == tcred->user_ns &&
 	    (cred->uid == tcred->euid &&
 	     cred->uid == tcred->suid &&
 	     cred->uid == tcred->uid  &&
@@ -206,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	     cred->gid == tcred->sgid &&
 	     cred->gid == tcred->gid))
 		goto ok;
-	if (ptrace_has_cap(tcred->user->user_ns, mode))
+	if (ptrace_has_cap(tcred->user_ns, mode))
 		goto ok;
 	rcu_read_unlock();
 	return -EPERM;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..96bff855b866 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4042,7 +4042,7 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	if (cred->user->user_ns == pcred->user->user_ns)
+	if (cred->user_ns == pcred->user_ns)
 		match = (cred->euid == pcred->euid ||
 			 cred->euid == pcred->uid);
 	else
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..e2c5d84f2dac 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -767,14 +767,14 @@ static int kill_ok_by_cred(struct task_struct *t)
 	const struct cred *cred = current_cred();
 	const struct cred *tcred = __task_cred(t);
 
-	if (cred->user->user_ns == tcred->user->user_ns &&
+	if (cred->user_ns == tcred->user_ns &&
 	    (cred->euid == tcred->suid ||
 	     cred->euid == tcred->uid ||
 	     cred->uid  == tcred->suid ||
 	     cred->uid  == tcred->uid))
 		return 1;
 
-	if (ns_capable(tcred->user->user_ns, CAP_KILL))
+	if (ns_capable(tcred->user_ns, CAP_KILL))
 		return 1;
 
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index f7a43514ac65..82d8714bbede 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -133,11 +133,11 @@ static bool set_one_prio_perm(struct task_struct *p)
 {
 	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 
-	if (pcred->user->user_ns == cred->user->user_ns &&
+	if (pcred->user_ns == cred->user_ns &&
 	    (pcred->uid  == cred->euid ||
 	     pcred->euid == cred->euid))
 		return true;
-	if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+	if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
 		return true;
 	return false;
 }
@@ -1498,7 +1498,7 @@ static int check_prlimit_permission(struct task_struct *task)
 		return 0;
 
 	tcred = __task_cred(task);
-	if (cred->user->user_ns == tcred->user->user_ns &&
+	if (cred->user_ns == tcred->user_ns &&
 	    (cred->uid == tcred->euid &&
 	     cred->uid == tcred->suid &&
 	     cred->uid == tcred->uid  &&
@@ -1506,7 +1506,7 @@ static int check_prlimit_permission(struct task_struct *task)
 	     cred->gid == tcred->sgid &&
 	     cred->gid == tcred->gid))
 		return 0;
-	if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+	if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
 		return 0;
 
 	return -EPERM;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3b906e98b1db..f084083a0fd3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -90,7 +90,7 @@ uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t
 {
 	struct user_namespace *tmp;
 
-	if (likely(to == cred->user->user_ns))
+	if (likely(to == cred->user_ns))
 		return uid;
 
 
@@ -112,7 +112,7 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
 {
 	struct user_namespace *tmp;
 
-	if (likely(to == cred->user->user_ns))
+	if (likely(to == cred->user_ns))
 		return gid;
 
 	/* Is cred->user the creator of the target user_ns
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 405caf91aad5..679d97a5d3fd 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
+	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
 	up_read(&uts_sem);
 	return ns;
 }
diff --git a/security/commoncap.c b/security/commoncap.c
index 0cf4b53480a7..8b3e10e2eac7 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -81,7 +81,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 			return 0;
 
 		/* Do we have the necessary capabilities? */
-		if (targ_ns == cred->user->user_ns)
+		if (targ_ns == cred->user_ns)
 			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
 
 		/* Have we tried all of the parent namespaces? */
@@ -136,10 +136,10 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
 	rcu_read_lock();
 	cred = current_cred();
 	child_cred = __task_cred(child);
-	if (cred->user->user_ns == child_cred->user->user_ns &&
+	if (cred->user_ns == child_cred->user_ns &&
 	    cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
 		goto out;
-	if (ns_capable(child_cred->user->user_ns, CAP_SYS_PTRACE))
+	if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
 		goto out;
 	ret = -EPERM;
 out:
@@ -168,10 +168,10 @@ int cap_ptrace_traceme(struct task_struct *parent)
 	rcu_read_lock();
 	cred = __task_cred(parent);
 	child_cred = current_cred();
-	if (cred->user->user_ns == child_cred->user->user_ns &&
+	if (cred->user_ns == child_cred->user_ns &&
 	    cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
 		goto out;
-	if (has_ns_capability(parent, child_cred->user->user_ns, CAP_SYS_PTRACE))
+	if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
 		goto out;
 	ret = -EPERM;
 out:
@@ -214,7 +214,7 @@ static inline int cap_inh_is_capped(void)
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
 	 */
-	if (cap_capable(current_cred(), current_cred()->user->user_ns,
+	if (cap_capable(current_cred(), current_cred()->user_ns,
 			CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0)
 		return 0;
 	return 1;
@@ -866,7 +866,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
 		    || (cap_capable(current_cred(),
-				    current_cred()->user->user_ns, CAP_SETPCAP,
+				    current_cred()->user_ns, CAP_SETPCAP,
 				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
diff --git a/security/keys/key.c b/security/keys/key.c
index 06783cffb3af..7e6034793af3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -253,7 +253,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	quotalen = desclen + type->def_datalen;
 
 	/* get hold of the key tracking for this user */
-	user = key_user_lookup(uid, cred->user->user_ns);
+	user = key_user_lookup(uid, cred->user_ns);
 	if (!user)
 		goto no_memory_1;
 
diff --git a/security/keys/permission.c b/security/keys/permission.c
index c35b5229e3cd..e146cbd714bd 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -36,7 +36,7 @@ int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
 
 	key = key_ref_to_ptr(key_ref);
 
-	if (key->user->user_ns != cred->user->user_ns)
+	if (key->user->user_ns != cred->user_ns)
 		goto use_other_perms;
 
 	/* use the second 8-bits of permissions for keys the caller owns */
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index be7ecb2018dd..70febff06da9 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -858,7 +858,7 @@ void key_replace_session_keyring(void)
 	new-> sgid	= old-> sgid;
 	new->fsgid	= old->fsgid;
 	new->user	= get_uid(old->user);
-	new->user_ns	= new->user->user_ns;
+	new->user_ns	= new->user_ns;
 	new->group_info	= get_group_info(old->group_info);
 
 	new->securebits	= old->securebits;
-- 
cgit v1.2.3-59-g8ed1b


From 7a4e7408c5cadb240e068a662251754a562355e3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 14 Nov 2011 14:29:51 -0800
Subject: userns: Add kuid_t and kgid_t and associated infrastructure in
 uidgid.h

Start distinguishing between internal kernel uids and gids and
values that userspace can use.  This is done by introducing two
new types: kuid_t and kgid_t.  These types and their associated
functions are infrastructure are declared in the new header
uidgid.h.

Ultimately there will be a different implementation of the mapping
functions for use with user namespaces.  But to keep it simple
we introduce the mapping functions first to separate the meat
from the mechanical code conversions.

Export overflowuid and overflowgid so we can use from_kuid_munged
and from_kgid_munged in modular code.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/uidgid.h | 176 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c           |   2 -
 2 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/uidgid.h

(limited to 'kernel/sys.c')

diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
new file mode 100644
index 000000000000..a0addb8e5889
--- /dev/null
+++ b/include/linux/uidgid.h
@@ -0,0 +1,176 @@
+#ifndef _LINUX_UIDGID_H
+#define _LINUX_UIDGID_H
+
+/*
+ * A set of types for the internal kernel types representing uids and gids.
+ *
+ * The types defined in this header allow distinguishing which uids and gids in
+ * the kernel are values used by userspace and which uid and gid values are
+ * the internal kernel values.  With the addition of user namespaces the values
+ * can be different.  Using the type system makes it possible for the compiler
+ * to detect when we overlook these differences.
+ *
+ */
+#include <linux/types.h>
+#include <linux/highuid.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+#if defined(NOTYET)
+
+typedef struct {
+	uid_t val;
+} kuid_t;
+
+
+typedef struct {
+	gid_t val;
+} kgid_t;
+
+#define KUIDT_INIT(value) (kuid_t){ value }
+#define KGIDT_INIT(value) (kgid_t){ value }
+
+static inline uid_t __kuid_val(kuid_t uid)
+{
+	return uid.val;
+}
+
+static inline gid_t __kgid_val(kgid_t gid)
+{
+	return gid.val;
+}
+
+#else
+
+typedef uid_t kuid_t;
+typedef gid_t kgid_t;
+
+static inline uid_t __kuid_val(kuid_t uid)
+{
+	return uid;
+}
+
+static inline gid_t __kgid_val(kgid_t gid)
+{
+	return gid;
+}
+
+#define KUIDT_INIT(value) ((kuid_t) value )
+#define KGIDT_INIT(value) ((kgid_t) value )
+
+#endif
+
+#define GLOBAL_ROOT_UID KUIDT_INIT(0)
+#define GLOBAL_ROOT_GID KGIDT_INIT(0)
+
+#define INVALID_UID KUIDT_INIT(-1)
+#define INVALID_GID KGIDT_INIT(-1)
+
+static inline bool uid_eq(kuid_t left, kuid_t right)
+{
+	return __kuid_val(left) == __kuid_val(right);
+}
+
+static inline bool gid_eq(kgid_t left, kgid_t right)
+{
+	return __kgid_val(left) == __kgid_val(right);
+}
+
+static inline bool uid_gt(kuid_t left, kuid_t right)
+{
+	return __kuid_val(left) > __kuid_val(right);
+}
+
+static inline bool gid_gt(kgid_t left, kgid_t right)
+{
+	return __kgid_val(left) > __kgid_val(right);
+}
+
+static inline bool uid_gte(kuid_t left, kuid_t right)
+{
+	return __kuid_val(left) >= __kuid_val(right);
+}
+
+static inline bool gid_gte(kgid_t left, kgid_t right)
+{
+	return __kgid_val(left) >= __kgid_val(right);
+}
+
+static inline bool uid_lt(kuid_t left, kuid_t right)
+{
+	return __kuid_val(left) < __kuid_val(right);
+}
+
+static inline bool gid_lt(kgid_t left, kgid_t right)
+{
+	return __kgid_val(left) < __kgid_val(right);
+}
+
+static inline bool uid_lte(kuid_t left, kuid_t right)
+{
+	return __kuid_val(left) <= __kuid_val(right);
+}
+
+static inline bool gid_lte(kgid_t left, kgid_t right)
+{
+	return __kgid_val(left) <= __kgid_val(right);
+}
+
+static inline bool uid_valid(kuid_t uid)
+{
+	return !uid_eq(uid, INVALID_UID);
+}
+
+static inline bool gid_valid(kgid_t gid)
+{
+	return !gid_eq(gid, INVALID_GID);
+}
+
+static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
+{
+	return KUIDT_INIT(uid);
+}
+
+static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid)
+{
+	return KGIDT_INIT(gid);
+}
+
+static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid)
+{
+	return __kuid_val(kuid);
+}
+
+static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
+{
+	return __kgid_val(kgid);
+}
+
+static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
+{
+	uid_t uid = from_kuid(to, kuid);
+	if (uid == (uid_t)-1)
+		uid = overflowuid;
+	return uid;
+}
+
+static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid)
+{
+	gid_t gid = from_kgid(to, kgid);
+	if (gid == (gid_t)-1)
+		gid = overflowgid;
+	return gid;
+}
+
+static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
+{
+	return true;
+}
+
+static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
+{
+	return true;
+}
+
+#endif /* _LINUX_UIDGID_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 82d8714bbede..71852417cfc8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -93,10 +93,8 @@
 int overflowuid = DEFAULT_OVERFLOWUID;
 int overflowgid = DEFAULT_OVERFLOWGID;
 
-#ifdef CONFIG_UID16
 EXPORT_SYMBOL(overflowuid);
 EXPORT_SYMBOL(overflowgid);
-#endif
 
 /*
  * the same as above, but for filesystems which can only store a 16-bit
-- 
cgit v1.2.3-59-g8ed1b


From 7b44ab978b77a91b327058a0f4db7e6fcdb90b92 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 16 Nov 2011 23:20:58 -0800
Subject: userns: Disassociate user_struct from the user_namespace.

Modify alloc_uid to take a kuid and make the user hash table global.
Stop holding a reference to the user namespace in struct user_struct.

This simplifies the code and makes the per user accounting not
care about which user namespace a uid happens to appear in.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/ioprio.c                    | 18 ++++++++++++++----
 include/linux/sched.h          |  8 ++++----
 include/linux/user_namespace.h |  4 ----
 kernel/sys.c                   | 34 +++++++++++++++++++++++-----------
 kernel/user.c                  | 28 +++++++++++++---------------
 kernel/user_namespace.c        |  6 +-----
 6 files changed, 55 insertions(+), 43 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 0f1b9515213b..8e35e964d9ed 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -65,6 +65,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 	struct task_struct *p, *g;
 	struct user_struct *user;
 	struct pid *pgrp;
+	kuid_t uid;
 	int ret;
 
 	switch (class) {
@@ -110,16 +111,21 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case IOPRIO_WHO_USER:
+			uid = make_kuid(current_user_ns(), who);
+			if (!uid_valid(uid))
+				break;
 			if (!who)
 				user = current_user();
 			else
-				user = find_user(who);
+				user = find_user(uid);
 
 			if (!user)
 				break;
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid != who)
+				const struct cred *tcred = __task_cred(p);
+				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
+				if (!uid_eq(tcred_uid, uid))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -174,6 +180,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 	struct task_struct *g, *p;
 	struct user_struct *user;
 	struct pid *pgrp;
+	kuid_t uid;
 	int ret = -ESRCH;
 	int tmpio;
 
@@ -203,16 +210,19 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case IOPRIO_WHO_USER:
+			uid = make_kuid(current_user_ns(), who);
 			if (!who)
 				user = current_user();
 			else
-				user = find_user(who);
+				user = find_user(uid);
 
 			if (!user)
 				break;
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid != user->uid)
+				const struct cred *tcred = __task_cred(p);
+				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
+				if (!uid_eq(tcred_uid, user->uid))
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6867ae9bc8a0..5fdc1ebbcbc4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@ struct sched_param {
 #include <linux/latencytop.h>
 #include <linux/cred.h>
 #include <linux/llist.h>
+#include <linux/uidgid.h>
 
 #include <asm/processor.h>
 
@@ -728,8 +729,7 @@ struct user_struct {
 
 	/* Hash table maintenance information */
 	struct hlist_node uidhash_node;
-	uid_t uid;
-	struct user_namespace *_user_ns; /* Don't use will be removed soon */
+	kuid_t uid;
 
 #ifdef CONFIG_PERF_EVENTS
 	atomic_long_t locked_vm;
@@ -738,7 +738,7 @@ struct user_struct {
 
 extern int uids_sysfs_init(void);
 
-extern struct user_struct *find_user(uid_t);
+extern struct user_struct *find_user(kuid_t);
 
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
@@ -2177,7 +2177,7 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 extern void __set_special_pids(struct pid *pid);
 
 /* per-UID process charging. */
-extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
+extern struct user_struct * alloc_uid(kuid_t);
 static inline struct user_struct *get_uid(struct user_struct *u)
 {
 	atomic_inc(&u->__count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index dc2d85a76376..d767508db4f9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -6,12 +6,8 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 
-#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
-#define UIDHASH_SZ	(1 << UIDHASH_BITS)
-
 struct user_namespace {
 	struct kref		kref;
-	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_namespace	*parent;
 	struct user_struct	*creator;
 	struct work_struct	destroyer;
diff --git a/kernel/sys.c b/kernel/sys.c
index 71852417cfc8..f0c43b4b6657 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -175,6 +175,8 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 	const struct cred *cred = current_cred();
 	int error = -EINVAL;
 	struct pid *pgrp;
+	kuid_t cred_uid;
+	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
 		goto out;
@@ -207,18 +209,22 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
+			cred_uid = make_kuid(cred->user_ns, cred->uid);
+			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				who = cred->uid;
-			else if ((who != cred->uid) &&
-				 !(user = find_user(who)))
+				uid = cred_uid;
+			else if (!uid_eq(uid, cred_uid) &&
+				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid == who)
+				const struct cred *tcred = __task_cred(p);
+				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
+				if (uid_eq(tcred_uid, uid))
 					error = set_one_prio(p, niceval, error);
 			} while_each_thread(g, p);
-			if (who != cred->uid)
+			if (!uid_eq(uid, cred_uid))
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -242,6 +248,8 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 	const struct cred *cred = current_cred();
 	long niceval, retval = -ESRCH;
 	struct pid *pgrp;
+	kuid_t cred_uid;
+	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
 		return -EINVAL;
@@ -272,21 +280,25 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
+			cred_uid = make_kuid(cred->user_ns, cred->uid);
+			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				who = cred->uid;
-			else if ((who != cred->uid) &&
-				 !(user = find_user(who)))
+				uid = cred_uid;
+			else if (!uid_eq(uid, cred_uid) &&
+				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid == who) {
+				const struct cred *tcred = __task_cred(p);
+				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
+				if (uid_eq(tcred_uid, uid)) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
 			} while_each_thread(g, p);
-			if (who != cred->uid)
+			if (!uid_eq(uid, cred_uid))
 				free_uid(user);		/* for find_user() */
 			break;
 	}
@@ -629,7 +641,7 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current_user_ns(), new->uid);
+	new_user = alloc_uid(make_kuid(new->user_ns, new->uid));
 	if (!new_user)
 		return -EAGAIN;
 
diff --git a/kernel/user.c b/kernel/user.c
index d65fec0615a0..025077e54a7c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -34,11 +34,14 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  * when changing user ID's (ie setuid() and friends).
  */
 
+#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ	(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(ns, uid)	((ns)->uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))
 
 static struct kmem_cache *uid_cachep;
+struct hlist_head uidhash_table[UIDHASH_SZ];
 
 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -58,7 +61,7 @@ struct user_struct root_user = {
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
-	._user_ns	= &init_user_ns,
+	.uid		= GLOBAL_ROOT_UID,
 };
 
 /*
@@ -72,16 +75,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
 static void uid_hash_remove(struct user_struct *up)
 {
 	hlist_del_init(&up->uidhash_node);
-	put_user_ns(up->_user_ns); /* It is safe to free the uid hash table now */
 }
 
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
 {
 	struct user_struct *user;
 	struct hlist_node *h;
 
 	hlist_for_each_entry(user, h, hashent, uidhash_node) {
-		if (user->uid == uid) {
+		if (uid_eq(user->uid, uid)) {
 			atomic_inc(&user->__count);
 			return user;
 		}
@@ -110,14 +112,13 @@ static void free_user(struct user_struct *up, unsigned long flags)
  *
  * If the user_struct could not be found, return NULL.
  */
-struct user_struct *find_user(uid_t uid)
+struct user_struct *find_user(kuid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
-	struct user_namespace *ns = current_user_ns();
 
 	spin_lock_irqsave(&uidhash_lock, flags);
-	ret = uid_hash_find(uid, uidhashentry(ns, uid));
+	ret = uid_hash_find(uid, uidhashentry(uid));
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	return ret;
 }
@@ -136,9 +137,9 @@ void free_uid(struct user_struct *up)
 		local_irq_restore(flags);
 }
 
-struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
+struct user_struct *alloc_uid(kuid_t uid)
 {
-	struct hlist_head *hashent = uidhashentry(ns, uid);
+	struct hlist_head *hashent = uidhashentry(uid);
 	struct user_struct *up, *new;
 
 	spin_lock_irq(&uidhash_lock);
@@ -153,8 +154,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
 
-		new->_user_ns = get_user_ns(ns);
-
 		/*
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
@@ -162,7 +161,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			put_user_ns(ns);
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
@@ -187,11 +185,11 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
+		INIT_HLIST_HEAD(uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
-	uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
+	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
 	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e216e1e8ce84..898e973bd1e8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -27,7 +27,6 @@ int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
 	struct user_struct *root_user;
-	int n;
 
 	ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
 	if (!ns)
@@ -35,11 +34,8 @@ int create_user_ns(struct cred *new)
 
 	kref_init(&ns->kref);
 
-	for (n = 0; n < UIDHASH_SZ; ++n)
-		INIT_HLIST_HEAD(ns->uidhash_table + n);
-
 	/* Alloc new root user.  */
-	root_user = alloc_uid(ns, 0);
+	root_user = alloc_uid(make_kuid(ns, 0));
 	if (!root_user) {
 		kmem_cache_free(user_ns_cachep, ns);
 		return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From 259e5e6c75a910f3b5e656151dc602f53f9d7548 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 12 Apr 2012 16:47:50 -0500
Subject: Add PR_{GET,SET}_NO_NEW_PRIVS to prevent execve from granting privs

With this change, calling
  prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)
disables privilege granting operations at execve-time.  For example, a
process will not be able to execute a setuid binary to change their uid
or gid if this bit is set.  The same is true for file capabilities.

Additionally, LSM_UNSAFE_NO_NEW_PRIVS is defined to ensure that
LSMs respect the requested behavior.

To determine if the NO_NEW_PRIVS bit is set, a task may call
  prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
It returns 1 if set and 0 if it is not set. If any of the arguments are
non-zero, it will return -1 and set errno to -EINVAL.
(PR_SET_NO_NEW_PRIVS behaves similarly.)

This functionality is desired for the proposed seccomp filter patch
series.  By using PR_SET_NO_NEW_PRIVS, it allows a task to modify the
system call behavior for itself and its child tasks without being
able to impact the behavior of a more privileged task.

Another potential use is making certain privileged operations
unprivileged.  For example, chroot may be considered "safe" if it cannot
affect privileged tasks.

Note, this patch causes execve to fail when PR_SET_NO_NEW_PRIVS is
set and AppArmor is in use.  It is fixed in a subsequent patch.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: Eric Paris <eparis@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>

v18: updated change desc
v17: using new define values as per 3.4
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 fs/exec.c                  | 10 +++++++++-
 include/linux/prctl.h      | 15 +++++++++++++++
 include/linux/sched.h      |  2 ++
 include/linux/security.h   |  1 +
 kernel/sys.c               | 10 ++++++++++
 security/apparmor/domain.c |  4 ++++
 security/commoncap.c       |  7 +++++--
 security/selinux/hooks.c   | 10 +++++++++-
 8 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/fs/exec.c b/fs/exec.c
index b1fd2025e59a..d038968b54b4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1245,6 +1245,13 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
 			bprm->unsafe |= LSM_UNSAFE_PTRACE;
 	}
 
+	/*
+	 * This isn't strictly necessary, but it makes it harder for LSMs to
+	 * mess up.
+	 */
+	if (current->no_new_privs)
+		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
 	rcu_read_lock();
@@ -1288,7 +1295,8 @@ int prepare_binprm(struct linux_binprm *bprm)
 	bprm->cred->euid = current_euid();
 	bprm->cred->egid = current_egid();
 
-	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
+	    !current->no_new_privs) {
 		/* Set-uid? */
 		if (mode & S_ISUID) {
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index e0cfec2490aa..78b76e24cc7e 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -124,4 +124,19 @@
 #define PR_SET_CHILD_SUBREAPER 36
 #define PR_GET_CHILD_SUBREAPER 37
 
+/*
+ * If no_new_privs is set, then operations that grant new privileges (i.e.
+ * execve) will either fail or not grant them.  This affects suid/sgid,
+ * file capabilities, and LSMs.
+ *
+ * Operations that merely manipulate or drop existing privileges (setresuid,
+ * capset, etc.) will still work.  Drop those privileges if you want them gone.
+ *
+ * Changing LSM security domain is considered a new privilege.  So, for example,
+ * asking selinux for a specific new context (e.g. with runcon) will result
+ * in execve returning -EPERM.
+ */
+#define PR_SET_NO_NEW_PRIVS 38
+#define PR_GET_NO_NEW_PRIVS 39
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81a173c0897d..ba60897bb447 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1341,6 +1341,8 @@ struct task_struct {
 				 * execve */
 	unsigned in_iowait:1;
 
+	/* task may not gain privileges */
+	unsigned no_new_privs:1;
 
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
diff --git a/include/linux/security.h b/include/linux/security.h
index 673afbb8238a..6e1dea93907a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -144,6 +144,7 @@ struct request_sock;
 #define LSM_UNSAFE_SHARE	1
 #define LSM_UNSAFE_PTRACE	2
 #define LSM_UNSAFE_PTRACE_CAP	4
+#define LSM_UNSAFE_NO_NEW_PRIVS	8
 
 #ifdef CONFIG_MMU
 extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..b82568b7d201 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			error = put_user(me->signal->is_child_subreaper,
 					 (int __user *) arg2);
 			break;
+		case PR_SET_NO_NEW_PRIVS:
+			if (arg2 != 1 || arg3 || arg4 || arg5)
+				return -EINVAL;
+
+			current->no_new_privs = 1;
+			break;
+		case PR_GET_NO_NEW_PRIVS:
+			if (arg2 || arg3 || arg4 || arg5)
+				return -EINVAL;
+			return current->no_new_privs ? 1 : 0;
 		default:
 			error = -EINVAL;
 			break;
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 6327685c101e..18c88d06e881 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -360,6 +360,10 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
 	if (bprm->cred_prepared)
 		return 0;
 
+	/* XXX: no_new_privs is not usable with AppArmor yet */
+	if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
+		return -EPERM;
+
 	cxt = bprm->cred->security;
 	BUG_ON(!cxt);
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 0cf4b53480a7..edd3918fac02 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -506,14 +506,17 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 skip:
 
 	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
-	 * credentials unless they have the appropriate permit
+	 * credentials unless they have the appropriate permit.
+	 *
+	 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
 	 */
 	if ((new->euid != old->uid ||
 	     new->egid != old->gid ||
 	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
 	    bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 		/* downgrade; they get no more than they had, and maybe less */
-		if (!capable(CAP_SETUID)) {
+		if (!capable(CAP_SETUID) ||
+		    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
 			new->euid = new->uid;
 			new->egid = new->gid;
 		}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d85b793c9321..0b06685787b9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2016,6 +2016,13 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 		new_tsec->sid = old_tsec->exec_sid;
 		/* Reset exec SID on execve. */
 		new_tsec->exec_sid = 0;
+
+		/*
+		 * Minimize confusion: if no_new_privs and a transition is
+		 * explicitly requested, then fail the exec.
+		 */
+		if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
+			return -EPERM;
 	} else {
 		/* Check for a default transition on this program. */
 		rc = security_transition_sid(old_tsec->sid, isec->sid,
@@ -2029,7 +2036,8 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 	ad.selinux_audit_data = &sad;
 	ad.u.path = bprm->file->f_path;
 
-	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+	if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
+	    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS))
 		new_tsec->sid = old_tsec->sid;
 
 	if (new_tsec->sid == old_tsec->sid) {
-- 
cgit v1.2.3-59-g8ed1b


From e2cfabdfd075648216f99c2c03821cf3f47c1727 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Thu, 12 Apr 2012 16:47:57 -0500
Subject: seccomp: add system call filtering using BPF

[This patch depends on luto@mit.edu's no_new_privs patch:
   https://lkml.org/lkml/2012/1/30/264
 The whole series including Andrew's patches can be found here:
   https://github.com/redpig/linux/tree/seccomp
 Complete diff here:
   https://github.com/redpig/linux/compare/1dc65fed...seccomp
]

This patch adds support for seccomp mode 2.  Mode 2 introduces the
ability for unprivileged processes to install system call filtering
policy expressed in terms of a Berkeley Packet Filter (BPF) program.
This program will be evaluated in the kernel for each system call
the task makes and computes a result based on data in the format
of struct seccomp_data.

A filter program may be installed by calling:
  struct sock_fprog fprog = { ... };
  ...
  prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);

The return value of the filter program determines if the system call is
allowed to proceed or denied.  If the first filter program installed
allows prctl(2) calls, then the above call may be made repeatedly
by a task to further reduce its access to the kernel.  All attached
programs must be evaluated before a system call will be allowed to
proceed.

Filter programs will be inherited across fork/clone and execve.
However, if the task attaching the filter is unprivileged
(!CAP_SYS_ADMIN) the no_new_privs bit will be set on the task.  This
ensures that unprivileged tasks cannot attach filters that affect
privileged tasks (e.g., setuid binary).

There are a number of benefits to this approach. A few of which are
as follows:
- BPF has been exposed to userland for a long time
- BPF optimization (and JIT'ing) are well understood
- Userland already knows its ABI: system call numbers and desired
  arguments
- No time-of-check-time-of-use vulnerable data accesses are possible.
- system call arguments are loaded on access only to minimize copying
  required for system call policy decisions.

Mode 2 support is restricted to architectures that enable
HAVE_ARCH_SECCOMP_FILTER.  In this patch, the primary dependency is on
syscall_get_arguments().  The full desired scope of this feature will
add a few minor additional requirements expressed later in this series.
Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be
the desired additional functionality.

No architectures are enabled in this patch.

Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Indan Zupancic <indan@nul.nu>
Acked-by: Eric Paris <eparis@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>

v18: - rebase to v3.4-rc2
     - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org)
     - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu)
     - add a comment for get_u32 regarding endianness (akpm@)
     - fix other typos, style mistakes (akpm@)
     - added acked-by
v17: - properly guard seccomp filter needed headers (leann@ubuntu.com)
     - tighten return mask to 0x7fff0000
v16: - no change
v15: - add a 4 instr penalty when counting a path to account for seccomp_filter
       size (indan@nul.nu)
     - drop the max insns to 256KB (indan@nul.nu)
     - return ENOMEM if the max insns limit has been hit (indan@nul.nu)
     - move IP checks after args (indan@nul.nu)
     - drop !user_filter check (indan@nul.nu)
     - only allow explicit bpf codes (indan@nul.nu)
     - exit_code -> exit_sig
v14: - put/get_seccomp_filter takes struct task_struct
       (indan@nul.nu,keescook@chromium.org)
     - adds seccomp_chk_filter and drops general bpf_run/chk_filter user
     - add seccomp_bpf_load for use by net/core/filter.c
     - lower max per-process/per-hierarchy: 1MB
     - moved nnp/capability check prior to allocation
       (all of the above: indan@nul.nu)
v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com)
     - removed copy_seccomp (keescook@chromium.org,indan@nul.nu)
     - reworded the prctl_set_seccomp comment (indan@nul.nu)
v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com)
     - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu)
     - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu)
     - pare down Kconfig doc reference.
     - extra comment clean up
v10: - seccomp_data has changed again to be more aesthetically pleasing
       (hpa@zytor.com)
     - calling convention is noted in a new u32 field using syscall_get_arch.
       This allows for cross-calling convention tasks to use seccomp filters.
       (hpa@zytor.com)
     - lots of clean up (thanks, Indan!)
 v9: - n/a
 v8: - use bpf_chk_filter, bpf_run_filter. update load_fns
     - Lots of fixes courtesy of indan@nul.nu:
     -- fix up load behavior, compat fixups, and merge alloc code,
     -- renamed pc and dropped __packed, use bool compat.
     -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch
        dependencies
 v7:  (massive overhaul thanks to Indan, others)
     - added CONFIG_HAVE_ARCH_SECCOMP_FILTER
     - merged into seccomp.c
     - minimal seccomp_filter.h
     - no config option (part of seccomp)
     - no new prctl
     - doesn't break seccomp on systems without asm/syscall.h
       (works but arg access always fails)
     - dropped seccomp_init_task, extra free functions, ...
     - dropped the no-asm/syscall.h code paths
     - merges with network sk_run_filter and sk_chk_filter
 v6: - fix memory leak on attach compat check failure
     - require no_new_privs || CAP_SYS_ADMIN prior to filter
       installation. (luto@mit.edu)
     - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com)
     - cleaned up Kconfig (amwang@redhat.com)
     - on block, note if the call was compat (so the # means something)
 v5: - uses syscall_get_arguments
       (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org)
      - uses union-based arg storage with hi/lo struct to
        handle endianness.  Compromises between the two alternate
        proposals to minimize extra arg shuffling and account for
        endianness assuming userspace uses offsetof().
        (mcgrathr@chromium.org, indan@nul.nu)
      - update Kconfig description
      - add include/seccomp_filter.h and add its installation
      - (naive) on-demand syscall argument loading
      - drop seccomp_t (eparis@redhat.com)
 v4:  - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS
      - now uses current->no_new_privs
        (luto@mit.edu,torvalds@linux-foundation.com)
      - assign names to seccomp modes (rdunlap@xenotime.net)
      - fix style issues (rdunlap@xenotime.net)
      - reworded Kconfig entry (rdunlap@xenotime.net)
 v3:  - macros to inline (oleg@redhat.com)
      - init_task behavior fixed (oleg@redhat.com)
      - drop creator entry and extra NULL check (oleg@redhat.com)
      - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com)
      - adds tentative use of "always_unprivileged" as per
        torvalds@linux-foundation.org and luto@mit.edu
 v2:  - (patch 2 only)
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 arch/Kconfig            |  17 +++
 include/linux/Kbuild    |   1 +
 include/linux/seccomp.h |  76 +++++++++-
 kernel/fork.c           |   3 +
 kernel/seccomp.c        | 396 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sys.c            |   2 +-
 6 files changed, 472 insertions(+), 23 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..91c2c730fc1a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -216,4 +216,21 @@ config HAVE_CMPXCHG_DOUBLE
 config ARCH_WANT_OLD_COMPAT_IPC
 	bool
 
+config HAVE_ARCH_SECCOMP_FILTER
+	bool
+	help
+	  This symbol should be selected by an architecure if it provides
+	  asm/syscall.h, specifically syscall_get_arguments() and
+	  syscall_get_arch().
+
+config SECCOMP_FILTER
+	def_bool y
+	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
+	help
+	  Enable tasks to build secure computing environments defined
+	  in terms of Berkeley Packet Filter programs which implement
+	  task-defined system call filtering polices.
+
+	  See Documentation/prctl/seccomp_filter.txt for details.
+
 source "kernel/gcov/Kconfig"
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 3c9b616c834a..5c93d6c5d591 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -332,6 +332,7 @@ header-y += scc.h
 header-y += sched.h
 header-y += screen_info.h
 header-y += sdla.h
+header-y += seccomp.h
 header-y += securebits.h
 header-y += selinux_netlink.h
 header-y += sem.h
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index d61f27fcaa97..86bb68fc7683 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,14 +1,67 @@
 #ifndef _LINUX_SECCOMP_H
 #define _LINUX_SECCOMP_H
 
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+
+/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
+#define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
+
+/*
+ * All BPF programs must return a 32-bit value.
+ * The bottom 16-bits are reserved for future use.
+ * The upper 16-bits are ordered from least permissive values to most.
+ *
+ * The ordering ensures that a min_t() over composed return values always
+ * selects the least permissive choice.
+ */
+#define SECCOMP_RET_KILL	0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
+
+/* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION	0x7fff0000U
+#define SECCOMP_RET_DATA	0x0000ffffU
+
+/**
+ * struct seccomp_data - the format the BPF program executes over.
+ * @nr: the system call number
+ * @arch: indicates system call convention as an AUDIT_ARCH_* value
+ *        as defined in <linux/audit.h>.
+ * @instruction_pointer: at the time of the system call.
+ * @args: up to 6 system call arguments always stored as 64-bit values
+ *        regardless of the architecture.
+ */
+struct seccomp_data {
+	int nr;
+	__u32 arch;
+	__u64 instruction_pointer;
+	__u64 args[6];
+};
 
+#ifdef __KERNEL__
 #ifdef CONFIG_SECCOMP
 
 #include <linux/thread_info.h>
 #include <asm/seccomp.h>
 
+struct seccomp_filter;
+/**
+ * struct seccomp - the state of a seccomp'ed process
+ *
+ * @mode:  indicates one of the valid values above for controlled
+ *         system calls available to a process.
+ * @filter: The metadata and ruleset for determining what system calls
+ *          are allowed for a task.
+ *
+ *          @filter must only be accessed from the context of current as there
+ *          is no locking.
+ */
 struct seccomp {
 	int mode;
+	struct seccomp_filter *filter;
 };
 
 extern void __secure_computing(int);
@@ -19,7 +72,7 @@ static inline void secure_computing(int this_syscall)
 }
 
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+extern long prctl_set_seccomp(unsigned long, char __user *);
 
 static inline int seccomp_mode(struct seccomp *s)
 {
@@ -31,15 +84,16 @@ static inline int seccomp_mode(struct seccomp *s)
 #include <linux/errno.h>
 
 struct seccomp { };
+struct seccomp_filter { };
 
-#define secure_computing(x) do { } while (0)
+#define secure_computing(x) 0
 
 static inline long prctl_get_seccomp(void)
 {
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
 {
 	return -EINVAL;
 }
@@ -48,7 +102,21 @@ static inline int seccomp_mode(struct seccomp *s)
 {
 	return 0;
 }
-
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+extern void put_seccomp_filter(struct task_struct *tsk);
+extern void get_seccomp_filter(struct task_struct *tsk);
+extern u32 seccomp_bpf_load(int off);
+#else  /* CONFIG_SECCOMP_FILTER */
+static inline void put_seccomp_filter(struct task_struct *tsk)
+{
+	return;
+}
+static inline void get_seccomp_filter(struct task_struct *tsk)
+{
+	return;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+#endif /* __KERNEL__ */
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..f7cf6fb107ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -170,6 +171,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_seccomp_filter(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1162,6 +1164,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	ftrace_graph_init_task(p);
+	get_seccomp_filter(p);
 
 	rt_mutex_init_task(p);
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..0aeec1960f91 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,343 @@
  *
  * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
  *
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ *        of Berkeley Packet Filters/Linux Socket Filters.
  */
 
+#include <linux/atomic.h>
 #include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
 #include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
 
 /* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ *         get/put helpers should be used when accessing an instance
+ *         outside of a lifetime-guarded section.  In general, this
+ *         is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insns: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer.  For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory.  This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+	atomic_t usage;
+	struct seccomp_filter *prev;
+	unsigned short len;  /* Instruction count */
+	struct sock_filter insns[];
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+static void seccomp_filter_log_failure(int syscall)
+{
+	int compat = 0;
+#ifdef CONFIG_COMPAT
+	compat = is_compat_task();
+#endif
+	pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n",
+		current->comm, task_pid_nr(current),
+		(compat ? "compat " : ""),
+		syscall, KSTK_EIP(current));
+}
+
+/**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+ * @index: 0 or 1 to return the first or second 32-bits
+ *
+ * This inline exists to hide the length of unsigned long.  If a 32-bit
+ * unsigned long is passed in, it will be extended and the top 32-bits will be
+ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
+ * properly returned.
+ *
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static inline u32 get_u32(u64 data, int index)
+{
+	return ((u32 *)&data)[index];
+}
+
+/* Helper for bpf_load below. */
+#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
+/**
+ * bpf_load: checks and returns a pointer to the requested offset
+ * @off: offset into struct seccomp_data to load from
+ *
+ * Returns the requested 32-bits of data.
+ * seccomp_check_filter() should assure that @off is 32-bit aligned
+ * and not out of bounds.  Failure to do so is a BUG.
+ */
+u32 seccomp_bpf_load(int off)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	if (off == BPF_DATA(nr))
+		return syscall_get_nr(current, regs);
+	if (off == BPF_DATA(arch))
+		return syscall_get_arch(current, regs);
+	if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
+		unsigned long value;
+		int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
+		int index = !!(off % sizeof(u64));
+		syscall_get_arguments(current, regs, arg, 1, &value);
+		return get_u32(value, index);
+	}
+	if (off == BPF_DATA(instruction_pointer))
+		return get_u32(KSTK_EIP(current), 0);
+	if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
+		return get_u32(KSTK_EIP(current), 1);
+	/* seccomp_check_filter should make this impossible. */
+	BUG();
+}
+
+/**
+ *	seccomp_check_filter - verify seccomp filter code
+ *	@filter: filter to verify
+ *	@flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load.  It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+	int pc;
+	for (pc = 0; pc < flen; pc++) {
+		struct sock_filter *ftest = &filter[pc];
+		u16 code = ftest->code;
+		u32 k = ftest->k;
+
+		switch (code) {
+		case BPF_S_LD_W_ABS:
+			ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+			/* 32-bit aligned and not out of bounds. */
+			if (k >= sizeof(struct seccomp_data) || k & 3)
+				return -EINVAL;
+			continue;
+		case BPF_S_LD_W_LEN:
+			ftest->code = BPF_S_LD_IMM;
+			ftest->k = sizeof(struct seccomp_data);
+			continue;
+		case BPF_S_LDX_W_LEN:
+			ftest->code = BPF_S_LDX_IMM;
+			ftest->k = sizeof(struct seccomp_data);
+			continue;
+		/* Explicitly include allowed calls. */
+		case BPF_S_RET_K:
+		case BPF_S_RET_A:
+		case BPF_S_ALU_ADD_K:
+		case BPF_S_ALU_ADD_X:
+		case BPF_S_ALU_SUB_K:
+		case BPF_S_ALU_SUB_X:
+		case BPF_S_ALU_MUL_K:
+		case BPF_S_ALU_MUL_X:
+		case BPF_S_ALU_DIV_X:
+		case BPF_S_ALU_AND_K:
+		case BPF_S_ALU_AND_X:
+		case BPF_S_ALU_OR_K:
+		case BPF_S_ALU_OR_X:
+		case BPF_S_ALU_LSH_K:
+		case BPF_S_ALU_LSH_X:
+		case BPF_S_ALU_RSH_K:
+		case BPF_S_ALU_RSH_X:
+		case BPF_S_ALU_NEG:
+		case BPF_S_LD_IMM:
+		case BPF_S_LDX_IMM:
+		case BPF_S_MISC_TAX:
+		case BPF_S_MISC_TXA:
+		case BPF_S_ALU_DIV_K:
+		case BPF_S_LD_MEM:
+		case BPF_S_LDX_MEM:
+		case BPF_S_ST:
+		case BPF_S_STX:
+		case BPF_S_JMP_JA:
+		case BPF_S_JMP_JEQ_K:
+		case BPF_S_JMP_JEQ_X:
+		case BPF_S_JMP_JGE_K:
+		case BPF_S_JMP_JGE_X:
+		case BPF_S_JMP_JGT_K:
+		case BPF_S_JMP_JGT_X:
+		case BPF_S_JMP_JSET_K:
+		case BPF_S_JMP_JSET_X:
+			continue;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+	struct seccomp_filter *f;
+	u32 ret = SECCOMP_RET_KILL;
+	/*
+	 * All filters in the list are evaluated and the lowest BPF return
+	 * value always takes priority.
+	 */
+	for (f = current->seccomp.filter; f; f = f->prev) {
+		ret = sk_run_filter(NULL, f->insns);
+		if (ret != SECCOMP_RET_ALLOW)
+			break;
+	}
+	return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+	struct seccomp_filter *filter;
+	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+	unsigned long total_insns = fprog->len;
+	long ret;
+
+	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+		return -EINVAL;
+
+	for (filter = current->seccomp.filter; filter; filter = filter->prev)
+		total_insns += filter->len + 4;  /* include a 4 instr penalty */
+	if (total_insns > MAX_INSNS_PER_PATH)
+		return -ENOMEM;
+
+	/*
+	 * Installing a seccomp filter requires that the task have
+	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+	 * This avoids scenarios where unprivileged tasks can affect the
+	 * behavior of privileged children.
+	 */
+	if (!current->no_new_privs &&
+	    security_capable_noaudit(current_cred(), current_user_ns(),
+				     CAP_SYS_ADMIN) != 0)
+		return -EACCES;
+
+	/* Allocate a new seccomp_filter */
+	filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
+			 GFP_KERNEL|__GFP_NOWARN);
+	if (!filter)
+		return -ENOMEM;
+	atomic_set(&filter->usage, 1);
+	filter->len = fprog->len;
+
+	/* Copy the instructions from fprog. */
+	ret = -EFAULT;
+	if (copy_from_user(filter->insns, fprog->filter, fp_size))
+		goto fail;
+
+	/* Check and rewrite the fprog via the skb checker */
+	ret = sk_chk_filter(filter->insns, filter->len);
+	if (ret)
+		goto fail;
+
+	/* Check and rewrite the fprog for seccomp use */
+	ret = seccomp_check_filter(filter->insns, filter->len);
+	if (ret)
+		goto fail;
+
+	/*
+	 * If there is an existing filter, make it the prev and don't drop its
+	 * task reference.
+	 */
+	filter->prev = current->seccomp.filter;
+	current->seccomp.filter = filter;
+	return 0;
+fail:
+	kfree(filter);
+	return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+long seccomp_attach_user_filter(char __user *user_filter)
+{
+	struct sock_fprog fprog;
+	long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+	if (is_compat_task()) {
+		struct compat_sock_fprog fprog32;
+		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+			goto out;
+		fprog.len = fprog32.len;
+		fprog.filter = compat_ptr(fprog32.filter);
+	} else /* falls through to the if below. */
+#endif
+	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+		goto out;
+	ret = seccomp_attach_filter(&fprog);
+out:
+	return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+	struct seccomp_filter *orig = tsk->seccomp.filter;
+	if (!orig)
+		return;
+	/* Reference count is bounded by the number of total processes. */
+	atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+	struct seccomp_filter *orig = tsk->seccomp.filter;
+	/* Clean up single-reference branches iteratively. */
+	while (orig && atomic_dec_and_test(&orig->usage)) {
+		struct seccomp_filter *freeme = orig;
+		orig = orig->prev;
+		kfree(freeme);
+	}
+}
+#endif	/* CONFIG_SECCOMP_FILTER */
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -34,10 +361,11 @@ static int mode1_syscalls_32[] = {
 void __secure_computing(int this_syscall)
 {
 	int mode = current->seccomp.mode;
-	int * syscall;
+	int exit_sig = 0;
+	int *syscall;
 
 	switch (mode) {
-	case 1:
+	case SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task())
@@ -47,7 +375,16 @@ void __secure_computing(int this_syscall)
 			if (*syscall == this_syscall)
 				return;
 		} while (*++syscall);
+		exit_sig = SIGKILL;
 		break;
+#ifdef CONFIG_SECCOMP_FILTER
+	case SECCOMP_MODE_FILTER:
+		if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
+			return;
+		seccomp_filter_log_failure(this_syscall);
+		exit_sig = SIGSYS;
+		break;
+#endif
 	default:
 		BUG();
 	}
@@ -56,7 +393,7 @@ void __secure_computing(int this_syscall)
 	dump_stack();
 #endif
 	audit_seccomp(this_syscall);
-	do_exit(SIGKILL);
+	do_exit(exit_sig);
 }
 
 long prctl_get_seccomp(void)
@@ -64,25 +401,48 @@ long prctl_get_seccomp(void)
 	return current->seccomp.mode;
 }
 
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters.  Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 {
-	long ret;
+	long ret = -EINVAL;
 
-	/* can set it only once to be even more secure */
-	ret = -EPERM;
-	if (unlikely(current->seccomp.mode))
+	if (current->seccomp.mode &&
+	    current->seccomp.mode != seccomp_mode)
 		goto out;
 
-	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	switch (seccomp_mode) {
+	case SECCOMP_MODE_STRICT:
+		ret = 0;
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
+		break;
+#ifdef CONFIG_SECCOMP_FILTER
+	case SECCOMP_MODE_FILTER:
+		ret = seccomp_attach_user_filter(filter);
+		if (ret)
+			goto out;
+		break;
+#endif
+	default:
+		goto out;
 	}
 
- out:
+	current->seccomp.mode = seccomp_mode;
+	set_thread_flag(TIF_SECCOMP);
+out:
 	return ret;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index b82568b7d201..ba0ae8eea6fb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
-			error = prctl_set_seccomp(arg2);
+			error = prctl_set_seccomp(arg2, (char __user *)arg3);
 			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
-- 
cgit v1.2.3-59-g8ed1b


From 078de5f706ece36afd73bb4b8283314132d2dfdf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 8 Feb 2012 07:00:08 -0800
Subject: userns: Store uid and gid values in struct cred with kuid_t and
 kgid_t types

cred.h and a few trivial users of struct cred are changed.  The rest of the users
of struct cred are left for other patches as there are too many changes to make
in one go and leave the change reviewable.  If the user namespace is disabled and
CONFIG_UIDGID_STRICT_TYPE_CHECKS are disabled the code will contiue to compile
and behave correctly.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/mm/fault.c            |  2 +-
 fs/ioprio.c                    |  8 ++------
 include/linux/cred.h           | 16 ++++++++--------
 include/linux/user_namespace.h |  8 ++++----
 kernel/cred.c                  | 36 ++++++++++++++++++++++--------------
 kernel/signal.c                | 14 ++++++++------
 kernel/sys.c                   | 26 +++++++++-----------------
 kernel/user_namespace.c        |  4 ++--
 mm/oom_kill.c                  |  4 ++--
 security/commoncap.c           |  3 +--
 10 files changed, 59 insertions(+), 62 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3ecfd1aaf214..76dcd9d8e0bc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -582,7 +582,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pte_t *pte = lookup_address(address, &level);
 
 		if (pte && pte_present(*pte) && !pte_exec(*pte))
-			printk(nx_warning, current_uid());
+			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 	}
 
 	printk(KERN_ALERT "BUG: unable to handle kernel ");
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 8e35e964d9ed..2072e41785d2 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -123,9 +123,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (!uid_eq(tcred_uid, uid))
+				if (!uid_eq(task_uid(p), uid))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -220,9 +218,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (!uid_eq(tcred_uid, user->uid))
+				if (!uid_eq(task_uid(p), user->uid))
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 0ab3cda4a774..fac0579258fc 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -123,14 +123,14 @@ struct cred {
 #define CRED_MAGIC	0x43736564
 #define CRED_MAGIC_DEAD	0x44656144
 #endif
-	uid_t		uid;		/* real UID of the task */
-	gid_t		gid;		/* real GID of the task */
-	uid_t		suid;		/* saved UID of the task */
-	gid_t		sgid;		/* saved GID of the task */
-	uid_t		euid;		/* effective UID of the task */
-	gid_t		egid;		/* effective GID of the task */
-	uid_t		fsuid;		/* UID for VFS ops */
-	gid_t		fsgid;		/* GID for VFS ops */
+	kuid_t		uid;		/* real UID of the task */
+	kgid_t		gid;		/* real GID of the task */
+	kuid_t		suid;		/* saved UID of the task */
+	kgid_t		sgid;		/* saved GID of the task */
+	kuid_t		euid;		/* effective UID of the task */
+	kgid_t		egid;		/* effective GID of the task */
+	kuid_t		fsuid;		/* UID for VFS ops */
+	kgid_t		fsgid;		/* GID for VFS ops */
 	unsigned	securebits;	/* SUID-less security management */
 	kernel_cap_t	cap_inheritable; /* caps our children can inherit */
 	kernel_cap_t	cap_permitted;	/* caps we're permitted */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4c9846d90741..a2c61457cba1 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -70,15 +70,15 @@ static inline void put_user_ns(struct user_namespace *ns)
 #endif
 
 static inline uid_t user_ns_map_uid(struct user_namespace *to,
-	const struct cred *cred, uid_t uid)
+	const struct cred *cred, kuid_t uid)
 {
-	return from_kuid_munged(to, make_kuid(cred->user_ns, uid));
+	return from_kuid_munged(to, uid);
 }
 
 static inline gid_t user_ns_map_gid(struct user_namespace *to,
-	const struct cred *cred, gid_t gid)
+	const struct cred *cred, kgid_t gid)
 {
-	return from_kgid_munged(to, make_kgid(cred->user_ns, gid));
+	return from_kgid_munged(to, gid);
 }
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 7a0d80669886..eddc5e2e9587 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,6 +49,14 @@ struct cred init_cred = {
 	.subscribers		= ATOMIC_INIT(2),
 	.magic			= CRED_MAGIC,
 #endif
+	.uid			= GLOBAL_ROOT_UID,
+	.gid			= GLOBAL_ROOT_GID,
+	.suid			= GLOBAL_ROOT_UID,
+	.sgid			= GLOBAL_ROOT_GID,
+	.euid			= GLOBAL_ROOT_UID,
+	.egid			= GLOBAL_ROOT_GID,
+	.fsuid			= GLOBAL_ROOT_UID,
+	.fsgid			= GLOBAL_ROOT_GID,
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_EMPTY_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -488,10 +496,10 @@ int commit_creds(struct cred *new)
 	get_cred(new); /* we will require a ref for the subj creds too */
 
 	/* dumpability changes */
-	if (old->euid != new->euid ||
-	    old->egid != new->egid ||
-	    old->fsuid != new->fsuid ||
-	    old->fsgid != new->fsgid ||
+	if (!uid_eq(old->euid, new->euid) ||
+	    !gid_eq(old->egid, new->egid) ||
+	    !uid_eq(old->fsuid, new->fsuid) ||
+	    !gid_eq(old->fsgid, new->fsgid) ||
 	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
 		if (task->mm)
 			set_dumpable(task->mm, suid_dumpable);
@@ -500,9 +508,9 @@ int commit_creds(struct cred *new)
 	}
 
 	/* alter the thread keyring */
-	if (new->fsuid != old->fsuid)
+	if (!uid_eq(new->fsuid, old->fsuid))
 		key_fsuid_changed(task);
-	if (new->fsgid != old->fsgid)
+	if (!gid_eq(new->fsgid, old->fsgid))
 		key_fsgid_changed(task);
 
 	/* do it
@@ -519,16 +527,16 @@ int commit_creds(struct cred *new)
 	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
-	if (new->uid   != old->uid  ||
-	    new->euid  != old->euid ||
-	    new->suid  != old->suid ||
-	    new->fsuid != old->fsuid)
+	if (!uid_eq(new->uid,   old->uid)  ||
+	    !uid_eq(new->euid,  old->euid) ||
+	    !uid_eq(new->suid,  old->suid) ||
+	    !uid_eq(new->fsuid, old->fsuid))
 		proc_id_connector(task, PROC_EVENT_UID);
 
-	if (new->gid   != old->gid  ||
-	    new->egid  != old->egid ||
-	    new->sgid  != old->sgid ||
-	    new->fsgid != old->fsgid)
+	if (!gid_eq(new->gid,   old->gid)  ||
+	    !gid_eq(new->egid,  old->egid) ||
+	    !gid_eq(new->sgid,  old->sgid) ||
+	    !gid_eq(new->fsgid, old->fsgid))
 		proc_id_connector(task, PROC_EVENT_GID);
 
 	/* release the old obj and subj refs both */
diff --git a/kernel/signal.c b/kernel/signal.c
index e2c5d84f2dac..2734dc965f69 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1038,8 +1038,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
 	if (SI_FROMKERNEL(info))
 		return;
 
-	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
-					current_cred(), info->si_uid);
+	rcu_read_lock();
+	info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+					make_kuid(current_user_ns(), info->si_uid));
+	rcu_read_unlock();
 }
 #else
 static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1106,7 +1108,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
-			q->info.si_uid = current_uid();
+			q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1973,7 +1975,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 	info.si_signo = signr;
 	info.si_code = exit_code;
 	info.si_pid = task_pid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
 	ptrace_stop(exit_code, why, 1, &info);
@@ -2828,7 +2830,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 	info.si_errno = 0;
 	info.si_code = SI_USER;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	return kill_something_info(sig, &info, pid);
 }
@@ -2871,7 +2873,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	info.si_errno = 0;
 	info.si_code = SI_TKILL;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	return do_send_specific(tgid, pid, sig, &info);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index f0c43b4b6657..39962818c008 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -175,7 +175,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 	const struct cred *cred = current_cred();
 	int error = -EINVAL;
 	struct pid *pgrp;
-	kuid_t cred_uid;
 	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
@@ -209,22 +208,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			cred_uid = make_kuid(cred->user_ns, cred->uid);
 			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				uid = cred_uid;
-			else if (!uid_eq(uid, cred_uid) &&
+				uid = cred->uid;
+			else if (!uid_eq(uid, cred->uid) &&
 				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (uid_eq(tcred_uid, uid))
+				if (uid_eq(task_uid(p), uid))
 					error = set_one_prio(p, niceval, error);
 			} while_each_thread(g, p);
-			if (!uid_eq(uid, cred_uid))
+			if (!uid_eq(uid, cred->uid))
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -248,7 +244,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 	const struct cred *cred = current_cred();
 	long niceval, retval = -ESRCH;
 	struct pid *pgrp;
-	kuid_t cred_uid;
 	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
@@ -280,25 +275,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			cred_uid = make_kuid(cred->user_ns, cred->uid);
 			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				uid = cred_uid;
-			else if (!uid_eq(uid, cred_uid) &&
+				uid = cred->uid;
+			else if (!uid_eq(uid, cred->uid) &&
 				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (uid_eq(tcred_uid, uid)) {
+				if (uid_eq(task_uid(p), uid)) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
 			} while_each_thread(g, p);
-			if (!uid_eq(uid, cred_uid))
+			if (!uid_eq(uid, cred->uid))
 				free_uid(user);		/* for find_user() */
 			break;
 	}
@@ -641,7 +633,7 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(make_kuid(new->user_ns, new->uid));
+	new_user = alloc_uid(new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7eff867bfac5..86602316422d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -36,8 +36,8 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
-	kuid_t owner = make_kuid(new->user_ns, new->euid);
-	kgid_t group = make_kgid(new->user_ns, new->egid);
+	kuid_t owner = new->euid;
+	kgid_t group = new->egid;
 
 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..9f09a1fde9f9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -410,8 +410,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
 		}
 
 		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n",
-			task->pid, task_uid(task), task->tgid,
-			task->mm->total_vm, get_mm_rss(task->mm),
+			task->pid, from_kuid(&init_user_ns, task_uid(task)),
+			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			task_cpu(task), task->signal->oom_adj,
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
diff --git a/security/commoncap.c b/security/commoncap.c
index f2399d8afbe0..dbd465a59286 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -77,8 +77,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 {
 	for (;;) {
 		/* The owner of the user namespace has all caps. */
-		if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner,
-						       make_kuid(cred->user_ns, cred->euid)))
+		if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid))
 			return 0;
 
 		/* Do we have the necessary capabilities? */
-- 
cgit v1.2.3-59-g8ed1b


From a29c33f4e506e1dae7e0985b6328046535becbf8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 7 Feb 2012 18:51:01 -0800
Subject: userns: Convert setting and getting uid and gid system calls to use
 kuid and kgid

Convert setregid, setgid, setreuid, setuid,
setresuid, getresuid, setresgid, getresgid, setfsuid, setfsgid,
getuid, geteuid, getgid, getegid,
waitpid, waitid, wait4.

Convert userspace uids and gids into kuids and kgids before
being placed on struct cred.  Convert struct cred kuids and
kgids into userspace uids and gids when returning them.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/exit.c  |   6 +-
 kernel/sys.c   | 216 +++++++++++++++++++++++++++++++++++++++------------------
 kernel/timer.c |   8 +--
 kernel/uid16.c |  34 +++++----
 4 files changed, 178 insertions(+), 86 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa7..789e3c5777f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
-	uid_t uid = __task_cred(p)->uid;
+	uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
 	struct siginfo __user *infop;
 
 	if (!likely(wo->wo_flags & WEXITED))
@@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo,
 	if (!unlikely(wo->wo_flags & WNOWAIT))
 		*p_code = 0;
 
-	uid = task_uid(p);
+	uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
@@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	}
 	if (!unlikely(wo->wo_flags & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
-	uid = task_uid(p);
+	uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = task_pid_vnr(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 39962818c008..aff09f208eb3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -555,9 +555,19 @@ void ctrl_alt_del(void)
  */
 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kgid_t krgid, kegid;
+
+	krgid = make_kgid(ns, rgid);
+	kegid = make_kgid(ns, egid);
+
+	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+		return -EINVAL;
+	if ((egid != (gid_t) -1) && !gid_valid(kegid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -566,25 +576,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 
 	retval = -EPERM;
 	if (rgid != (gid_t) -1) {
-		if (old->gid == rgid ||
-		    old->egid == rgid ||
+		if (gid_eq(old->gid, krgid) ||
+		    gid_eq(old->egid, krgid) ||
 		    nsown_capable(CAP_SETGID))
-			new->gid = rgid;
+			new->gid = krgid;
 		else
 			goto error;
 	}
 	if (egid != (gid_t) -1) {
-		if (old->gid == egid ||
-		    old->egid == egid ||
-		    old->sgid == egid ||
+		if (gid_eq(old->gid, kegid) ||
+		    gid_eq(old->egid, kegid) ||
+		    gid_eq(old->sgid, kegid) ||
 		    nsown_capable(CAP_SETGID))
-			new->egid = egid;
+			new->egid = kegid;
 		else
 			goto error;
 	}
 
 	if (rgid != (gid_t) -1 ||
-	    (egid != (gid_t) -1 && egid != old->gid))
+	    (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
 		new->sgid = new->egid;
 	new->fsgid = new->egid;
 
@@ -602,9 +612,15 @@ error:
  */
 SYSCALL_DEFINE1(setgid, gid_t, gid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kgid_t kgid;
+
+	kgid = make_kgid(ns, gid);
+	if (!gid_valid(kgid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -613,9 +629,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
 
 	retval = -EPERM;
 	if (nsown_capable(CAP_SETGID))
-		new->gid = new->egid = new->sgid = new->fsgid = gid;
-	else if (gid == old->gid || gid == old->sgid)
-		new->egid = new->fsgid = gid;
+		new->gid = new->egid = new->sgid = new->fsgid = kgid;
+	else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
+		new->egid = new->fsgid = kgid;
 	else
 		goto error;
 
@@ -672,9 +688,19 @@ static int set_user(struct cred *new)
  */
 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kuid_t kruid, keuid;
+
+	kruid = make_kuid(ns, ruid);
+	keuid = make_kuid(ns, euid);
+
+	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+		return -EINVAL;
+	if ((euid != (uid_t) -1) && !uid_valid(keuid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -683,29 +709,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 
 	retval = -EPERM;
 	if (ruid != (uid_t) -1) {
-		new->uid = ruid;
-		if (old->uid != ruid &&
-		    old->euid != ruid &&
+		new->uid = kruid;
+		if (!uid_eq(old->uid, kruid) &&
+		    !uid_eq(old->euid, kruid) &&
 		    !nsown_capable(CAP_SETUID))
 			goto error;
 	}
 
 	if (euid != (uid_t) -1) {
-		new->euid = euid;
-		if (old->uid != euid &&
-		    old->euid != euid &&
-		    old->suid != euid &&
+		new->euid = keuid;
+		if (!uid_eq(old->uid, keuid) &&
+		    !uid_eq(old->euid, keuid) &&
+		    !uid_eq(old->suid, keuid) &&
 		    !nsown_capable(CAP_SETUID))
 			goto error;
 	}
 
-	if (new->uid != old->uid) {
+	if (!uid_eq(new->uid, old->uid)) {
 		retval = set_user(new);
 		if (retval < 0)
 			goto error;
 	}
 	if (ruid != (uid_t) -1 ||
-	    (euid != (uid_t) -1 && euid != old->uid))
+	    (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
 		new->suid = new->euid;
 	new->fsuid = new->euid;
 
@@ -733,9 +759,15 @@ error:
  */
 SYSCALL_DEFINE1(setuid, uid_t, uid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kuid_t kuid;
+
+	kuid = make_kuid(ns, uid);
+	if (!uid_valid(kuid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -744,17 +776,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 
 	retval = -EPERM;
 	if (nsown_capable(CAP_SETUID)) {
-		new->suid = new->uid = uid;
-		if (uid != old->uid) {
+		new->suid = new->uid = kuid;
+		if (!uid_eq(kuid, old->uid)) {
 			retval = set_user(new);
 			if (retval < 0)
 				goto error;
 		}
-	} else if (uid != old->uid && uid != new->suid) {
+	} else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
 		goto error;
 	}
 
-	new->fsuid = new->euid = uid;
+	new->fsuid = new->euid = kuid;
 
 	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
 	if (retval < 0)
@@ -774,9 +806,24 @@ error:
  */
 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kuid_t kruid, keuid, ksuid;
+
+	kruid = make_kuid(ns, ruid);
+	keuid = make_kuid(ns, euid);
+	ksuid = make_kuid(ns, suid);
+
+	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+		return -EINVAL;
+
+	if ((euid != (uid_t) -1) && !uid_valid(keuid))
+		return -EINVAL;
+
+	if ((suid != (uid_t) -1) && !uid_valid(ksuid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -786,29 +833,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 
 	retval = -EPERM;
 	if (!nsown_capable(CAP_SETUID)) {
-		if (ruid != (uid_t) -1 && ruid != old->uid &&
-		    ruid != old->euid  && ruid != old->suid)
+		if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
+		    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
 			goto error;
-		if (euid != (uid_t) -1 && euid != old->uid &&
-		    euid != old->euid  && euid != old->suid)
+		if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
+		    !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
 			goto error;
-		if (suid != (uid_t) -1 && suid != old->uid &&
-		    suid != old->euid  && suid != old->suid)
+		if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
+		    !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
 			goto error;
 	}
 
 	if (ruid != (uid_t) -1) {
-		new->uid = ruid;
-		if (ruid != old->uid) {
+		new->uid = kruid;
+		if (!uid_eq(kruid, old->uid)) {
 			retval = set_user(new);
 			if (retval < 0)
 				goto error;
 		}
 	}
 	if (euid != (uid_t) -1)
-		new->euid = euid;
+		new->euid = keuid;
 	if (suid != (uid_t) -1)
-		new->suid = suid;
+		new->suid = ksuid;
 	new->fsuid = new->euid;
 
 	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -822,14 +869,19 @@ error:
 	return retval;
 }
 
-SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
 {
 	const struct cred *cred = current_cred();
 	int retval;
+	uid_t ruid, euid, suid;
+
+	ruid = from_kuid_munged(cred->user_ns, cred->uid);
+	euid = from_kuid_munged(cred->user_ns, cred->euid);
+	suid = from_kuid_munged(cred->user_ns, cred->suid);
 
-	if (!(retval   = put_user(cred->uid,  ruid)) &&
-	    !(retval   = put_user(cred->euid, euid)))
-		retval = put_user(cred->suid, suid);
+	if (!(retval   = put_user(ruid, ruidp)) &&
+	    !(retval   = put_user(euid, euidp)))
+		retval = put_user(suid, suidp);
 
 	return retval;
 }
@@ -839,9 +891,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
  */
 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 {
+	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
 	struct cred *new;
 	int retval;
+	kgid_t krgid, kegid, ksgid;
+
+	krgid = make_kgid(ns, rgid);
+	kegid = make_kgid(ns, egid);
+	ksgid = make_kgid(ns, sgid);
+
+	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+		return -EINVAL;
+	if ((egid != (gid_t) -1) && !gid_valid(kegid))
+		return -EINVAL;
+	if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
+		return -EINVAL;
 
 	new = prepare_creds();
 	if (!new)
@@ -850,23 +915,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 
 	retval = -EPERM;
 	if (!nsown_capable(CAP_SETGID)) {
-		if (rgid != (gid_t) -1 && rgid != old->gid &&
-		    rgid != old->egid  && rgid != old->sgid)
+		if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
+		    !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
 			goto error;
-		if (egid != (gid_t) -1 && egid != old->gid &&
-		    egid != old->egid  && egid != old->sgid)
+		if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
+		    !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
 			goto error;
-		if (sgid != (gid_t) -1 && sgid != old->gid &&
-		    sgid != old->egid  && sgid != old->sgid)
+		if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
+		    !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
 			goto error;
 	}
 
 	if (rgid != (gid_t) -1)
-		new->gid = rgid;
+		new->gid = krgid;
 	if (egid != (gid_t) -1)
-		new->egid = egid;
+		new->egid = kegid;
 	if (sgid != (gid_t) -1)
-		new->sgid = sgid;
+		new->sgid = ksgid;
 	new->fsgid = new->egid;
 
 	return commit_creds(new);
@@ -876,14 +941,19 @@ error:
 	return retval;
 }
 
-SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
 {
 	const struct cred *cred = current_cred();
 	int retval;
+	gid_t rgid, egid, sgid;
+
+	rgid = from_kgid_munged(cred->user_ns, cred->gid);
+	egid = from_kgid_munged(cred->user_ns, cred->egid);
+	sgid = from_kgid_munged(cred->user_ns, cred->sgid);
 
-	if (!(retval   = put_user(cred->gid,  rgid)) &&
-	    !(retval   = put_user(cred->egid, egid)))
-		retval = put_user(cred->sgid, sgid);
+	if (!(retval   = put_user(rgid, rgidp)) &&
+	    !(retval   = put_user(egid, egidp)))
+		retval = put_user(sgid, sgidp);
 
 	return retval;
 }
@@ -900,18 +970,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 	const struct cred *old;
 	struct cred *new;
 	uid_t old_fsuid;
+	kuid_t kuid;
+
+	old = current_cred();
+	old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
+
+	kuid = make_kuid(old->user_ns, uid);
+	if (!uid_valid(kuid))
+		return old_fsuid;
 
 	new = prepare_creds();
 	if (!new)
-		return current_fsuid();
-	old = current_cred();
-	old_fsuid = old->fsuid;
+		return old_fsuid;
 
-	if (uid == old->uid  || uid == old->euid  ||
-	    uid == old->suid || uid == old->fsuid ||
+	if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
+	    uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
 	    nsown_capable(CAP_SETUID)) {
-		if (uid != old_fsuid) {
-			new->fsuid = uid;
+		if (!uid_eq(kuid, old->fsuid)) {
+			new->fsuid = kuid;
 			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
 				goto change_okay;
 		}
@@ -933,18 +1009,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 	const struct cred *old;
 	struct cred *new;
 	gid_t old_fsgid;
+	kgid_t kgid;
+
+	old = current_cred();
+	old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
+
+	kgid = make_kgid(old->user_ns, gid);
+	if (!gid_valid(kgid))
+		return old_fsgid;
 
 	new = prepare_creds();
 	if (!new)
-		return current_fsgid();
-	old = current_cred();
-	old_fsgid = old->fsgid;
+		return old_fsgid;
 
-	if (gid == old->gid  || gid == old->egid  ||
-	    gid == old->sgid || gid == old->fsgid ||
+	if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
+	    gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
 	    nsown_capable(CAP_SETGID)) {
-		if (gid != old_fsgid) {
-			new->fsgid = gid;
+		if (!gid_eq(kgid, old->fsgid)) {
+			new->fsgid = kgid;
 			goto change_okay;
 		}
 	}
@@ -1503,10 +1585,10 @@ static int check_prlimit_permission(struct task_struct *task)
 	if (cred->user_ns == tcred->user_ns &&
 	    (cred->uid == tcred->euid &&
 	     cred->uid == tcred->suid &&
-	     cred->uid == tcred->uid  &&
+	     cred->uid == tcred->uid &&
 	     cred->gid == tcred->egid &&
 	     cred->gid == tcred->sgid &&
-	     cred->gid == tcred->gid))
+		    cred->gid == tcred->gid))
 		return 0;
 	if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
 		return 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..67316cb6a777 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1427,25 +1427,25 @@ SYSCALL_DEFINE0(getppid)
 SYSCALL_DEFINE0(getuid)
 {
 	/* Only we change this so SMP safe */
-	return current_uid();
+	return from_kuid_munged(current_user_ns(), current_uid());
 }
 
 SYSCALL_DEFINE0(geteuid)
 {
 	/* Only we change this so SMP safe */
-	return current_euid();
+	return from_kuid_munged(current_user_ns(), current_euid());
 }
 
 SYSCALL_DEFINE0(getgid)
 {
 	/* Only we change this so SMP safe */
-	return current_gid();
+	return from_kgid_munged(current_user_ns(), current_gid());
 }
 
 SYSCALL_DEFINE0(getegid)
 {
 	/* Only we change this so SMP safe */
-	return  current_egid();
+	return from_kgid_munged(current_user_ns(), current_egid());
 }
 
 #endif
diff --git a/kernel/uid16.c b/kernel/uid16.c
index e530bc34c4cf..d7948eb10225 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
 	return ret;
 }
 
-SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
 {
 	const struct cred *cred = current_cred();
 	int retval;
+	old_uid_t ruid, euid, suid;
 
-	if (!(retval   = put_user(high2lowuid(cred->uid),  ruid)) &&
-	    !(retval   = put_user(high2lowuid(cred->euid), euid)))
-		retval = put_user(high2lowuid(cred->suid), suid);
+	ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
+	euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
+	suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
+
+	if (!(retval   = put_user(ruid, ruidp)) &&
+	    !(retval   = put_user(euid, euidp)))
+		retval = put_user(suid, suidp);
 
 	return retval;
 }
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
 }
 
 
-SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
 {
 	const struct cred *cred = current_cred();
 	int retval;
+	old_gid_t rgid, egid, sgid;
+
+	rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
+	egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
+	sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
 
-	if (!(retval   = put_user(high2lowgid(cred->gid),  rgid)) &&
-	    !(retval   = put_user(high2lowgid(cred->egid), egid)))
-		retval = put_user(high2lowgid(cred->sgid), sgid);
+	if (!(retval   = put_user(rgid, rgidp)) &&
+	    !(retval   = put_user(egid, egidp)))
+		retval = put_user(sgid, sgidp);
 
 	return retval;
 }
@@ -221,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 
 SYSCALL_DEFINE0(getuid16)
 {
-	return high2lowuid(current_uid());
+	return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
 }
 
 SYSCALL_DEFINE0(geteuid16)
 {
-	return high2lowuid(current_euid());
+	return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
 }
 
 SYSCALL_DEFINE0(getgid16)
 {
-	return high2lowgid(current_gid());
+	return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
 }
 
 SYSCALL_DEFINE0(getegid16)
 {
-	return high2lowgid(current_egid());
+	return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5af662030e5db1a5560fd917250d5d688a6be586 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 3 Mar 2012 20:21:47 -0800
Subject: userns: Convert ptrace, kill, set_priority permission checks to work
 with kuids and kgids

Update the permission checks to use the new uid_eq and gid_eq helpers
and remove the now unnecessary user_ns equality comparison.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/ptrace.c | 13 ++++++-------
 kernel/signal.c | 15 ++++++---------
 kernel/sys.c    | 18 ++++++++----------
 3 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 24e0a5a94824..a232bb59d93f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -198,13 +198,12 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if (cred->user_ns == tcred->user_ns &&
-	    (cred->uid == tcred->euid &&
-	     cred->uid == tcred->suid &&
-	     cred->uid == tcred->uid  &&
-	     cred->gid == tcred->egid &&
-	     cred->gid == tcred->sgid &&
-	     cred->gid == tcred->gid))
+	if (uid_eq(cred->uid, tcred->euid) &&
+	    uid_eq(cred->uid, tcred->suid) &&
+	    uid_eq(cred->uid, tcred->uid)  &&
+	    gid_eq(cred->gid, tcred->egid) &&
+	    gid_eq(cred->gid, tcred->sgid) &&
+	    gid_eq(cred->gid, tcred->gid))
 		goto ok;
 	if (ptrace_has_cap(tcred->user_ns, mode))
 		goto ok;
diff --git a/kernel/signal.c b/kernel/signal.c
index d6303277a640..aef629c65c87 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -767,11 +767,10 @@ static int kill_ok_by_cred(struct task_struct *t)
 	const struct cred *cred = current_cred();
 	const struct cred *tcred = __task_cred(t);
 
-	if (cred->user_ns == tcred->user_ns &&
-	    (cred->euid == tcred->suid ||
-	     cred->euid == tcred->uid ||
-	     cred->uid  == tcred->suid ||
-	     cred->uid  == tcred->uid))
+	if (uid_eq(cred->euid, tcred->suid) ||
+	    uid_eq(cred->euid, tcred->uid)  ||
+	    uid_eq(cred->uid,  tcred->suid) ||
+	    uid_eq(cred->uid,  tcred->uid))
 		return 1;
 
 	if (ns_capable(tcred->user_ns, CAP_KILL))
@@ -1389,10 +1388,8 @@ static int kill_as_cred_perm(const struct cred *cred,
 			     struct task_struct *target)
 {
 	const struct cred *pcred = __task_cred(target);
-	if (cred->user_ns != pcred->user_ns)
-		return 0;
-	if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
-	    cred->uid  != pcred->suid && cred->uid  != pcred->uid)
+	if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
+	    !uid_eq(cred->uid,  pcred->suid) && !uid_eq(cred->uid,  pcred->uid))
 		return 0;
 	return 1;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index aff09f208eb3..f484077b6b14 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -131,9 +131,8 @@ static bool set_one_prio_perm(struct task_struct *p)
 {
 	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 
-	if (pcred->user_ns == cred->user_ns &&
-	    (pcred->uid  == cred->euid ||
-	     pcred->euid == cred->euid))
+	if (uid_eq(pcred->uid,  cred->euid) ||
+	    uid_eq(pcred->euid, cred->euid))
 		return true;
 	if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
 		return true;
@@ -1582,13 +1581,12 @@ static int check_prlimit_permission(struct task_struct *task)
 		return 0;
 
 	tcred = __task_cred(task);
-	if (cred->user_ns == tcred->user_ns &&
-	    (cred->uid == tcred->euid &&
-	     cred->uid == tcred->suid &&
-	     cred->uid == tcred->uid &&
-	     cred->gid == tcred->egid &&
-	     cred->gid == tcred->sgid &&
-		    cred->gid == tcred->gid))
+	if (uid_eq(cred->uid, tcred->euid) &&
+	    uid_eq(cred->uid, tcred->suid) &&
+	    uid_eq(cred->uid, tcred->uid)  &&
+	    gid_eq(cred->gid, tcred->egid) &&
+	    gid_eq(cred->gid, tcred->sgid) &&
+	    gid_eq(cred->gid, tcred->gid))
 		return 0;
 	if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
 		return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 499eea6bf9c06df3bf4549954aee6fb3427946ed Mon Sep 17 00:00:00 2001
From: Sasikantha babu <sasikanth.v19@gmail.com>
Date: Thu, 31 May 2012 16:26:07 -0700
Subject: sethostname/setdomainname: notify userspace when there is a change in
 uts_kern_table

sethostname() and setdomainname() notify userspace on failure (without
modifying uts_kern_table).  Change things so that we only notify userspace
on success, when uts_kern_table was actually modified.

Signed-off-by: Sasikantha babu <sasikanth.v19@gmail.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: WANG Cong <amwang@redhat.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index 6df42624e454..8b71cef3bf1a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1378,8 +1378,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memcpy(u->nodename, tmp, len);
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
+		uts_proc_notify(UTS_PROC_HOSTNAME);
 	}
-	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1429,8 +1429,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memcpy(u->domainname, tmp, len);
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
+		uts_proc_notify(UTS_PROC_DOMAINNAME);
 	}
-	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 81ab6e7b26b453a795d46f2616ed0e31d97f05b9 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 31 May 2012 16:26:15 -0700
Subject: kmod: convert two call sites to call_usermodehelper_fns()

Both kernel/sys.c && security/keys/request_key.c where inlining the exact
same code as call_usermodehelper_fns(); So simply convert these sites to
directly use call_usermodehelper_fns().

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c                | 19 ++++++++-----------
 security/keys/request_key.c | 13 +++----------
 2 files changed, 11 insertions(+), 21 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index 8b71cef3bf1a..6e81aa7e4688 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2114,7 +2114,6 @@ int orderly_poweroff(bool force)
 		NULL
 	};
 	int ret = -ENOMEM;
-	struct subprocess_info *info;
 
 	if (argv == NULL) {
 		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
@@ -2122,18 +2121,16 @@ int orderly_poweroff(bool force)
 		goto out;
 	}
 
-	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
-	if (info == NULL) {
-		argv_free(argv);
-		goto out;
-	}
-
-	call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
+	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
+				      NULL, argv_cleanup, NULL);
+out:
+	if (likely(!ret))
+		return 0;
 
-	ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
+	if (ret == -ENOMEM)
+		argv_free(argv);
 
-  out:
-	if (ret && force) {
+	if (force) {
 		printk(KERN_WARNING "Failed to start orderly shutdown: "
 		       "forcing the issue\n");
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index cc3790315d2f..000e75017520 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -93,16 +93,9 @@ static void umh_keys_cleanup(struct subprocess_info *info)
 static int call_usermodehelper_keys(char *path, char **argv, char **envp,
 					struct key *session_keyring, int wait)
 {
-	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-	struct subprocess_info *info =
-		call_usermodehelper_setup(path, argv, envp, gfp_mask);
-
-	if (!info)
-		return -ENOMEM;
-
-	call_usermodehelper_setfns(info, umh_keys_init, umh_keys_cleanup,
-					key_get(session_keyring));
-	return call_usermodehelper_exec(info, wait);
+	return call_usermodehelper_fns(path, argv, envp, wait,
+				       umh_keys_init, umh_keys_cleanup,
+				       key_get(session_keyring));
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 31 May 2012 16:26:45 -0700
Subject: c/r: prctl: extend PR_SET_MM to set up more mm_struct entries

During checkpoint we dump whole process memory to a file and the dump
includes process stack memory.  But among stack data itself, the stack
carries additional parameters such as command line arguments, environment
data and auxiliary vector.

So when we do restore procedure and once we've restored stack data itself
we need to setup mm_struct::arg_start/end, env_start/end, so restored
process would be able to find command line arguments and environment data
it had at checkpoint time.  The same applies to auxiliary vector.

For this reason additional PR_SET_MM_(ARG_START | ARG_END | ENV_START |
ENV_END | AUXV) codes are introduced.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Vagin <avagin@openvz.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h |   5 ++
 kernel/sys.c          | 134 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 88 insertions(+), 51 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 78b76e24cc7e..18d84c4b42d8 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -113,6 +113,11 @@
 # define PR_SET_MM_START_STACK		5
 # define PR_SET_MM_START_BRK		6
 # define PR_SET_MM_BRK			7
+# define PR_SET_MM_ARG_START		8
+# define PR_SET_MM_ARG_END		9
+# define PR_SET_MM_ENV_START		10
+# define PR_SET_MM_ENV_END		11
+# define PR_SET_MM_AUXV			12
 
 /*
  * Set specific pid that is allowed to ptrace the current task.
diff --git a/kernel/sys.c b/kernel/sys.c
index 6e81aa7e4688..8b544972e46e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1784,17 +1784,23 @@ SYSCALL_DEFINE1(umask, int, mask)
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
+static bool vma_flags_mismatch(struct vm_area_struct *vma,
+			       unsigned long required,
+			       unsigned long banned)
+{
+	return (vma->vm_flags & required) != required ||
+		(vma->vm_flags & banned);
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	unsigned long rlim = rlimit(RLIMIT_DATA);
-	unsigned long vm_req_flags;
-	unsigned long vm_bad_flags;
-	struct vm_area_struct *vma;
-	int error = 0;
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int error;
 
-	if (arg4 | arg5)
+	if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_RESOURCE))
@@ -1803,58 +1809,23 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (addr >= TASK_SIZE)
 		return -EINVAL;
 
+	error = -EINVAL;
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, addr);
 
-	if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
-		/* It must be existing VMA */
-		if (!vma || vma->vm_start > addr)
-			goto out;
-	}
-
-	error = -EINVAL;
 	switch (opt) {
 	case PR_SET_MM_START_CODE:
+		mm->start_code = addr;
+		break;
 	case PR_SET_MM_END_CODE:
-		vm_req_flags = VM_READ | VM_EXEC;
-		vm_bad_flags = VM_WRITE | VM_MAYSHARE;
-
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
-		    (vma->vm_flags & vm_bad_flags))
-			goto out;
-
-		if (opt == PR_SET_MM_START_CODE)
-			mm->start_code = addr;
-		else
-			mm->end_code = addr;
+		mm->end_code = addr;
 		break;
-
 	case PR_SET_MM_START_DATA:
-	case PR_SET_MM_END_DATA:
-		vm_req_flags = VM_READ | VM_WRITE;
-		vm_bad_flags = VM_EXEC | VM_MAYSHARE;
-
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
-		    (vma->vm_flags & vm_bad_flags))
-			goto out;
-
-		if (opt == PR_SET_MM_START_DATA)
-			mm->start_data = addr;
-		else
-			mm->end_data = addr;
+		mm->start_data = addr;
 		break;
-
-	case PR_SET_MM_START_STACK:
-
-#ifdef CONFIG_STACK_GROWSUP
-		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
-#else
-		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
-#endif
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
-			goto out;
-
-		mm->start_stack = addr;
+	case PR_SET_MM_END_DATA:
+		mm->end_data = addr;
 		break;
 
 	case PR_SET_MM_START_BRK:
@@ -1881,16 +1852,77 @@ static int prctl_set_mm(int opt, unsigned long addr,
 		mm->brk = addr;
 		break;
 
+	/*
+	 * If command line arguments and environment
+	 * are placed somewhere else on stack, we can
+	 * set them up here, ARG_START/END to setup
+	 * command line argumets and ENV_START/END
+	 * for environment.
+	 */
+	case PR_SET_MM_START_STACK:
+	case PR_SET_MM_ARG_START:
+	case PR_SET_MM_ARG_END:
+	case PR_SET_MM_ENV_START:
+	case PR_SET_MM_ENV_END:
+		if (!vma) {
+			error = -EFAULT;
+			goto out;
+		}
+#ifdef CONFIG_STACK_GROWSUP
+		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
+#else
+		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
+#endif
+			goto out;
+		if (opt == PR_SET_MM_START_STACK)
+			mm->start_stack = addr;
+		else if (opt == PR_SET_MM_ARG_START)
+			mm->arg_start = addr;
+		else if (opt == PR_SET_MM_ARG_END)
+			mm->arg_end = addr;
+		else if (opt == PR_SET_MM_ENV_START)
+			mm->env_start = addr;
+		else if (opt == PR_SET_MM_ENV_END)
+			mm->env_end = addr;
+		break;
+
+	/*
+	 * This doesn't move auxiliary vector itself
+	 * since it's pinned to mm_struct, but allow
+	 * to fill vector with new values. It's up
+	 * to a caller to provide sane values here
+	 * otherwise user space tools which use this
+	 * vector might be unhappy.
+	 */
+	case PR_SET_MM_AUXV: {
+		unsigned long user_auxv[AT_VECTOR_SIZE];
+
+		if (arg4 > sizeof(user_auxv))
+			goto out;
+		up_read(&mm->mmap_sem);
+
+		if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
+			return -EFAULT;
+
+		/* Make sure the last entry is always AT_NULL */
+		user_auxv[AT_VECTOR_SIZE - 2] = 0;
+		user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+		BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+		task_lock(current);
+		memcpy(mm->saved_auxv, user_auxv, arg4);
+		task_unlock(current);
+
+		return 0;
+	}
 	default:
-		error = -EINVAL;
 		goto out;
 	}
 
 	error = 0;
-
 out:
 	up_read(&mm->mmap_sem);
-
 	return error;
 }
 #else /* CONFIG_CHECKPOINT_RESTORE */
-- 
cgit v1.2.3-59-g8ed1b


From b32dfe377102ce668775f8b6b1461f7ad428f8b6 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 31 May 2012 16:26:46 -0700
Subject: c/r: prctl: add ability to set new mm_struct::exe_file

When we do restore we would like to have a way to setup a former
mm_struct::exe_file so that /proc/pid/exe would point to the original
executable file a process had at checkpoint time.

For this the PR_SET_MM_EXE_FILE code is introduced.  This option takes a
file descriptor which will be set as a source for new /proc/$pid/exe
symlink.

Note it allows to change /proc/$pid/exe if there are no VM_EXECUTABLE
vmas present for current process, simply because this feature is a special
to C/R and mm::num_exe_file_vmas become meaningless after that.

To minimize the amount of transition the /proc/pid/exe symlink might have,
this feature is implemented in one-shot manner.  Thus once changed the
symlink can't be changed again.  This should help sysadmins to monitor the
symlinks over all process running in a system.

In particular one could make a snapshot of processes and ring alarm if
there unexpected changes of /proc/pid/exe's in a system.

Note -- this feature is available iif CONFIG_CHECKPOINT_RESTORE is set and
the caller must have CAP_SYS_RESOURCE capability granted, otherwise the
request to change symlink will be rejected.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h |  1 +
 kernel/sys.c          | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'kernel/sys.c')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 18d84c4b42d8..711e0a30aacc 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -118,6 +118,7 @@
 # define PR_SET_MM_ENV_START		10
 # define PR_SET_MM_ENV_END		11
 # define PR_SET_MM_AUXV			12
+# define PR_SET_MM_EXE_FILE		13
 
 /*
  * Set specific pid that is allowed to ptrace the current task.
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b544972e46e..9ff89cb9657a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -36,6 +36,8 @@
 #include <linux/personality.h>
 #include <linux/ptrace.h>
 #include <linux/fs_struct.h>
+#include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/gfp.h>
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
@@ -1792,6 +1794,57 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma,
 		(vma->vm_flags & banned);
 }
 
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+	struct file *exe_file;
+	struct dentry *dentry;
+	int err;
+
+	/*
+	 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
+	 * remain. So perform a quick test first.
+	 */
+	if (mm->num_exe_file_vmas)
+		return -EBUSY;
+
+	exe_file = fget(fd);
+	if (!exe_file)
+		return -EBADF;
+
+	dentry = exe_file->f_path.dentry;
+
+	/*
+	 * Because the original mm->exe_file points to executable file, make
+	 * sure that this one is executable as well, to avoid breaking an
+	 * overall picture.
+	 */
+	err = -EACCES;
+	if (!S_ISREG(dentry->d_inode->i_mode)	||
+	    exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
+
+	err = inode_permission(dentry->d_inode, MAY_EXEC);
+	if (err)
+		goto exit;
+
+	/*
+	 * The symlink can be changed only once, just to disallow arbitrary
+	 * transitions malicious software might bring in. This means one
+	 * could make a snapshot over all processes running and monitor
+	 * /proc/pid/exe changes to notice unusual activity if needed.
+	 */
+	down_write(&mm->mmap_sem);
+	if (likely(!mm->exe_file))
+		set_mm_exe_file(mm, exe_file);
+	else
+		err = -EBUSY;
+	up_write(&mm->mmap_sem);
+
+exit:
+	fput(exe_file);
+	return err;
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
@@ -1806,6 +1859,9 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
 	if (addr >= TASK_SIZE)
 		return -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Thu, 7 Jun 2012 14:21:11 -0700
Subject: c/r: prctl: update prctl_set_mm_exe_file() after
 mm->num_exe_file_vmas removal

A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new
mm_struct::exe_file").

After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until
final mmput(), it never becomes NULL while task is alive.

We can check for other mapped files in mm instead of checking
mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in
order to forbid second changing of mm->exe_file.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 kernel/sys.c          | 31 +++++++++++++++++++------------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6029d8c54476..c688d4cc2e40 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -439,6 +439,7 @@ extern int get_dumpable(struct mm_struct *mm);
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
+#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ff89cb9657a..54f20fdee93c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma,
 
 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
+	struct vm_area_struct *vma;
 	struct file *exe_file;
 	struct dentry *dentry;
 	int err;
 
-	/*
-	 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
-	 * remain. So perform a quick test first.
-	 */
-	if (mm->num_exe_file_vmas)
-		return -EBUSY;
-
 	exe_file = fget(fd);
 	if (!exe_file)
 		return -EBADF;
@@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	if (err)
 		goto exit;
 
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Forbid mm->exe_file change if there are mapped other files.
+	 */
+	err = -EBUSY;
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
+						&exe_file->f_path))
+			goto exit_unlock;
+	}
+
 	/*
 	 * The symlink can be changed only once, just to disallow arbitrary
 	 * transitions malicious software might bring in. This means one
 	 * could make a snapshot over all processes running and monitor
 	 * /proc/pid/exe changes to notice unusual activity if needed.
 	 */
-	down_write(&mm->mmap_sem);
-	if (likely(!mm->exe_file))
-		set_mm_exe_file(mm, exe_file);
-	else
-		err = -EBUSY;
+	err = -EPERM;
+	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
+		goto exit_unlock;
+
+	set_mm_exe_file(mm, exe_file);
+exit_unlock:
 	up_write(&mm->mmap_sem);
 
 exit:
-- 
cgit v1.2.3-59-g8ed1b


From 1ad75b9e16280ca4e2501a629a225319cf2eef2e Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 7 Jun 2012 14:21:11 -0700
Subject: c/r: prctl: add minimal address test to PR_SET_MM

Make sure the address being set is greater than mmap_min_addr (as
suggested by Kees Cook).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index 54f20fdee93c..19a2c7139960 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1869,7 +1869,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (opt == PR_SET_MM_EXE_FILE)
 		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
-	if (addr >= TASK_SIZE)
+	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
 
 	error = -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 7 Jun 2012 14:21:12 -0700
Subject: c/r: prctl: add ability to get clear_tid_address

Zero is written at clear_tid_address when the process exits.  This
functionality is used by pthread_join().

We already have sys_set_tid_address() to change this address for the
current task but there is no way to obtain it from user space.

Without the ability to find this address and dump it we can't restore
pthread'ed apps which call pthread_join() once they have been restored.

This patch introduces the PR_GET_TID_ADDRESS prctl option which allows
the current process to obtain own clear_tid_address.

This feature is available iif CONFIG_CHECKPOINT_RESTORE is set.

[akpm@linux-foundation.org: fix prctl numbering]
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pedro Alves <palves@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h | 10 ++++++----
 kernel/sys.c          | 13 +++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 711e0a30aacc..3988012255dc 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -127,8 +127,8 @@
 #define PR_SET_PTRACER 0x59616d61
 # define PR_SET_PTRACER_ANY ((unsigned long)-1)
 
-#define PR_SET_CHILD_SUBREAPER 36
-#define PR_GET_CHILD_SUBREAPER 37
+#define PR_SET_CHILD_SUBREAPER	36
+#define PR_GET_CHILD_SUBREAPER	37
 
 /*
  * If no_new_privs is set, then operations that grant new privileges (i.e.
@@ -142,7 +142,9 @@
  * asking selinux for a specific new context (e.g. with runcon) will result
  * in execve returning -EPERM.
  */
-#define PR_SET_NO_NEW_PRIVS 38
-#define PR_GET_NO_NEW_PRIVS 39
+#define PR_SET_NO_NEW_PRIVS	38
+#define PR_GET_NO_NEW_PRIVS	39
+
+#define PR_GET_TID_ADDRESS	40
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 19a2c7139960..0ec1942ba7ea 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1988,12 +1988,22 @@ out:
 	up_read(&mm->mmap_sem);
 	return error;
 }
+
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+	return put_user(me->clear_child_tid, tid_addr);
+}
+
 #else /* CONFIG_CHECKPOINT_RESTORE */
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	return -EINVAL;
 }
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+	return -EINVAL;
+}
 #endif
 
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				else
 					return -EINVAL;
 				break;
+		case PR_GET_TID_ADDRESS:
+			error = prctl_get_tid_address(me, (int __user **)arg2);
+			break;
 			default:
 				return -EINVAL;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From 736f24d5e59d699c6e300c5da7e3bb882eddda67 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 7 Jun 2012 14:21:12 -0700
Subject: c/r: prctl: drop VMA flags test on PR_SET_MM_ stack data assignment

In commit b76437579d13 ("procfs: mark thread stack correctly in
proc/<pid>/maps") the stack allocated via clone() is marked in
/proc/<pid>/maps as [stack:%d] thus it might be out of the former
mm->start_stack/end_stack values (and even has some custom VMA flags
set).

So to be able to restore mm->start_stack/end_stack drop vma flags test,
but still require the underlying VMA to exist.

As always note this feature is under CONFIG_CHECKPOINT_RESTORE and
requires CAP_SYS_RESOURCE to be granted.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index 0ec1942ba7ea..f0ec44dcd415 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,14 +1786,6 @@ SYSCALL_DEFINE1(umask, int, mask)
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
-static bool vma_flags_mismatch(struct vm_area_struct *vma,
-			       unsigned long required,
-			       unsigned long banned)
-{
-	return (vma->vm_flags & required) != required ||
-		(vma->vm_flags & banned);
-}
-
 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
 	struct vm_area_struct *vma;
@@ -1931,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
 			error = -EFAULT;
 			goto out;
 		}
-#ifdef CONFIG_STACK_GROWSUP
-		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
-#else
-		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
-#endif
-			goto out;
 		if (opt == PR_SET_MM_START_STACK)
 			mm->start_stack = addr;
 		else if (opt == PR_SET_MM_ARG_START)
-- 
cgit v1.2.3-59-g8ed1b


From 5702c5eeab959e86ee2d9b4fe7f2d87e65b25d46 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 20 Jun 2012 12:53:04 -0700
Subject: c/r: prctl: Move PR_GET_TID_ADDRESS to a proper place

During merging of PR_GET_TID_ADDRESS patch the code has been misplaced (it
happened to appear under PR_MCE_KILL) in result noone can use this option.

Fix it by moving code snippet to a proper place.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sys.c')

diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd415..e0c8ffc50d7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				else
 					return -EINVAL;
 				break;
-		case PR_GET_TID_ADDRESS:
-			error = prctl_get_tid_address(me, (int __user **)arg2);
-			break;
 			default:
 				return -EINVAL;
 			}
@@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_MM:
 			error = prctl_set_mm(arg2, arg3, arg4, arg5);
 			break;
+		case PR_GET_TID_ADDRESS:
+			error = prctl_get_tid_address(me, (int __user **)arg2);
+			break;
 		case PR_SET_CHILD_SUBREAPER:
 			me->signal->is_child_subreaper = !!arg2;
 			error = 0;
-- 
cgit v1.2.3-59-g8ed1b