From d9cc20484e5e48c6a5deb4387c20fd45bfbdde8c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:21:16 -0700
Subject: [NET] skbuff: Add skb_cow_head

This patch adds an optimised version of skb_cow that avoids the copy if
the header can be modified even if the rest of the payload is cloned.

This can be used in encapsulating paths where we only need to modify the
header.  As it is, this can be used in PPPOE and bridging.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 93c27f71122a..a656cecd373c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1352,6 +1352,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len)
 	       skb_headroom(skb) + len <= skb->hdr_len;
 }
 
+static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
+			    int cloned)
+{
+	int delta = 0;
+
+	if (headroom < NET_SKB_PAD)
+		headroom = NET_SKB_PAD;
+	if (headroom > skb_headroom(skb))
+		delta = headroom - skb_headroom(skb);
+
+	if (delta || cloned)
+		return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
+					GFP_ATOMIC);
+	return 0;
+}
+
 /**
  *	skb_cow - copy header of skb when it is required
  *	@skb: buffer to cow
@@ -1366,16 +1382,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len)
  */
 static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
 {
-	int delta = (headroom > NET_SKB_PAD ? headroom : NET_SKB_PAD) -
-			skb_headroom(skb);
-
-	if (delta < 0)
-		delta = 0;
+	return __skb_cow(skb, headroom, skb_cloned(skb));
+}
 
-	if (delta || skb_cloned(skb))
-		return pskb_expand_head(skb, (delta + (NET_SKB_PAD-1)) &
-				~(NET_SKB_PAD-1), 0, GFP_ATOMIC);
-	return 0;
+/**
+ *	skb_cow_head - skb_cow but only making the head writable
+ *	@skb: buffer to cow
+ *	@headroom: needed headroom
+ *
+ *	This function is identical to skb_cow except that we replace the
+ *	skb_cloned check by skb_header_cloned.  It should be used when
+ *	you only need to push on some header and do not need to modify
+ *	the data.
+ */
+static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
+{
+	return __skb_cow(skb, headroom, skb_header_cloned(skb));
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From fa890d586cc127ce72597ba0a909bfecf784e10c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Sun, 16 Sep 2007 17:01:26 -0600
Subject: Fix non-ISA link error in drivers/scsi/advansys.c

When CONFIG_ISA is disabled, the isa_driver support will not be compiled
in.  Define stubs so that we don't get link-time errors.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/isa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/isa.h b/include/linux/isa.h
index 1b855335cb11..b0270e3814c8 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -22,7 +22,18 @@ struct isa_driver {
 
 #define to_isa_driver(x) container_of((x), struct isa_driver, driver)
 
+#ifdef CONFIG_ISA
 int isa_register_driver(struct isa_driver *, unsigned int);
 void isa_unregister_driver(struct isa_driver *);
+#else
+static inline int isa_register_driver(struct isa_driver *d, unsigned int i)
+{
+	return 0;
+}
+
+static inline void isa_unregister_driver(struct isa_driver *d)
+{
+}
+#endif
 
 #endif /* __LINUX_ISA_H */
-- 
cgit v1.3-8-gc7d7


From 735de2230f09741077a645a913de0a04b10208bf Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 18 Sep 2007 22:46:44 -0700
Subject: Convert uid hash to hlist

Surprisingly, but (spotted by Alexey Dobriyan) the uid hash still uses
list_heads, thus occupying twice as much place as it could.  Convert it to
hlist_heads.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h          |  2 +-
 include/linux/user_namespace.h |  2 +-
 kernel/user.c                  | 15 ++++++++-------
 kernel/user_namespace.c        |  2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4e324ed2e44..6239bc2c2baa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -593,7 +593,7 @@ struct user_struct {
 #endif
 
 	/* Hash table maintenance information */
-	struct list_head uidhash_list;
+	struct hlist_node uidhash_node;
 	uid_t uid;
 };
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 1101b0ce878f..b5f41d4c2eec 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -11,7 +11,7 @@
 
 struct user_namespace {
 	struct kref		kref;
-	struct list_head	uidhash_table[UIDHASH_SZ];
+	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*root_user;
 };
 
diff --git a/kernel/user.c b/kernel/user.c
index e080ba863ae3..add57c7e4c07 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,21 +55,22 @@ struct user_struct root_user = {
 /*
  * These routines must be called with the uidhash spinlock held!
  */
-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
 {
-	list_add(&up->uidhash_list, hashent);
+	hlist_add_head(&up->uidhash_node, hashent);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
-	list_del(&up->uidhash_list);
+	hlist_del(&up->uidhash_node);
 }
 
-static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
+static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
 	struct user_struct *user;
+	struct hlist_node *h;
 
-	list_for_each_entry(user, hashent, uidhash_list) {
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
 		if(user->uid == uid) {
 			atomic_inc(&user->__count);
 			return user;
@@ -118,7 +119,7 @@ void free_uid(struct user_struct *up)
 
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 {
-	struct list_head *hashent = uidhashentry(ns, uid);
+	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
 	spin_lock_irq(&uidhash_lock);
@@ -207,7 +208,7 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
+		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 85af9422ea6e..e7ba1bf8457c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 	kref_init(&ns->kref);
 
 	for (n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(ns->uidhash_table + n);
+		INIT_HLIST_HEAD(ns->uidhash_table + n);
 
 	/* Insert new root user.  */
 	ns->root_user = alloc_uid(ns, 0);
-- 
cgit v1.3-8-gc7d7


From 28f300d23674fa01ae747c66ce861d4ee6aebe8c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 18 Sep 2007 22:46:45 -0700
Subject: Fix user namespace exiting OOPs

It turned out, that the user namespace is released during the do_exit() in
exit_task_namespaces(), but the struct user_struct is released only during the
put_task_struct(), i.e.  MUCH later.

On debug kernels with poisoned slabs this will cause the oops in
uid_hash_remove() because the head of the chain, which resides inside the
struct user_namespace, will be already freed and poisoned.

Since the uid hash itself is required only when someone can search it, i.e.
when the namespace is alive, we can safely unhash all the user_struct-s from
it during the namespace exiting.  The subsequent free_uid() will complete the
user_struct destruction.

For example simple program

   #include <sched.h>

   char stack[2 * 1024 * 1024];

   int f(void *foo)
   {
   	return 0;
   }

   int main(void)
   {
   	clone(f, stack + 1 * 1024 * 1024, 0x10000000, 0);
   	return 0;
   }

run on kernel with CONFIG_USER_NS turned on will oops the
kernel immediately.

This was spotted during OpenVZ kernel testing.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Acked-by: "Serge E. Hallyn" <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h   |  1 +
 kernel/user.c           | 26 +++++++++++++++++++++++++-
 kernel/user_namespace.c |  2 +-
 3 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6239bc2c2baa..5445eaec6908 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1472,6 +1472,7 @@ static inline struct user_struct *get_uid(struct user_struct *u)
 }
 extern void free_uid(struct user_struct *);
 extern void switch_uid(struct user_struct *);
+extern void release_uids(struct user_namespace *ns);
 
 #include <asm/current.h>
 
diff --git a/kernel/user.c b/kernel/user.c
index add57c7e4c07..9ca2848fc356 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -62,7 +62,7 @@ static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *ha
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
-	hlist_del(&up->uidhash_node);
+	hlist_del_init(&up->uidhash_node);
 }
 
 static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
@@ -199,6 +199,30 @@ void switch_uid(struct user_struct *new_user)
 	suid_keys(current);
 }
 
+void release_uids(struct user_namespace *ns)
+{
+	int i;
+	unsigned long flags;
+	struct hlist_head *head;
+	struct hlist_node *nd;
+
+	spin_lock_irqsave(&uidhash_lock, flags);
+	/*
+	 * collapse the chains so that the user_struct-s will
+	 * be still alive, but not in hashes. subsequent free_uid()
+	 * will free them.
+	 */
+	for (i = 0; i < UIDHASH_SZ; i++) {
+		head = ns->uidhash_table + i;
+		while (!hlist_empty(head)) {
+			nd = head->first;
+			hlist_del_init(nd);
+		}
+	}
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+
+	free_uid(ns->root_user);
+}
 
 static int __init uid_cache_init(void)
 {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e7ba1bf8457c..7af90fc4f0fd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -81,7 +81,7 @@ void free_user_ns(struct kref *kref)
 	struct user_namespace *ns;
 
 	ns = container_of(kref, struct user_namespace, kref);
-	free_uid(ns->root_user);
+	release_uids(ns);
 	kfree(ns);
 }
 
-- 
cgit v1.3-8-gc7d7


From 480eccf9ae1073b87bb4fe118971fbf134a5bc61 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Tue, 18 Sep 2007 22:46:47 -0700
Subject: Fix NUMA Memory Policy Reference Counting

This patch proposes fixes to the reference counting of memory policy in the
page allocation paths and in show_numa_map().  Extracted from my "Memory
Policy Cleanups and Enhancements" series as stand-alone.

Shared policy lookup [shmem] has always added a reference to the policy,
but this was never unrefed after page allocation or after formatting the
numa map data.

Default system policy should not require additional ref counting, nor
should the current task's task policy.  However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.

This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy.  Again, shared policy is already reference counted on lookup.  A
matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy.  We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.

Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy.  In this case, we should hold the reference until after the
huge page allocation in dequeue_hugepage().  The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:

		w/o patch	w/ refcount patch
	    Avg	  Std Devn	   Avg	  Std Devn
Real:	 100.59	    0.38	 100.63	    0.43
User:	1209.60	    0.37	1209.91	    0.31
System:   81.52	    0.42	  81.64	    0.34

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  4 +--
 mm/hugetlb.c              |  4 ++-
 mm/mempolicy.c            | 79 +++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 75 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5bdd656e88cf..a020eb2d4e2a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -159,7 +159,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags);
+		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
@@ -256,7 +256,7 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags)
+ 		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de4cf458d6e1..84c795ee2d65 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,8 +71,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 {
 	int nid;
 	struct page *page = NULL;
+	struct mempolicy *mpol;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-						htlb_alloc_mask);
+					htlb_alloc_mask, &mpol);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,6 +88,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			break;
 		}
 	}
+	mpol_free(mpol);	/* unref if mpol !NULL */
 	return page;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bb54b88c3d5a..3d6ac9505d07 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1077,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 
 #endif
 
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma   - virtual memory area whose policy is sought
+ * @addr  - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current].  It is the caller's responsibility to
+ * free the reference in these cases.
+ */
 static struct mempolicy * get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
+	int shared_pol = 0;
 
 	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
 			pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
+			shared_pol = 1;	/* if pol non-NULL, add ref below */
+		} else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
+	else if (!shared_pol && pol != current->mempolicy)
+		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
 
@@ -1207,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 }
 
 #ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference.  For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-							gfp_t gfp_flags)
+				gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
+	*mpol = NULL;		/* probably no unref needed */
 	if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		__mpol_free(pol);		/* finished with pol */
 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
 	}
-	return zonelist_policy(GFP_HIGHUSER, pol);
+
+	zl = zonelist_policy(GFP_HIGHUSER, pol);
+	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+		if (pol->policy != MPOL_BIND)
+			__mpol_free(pol);	/* finished with pol */
+		else
+			*mpol = pol;	/* unref needed after allocation */
+	}
+	return zl;
 }
 #endif
 
@@ -1264,6 +1306,7 @@ struct page *
 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
 	cpuset_update_task_memory_state();
 
@@ -1273,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+	zl = zonelist_policy(gfp, pol);
+	if (pol != &default_policy && pol != current->mempolicy) {
+		/*
+		 * slow path: ref counted policy -- shared or vma
+		 */
+		struct page *page =  __alloc_pages(gfp, 0, zl);
+		__mpol_free(pol);
+		return page;
+	}
+	/*
+	 * fast path:  default or task policy
+	 */
+	return __alloc_pages(gfp, 0, zl);
 }
 
 /**
@@ -1872,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v)
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mempolicy *pol;
 	int n;
 	char buffer[50];
 
@@ -1882,8 +1938,13 @@ int show_numa_map(struct seq_file *m, void *v)
 	if (!md)
 		return 0;
 
-	mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(priv->task, vma, vma->vm_start));
+	pol = get_vma_policy(priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol);
+	/*
+	 * unref shared or other task's mempolicy
+	 */
+	if (pol != &default_policy && pol != current->mempolicy)
+		__mpol_free(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-- 
cgit v1.3-8-gc7d7