From cd79007634854f9e936e2369890f2512f94b8759 Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: more robust sd-sysctl entry freeing

It occurred to me this morning that the procname field was dynamically
allocated and needed to be freed.  I started to put in break statements
when allocation failed but it was approaching 50% error handling code.

I came up with this alternative of looping while entry->mode is set and
checking proc_handler instead of ->table.  Alternatively, the string
version of the domain name and cpu number could be stored the structs.

I verified by compiling CONFIG_DEBUG_SLAB and checking the allocation
counts after taking a cpuset exclusive and back.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0da2b2635c54..5e220bf7d4c4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5272,11 +5272,20 @@ static struct ctl_table *sd_alloc_ctl_entry(int n)
 
 static void sd_free_ctl_entry(struct ctl_table **tablep)
 {
-	struct ctl_table *entry = *tablep;
+	struct ctl_table *entry;
 
-	for (entry = *tablep; entry->procname; entry++)
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set.  In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
 		if (entry->child)
 			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
 
 	kfree(*tablep);
 	*tablep = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 908a7c1b9b80d06708177432020c80d147754691 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix improper load balance across sched domain

We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.

When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, kernel fails to perform proper load balance at
these domains, due to several logic in find_busiest_group() miss
identified busiest sched group within a given domain. This leads to
inadequate load balance and causes 50% performance hit.

To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:

CPU0 attaching sched-domain:
 domain 0: span 0003  groups: 0001 0002
 domain 1: span 000f  groups: 0003 000c
CPU1 attaching sched-domain:
 domain 0: span 0003  groups: 0002 0001
 domain 1: span 000f  groups: 0003 000c
CPU2 attaching sched-domain:
 domain 0: span 000c  groups: 0004 0008
 domain 1: span 000f  groups: 000c 0003
CPU3 attaching sched-domain:
 domain 0: span 000c  groups: 0008 0004
 domain 1: span 000f  groups: 000c 0003

If I run 2 tasks with CPU affinity set to 0x5.  There are situation
where cpu0 has run queue length of 2, and cpu2 will be idle.  The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three logics in find_busiest_group()
that heavily bias load balance towards power saving mode. e.g. while
determining "busiest" variable, kernel only set it when
"sum_nr_running > group_capacity".  This test is flawed that
"sum_nr_running" is not necessary same as
sum-tasks-allowed-to-run-within-the sched-group.  The end result is
that kernel "think" everything is balanced, but in reality we have an
imbalance and thus causing one CPU to be over-subscribed and leaving
other idle.  There are two other logic in the same function will also
causing similar effect.  The nastiness of this bug is that kernel not
be able to get unstuck in this unfortunate broken state.  From what
we've seen in our environment, kernel will stuck in imbalanced state
for extended period of time and it is also very easy for the kernel to
stuck into that state (it's pretty much 100% reproducible for us).

So proposing the following fix: add addition logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group.  When such condition is detected, load balance goes into spread
mode instead of default grouping mode.

Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e220bf7d4c4..975436435b42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2336,7 +2336,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long max_pull;
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
-	int load_idx;
+	int load_idx, group_imb = 0;
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 	int power_savings_balance = 1;
 	unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2355,9 +2355,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load, group_capacity;
+		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 		int local_group;
 		int i;
+		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
 
@@ -2368,6 +2369,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		max_cpu_load = 0;
+		min_cpu_load = ~0UL;
 
 		for_each_cpu_mask(i, group->cpumask) {
 			struct rq *rq;
@@ -2388,8 +2391,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				}
 
 				load = target_load(i, load_idx);
-			} else
+			} else {
 				load = source_load(i, load_idx);
+				if (load > max_cpu_load)
+					max_cpu_load = load;
+				if (min_cpu_load > load)
+					min_cpu_load = load;
+			}
 
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
@@ -2415,6 +2423,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
+		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+			__group_imb = 1;
+
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
@@ -2423,11 +2434,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group_capacity) {
+			   (sum_nr_running > group_capacity || __group_imb)) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
+			group_imb = __group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2499,6 +2511,9 @@ group_next:
 		goto out_balanced;
 
 	busiest_load_per_task /= busiest_nr_running;
+	if (group_imb)
+		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
-- 
cgit v1.2.3-59-g8ed1b


From b1a8c172c318534b96d0f0f1aecdad3898118b98 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix !SYSFS build breakage

When CONFIG_SYSFS is not set, CONFIG_FAIR_USER_SCHED fails to build
with

kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1488): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1490): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1480): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1494): undefined reference to `kernel_subsys'

This patch fixes this build error.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/user.c         | 23 +++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 592e3a55f818..4ac7d51ad0e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -530,10 +530,12 @@ struct user_struct {
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	struct task_group *tg;
+#ifdef CONFIG_SYSFS
 	struct kset kset;
 	struct subsys_attribute user_attr;
 	struct work_struct work;
 #endif
+#endif
 };
 
 #ifdef CONFIG_FAIR_USER_SCHED
diff --git a/kernel/user.c b/kernel/user.c
index f0e561e6d085..7e8215d87b40 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,9 +87,6 @@ static inline struct user_struct *uid_hash_find(uid_t uid,
 
 #ifdef CONFIG_FAIR_USER_SCHED
 
-static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
-static DEFINE_MUTEX(uids_mutex);
-
 static void sched_destroy_user(struct user_struct *up)
 {
 	sched_destroy_group(up->tg);
@@ -111,6 +108,19 @@ static void sched_switch_user(struct task_struct *p)
 	sched_move_task(p);
 }
 
+#else	/* CONFIG_FAIR_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+
+#endif	/* CONFIG_FAIR_USER_SCHED */
+
+#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
 static inline void uids_mutex_lock(void)
 {
 	mutex_lock(&uids_mutex);
@@ -257,11 +267,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 	schedule_work(&up->work);
 }
 
-#else	/* CONFIG_FAIR_USER_SCHED */
+#else	/* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
 
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
 static inline int user_kobject_create(struct user_struct *up) { return 0; }
 static inline void uids_mutex_lock(void) { }
 static inline void uids_mutex_unlock(void) { }
@@ -280,7 +287,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 	kmem_cache_free(uid_cachep, up);
 }
 
-#endif	/* CONFIG_FAIR_USER_SCHED */
+#endif
 
 /*
  * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
-- 
cgit v1.2.3-59-g8ed1b


From b9dca1e0fcb696716840a3bc8f20a6941b484dbf Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix new task startup crash

Child task may be added on a different cpu that the one on which parent
is running. In which case, task_new_fair() should check whether the new
born task's parent entity should be added as well on the cfs_rq.

Patch below fixes the problem in task_new_fair.

This could fix the put_prev_task_fair() crashes reported.

Reported-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reported-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 2 +-
 kernel/sched_fair.c | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 975436435b42..0ec9521a8e70 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1712,7 +1712,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	p->prio = effective_prio(p);
 
-	if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
+	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
 		/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a17b785d7000..166ed6db600b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1031,12 +1031,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	update_stats_enqueue(cfs_rq, se);
-	check_spread(cfs_rq, se);
-	check_spread(cfs_rq, curr);
-	__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 	se->peer_preempt = 0;
+	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
 
-- 
cgit v1.2.3-59-g8ed1b