From b18855500fc40da050512d9df82d2f1471e59642 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:13 +0400 Subject: sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance() In busiest->group_imb case we can come to calculate_imbalance() with local->avg_load >= busiest->avg_load >= sds->avg_load. This can result in imbalance overflow, because it is calculated as follows env->imbalance = min( max_pull * busiest->group_power, (sds->avg_load - local->avg_load) * local->group_power) / SCHED_POWER_SCALE; As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix this by skipping the assignment and assuming imbalance=0 in case local->avg_load > sds->avg_load. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/8f596cc6bc0e5e655119dc892c9bfcad26e971f4.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11cd13667359..0b99aae339cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (busiest->avg_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } -- cgit v1.2.3-59-g8ed1b From 3029ede39373c368f402a76896600d85a4f7121b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:14 +0400 Subject: sched/balancing: Fix 'local->avg_load > busiest->avg_load' case in fix_small_imbalance() In busiest->group_imb case we can come to fix_small_imbalance() with local->avg_load > busiest->avg_load. This can result in wrong imbalance fix-up, because there is the following check there where all the members are unsigned: if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix it by substituting the subtraction with an equivalent addition in the check. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ef167822e5c5b2d96cf5b0e3e4f4bdff3f0414a2.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b99aae339cb..2aedaccebcc8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; - if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } -- cgit v1.2.3-59-g8ed1b From 7e3115ef5149fc502e3a2e80719dba54a8e7409d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sat, 14 Sep 2013 19:39:46 +0400 Subject: sched/balancing: Fix cfs_rq->task_h_load calculation Patch a003a2 (sched: Consider runnable load average in move_tasks()) sets all top-level cfs_rqs' h_load to rq->avg.load_avg_contrib, which is always 0. This mistype leads to all tasks having weight 0 when load balancing in a cpu-cgroup enabled setup. There obviously should be sum of weights of all runnable tasks there instead. Fix it. Signed-off-by: Vladimir Davydov Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379173186-11944-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2aedaccebcc8..7c70201fbc61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->h_load = cfs_rq->runnable_load_avg; cfs_rq->last_h_load_update = now; } -- cgit v1.2.3-59-g8ed1b