From d8ac897137a230ec351269f6378017f2decca512 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:10 +0100 Subject: sched/core: Add wrappers for lockdep_(un)pin_lock() In preparation for adding diagnostic checks to catch missing calls to update_rq_clock(), provide wrappers for (re)pinning and unpinning rq->lock. Because the pending diagnostic checks allow state to be maintained in rq_flags across pin contexts, swap the 'struct pin_cookie' arguments for 'struct rq_flags *'. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-5-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..490441255c56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6213,7 +6213,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6326,9 +6326,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. 
In that case we -- cgit v1.2.3-59-g8ed1b From 46f69fa33712ad12ccaa723e46ed5929ee93589b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:12 +0100 Subject: sched/fair: Push rq lock pin/unpin into idle_balance() Future patches will emit warnings if rq_clock() is called before update_rq_clock() inside a rq_pin_lock()/rq_unpin_lock() pair. Since there is only one caller of idle_balance() we can push the unpin/repin there. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-7-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 490441255c56..faf80e10d662 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3424,7 +3424,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3453,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -6320,15 +6320,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very 
careful to - * re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - new_tasks = idle_balance(rq); - rq_repin_lock(rq, rf); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -8297,7 +8290,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8304,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. 
+ */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8389,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } -- cgit v1.2.3-59-g8ed1b From 4126bad6717336abe5d666440ae15555563ca53f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:20:59 +0200 Subject: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg() Address this rq-clock update bug: WARNING: CPU: 0 PID: 0 at ../kernel/sched/sched.h:797 post_init_entity_util_avg() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: __warn() post_init_entity_util_avg() wake_up_new_task() _do_fork() kernel_thread() rest_init() start_kernel() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 311460b46d68..9217c3221b0d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2578,6 +2578,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, 0); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index faf80e10d662..972b67622922 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9267,6 +9267,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); -- cgit v1.2.3-59-g8ed1b From 3bed5e2166a5e433bf62162f3cd3c5174d335934 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:35:32 +0200 
Subject: sched/core: Add missing update_rq_clock() call for task_hot() Add the update_rq_clock() call at the top of the callstack instead of at the bottom where we find it missing; this is to aid later efforts to minimize the number of update_rq_clock() calls. WARNING: CPU: 30 PID: 194 at ../kernel/sched/sched.h:797 assert_clock_updated() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() assert_clock_updated.isra.63.part.64() can_migrate_task() load_balance() pick_next_task_fair() __schedule() schedule() worker_thread() kthread() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 972b67622922..b3bfe3fb4e13 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8070,6 +8070,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8446,6 +8447,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { -- cgit v1.2.3-59-g8ed1b From 89ee048f3cc796db6f26906c6bef4edf0bee70fd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Dec 2016 16:50:26 +0100 Subject: sched/core: Fix group_entity's share update The update of the share of a cfs_rq is done when its load_avg is updated but before the group_entity's load_avg has been updated for the past time slot. This generates wrong load_avg accounting which can be significant when small tasks are involved in the scheduling.
Let's take the example of a task "a" that is dequeued from its task group A: root (cfs_rq) \ (se) A (cfs_rq) \ (se) a Task "a" was the only task in task group A, which becomes idle when "a" is dequeued. We have the sequence: - dequeue_entity a->se - update_load_avg(a->se) - dequeue_entity_load_avg(A->cfs_rq, a->se) - update_cfs_shares(A->cfs_rq) A->cfs_rq->load.weight == 0 A->se->load.weight is updated with the new share (0 in this case) - dequeue_entity A->se - update_load_avg(A->se) but its weight is now zero so the last time slot (up to a tick) will be accounted with a weight of 0 instead of its real weight during the time slot. The last time slot will be accounted as an idle one whereas it was a running one. If the running time of task "a" is short enough that no tick happens when it runs, all running time of group entity A->se will be accounted as idle time. Instead, we should update the share of a cfs_rq (in fact the weight of its group entity) only after having updated the load_avg of the group_entity. update_cfs_shares() now takes the sched_entity as a parameter instead of the cfs_rq, and the weight of the group_entity is updated only once its load_avg has been synced with the current time.
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pjt@google.com Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3bfe3fb4e13..2b866a279bdf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,8 +2711,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3582,10 +3587,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
+ * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3670,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3703,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3886,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. 
*/ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4761,7 +4783,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4842,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -9362,8 +9384,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3-59-g8ed1b From b8fd8423697b9ec729c5bb91737faad84ae19985 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 11 Jan 2017 11:29:47 +0000 Subject: sched/fair: Explain why MIN_SHARES isn't scaled in calc_cfs_shares() Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Samuel Thibault Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e9a4d858-bcf3-36b9-e3a9-449953e34569@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b866a279bdf..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. 
It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) -- cgit v1.2.3-59-g8ed1b