From 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:29 +0200 Subject: sched: Track the runnable average on a per-task entity basis Instead of tracking averaging the load parented by a cfs_rq, we can track entity load directly. With the load for a given cfs_rq then being the sum of its children. To do this we represent the historical contribution to runnable average within each trailing 1024us of execution as the coefficients of a geometric series. We can express this for a given task t as: runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t) Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms and y is chosen such that y^k = 1/2. We currently choose k to be 32 which roughly translates to about a sched period. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.372695337@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e..418fc6d8a4da 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1095,6 +1095,16 @@ struct load_weight { unsigned long weight, inv_weight; }; +struct sched_avg { + /* + * These sums represent an infinite geometric series and so are bound + * above by 1024/(1-y). Thus we only need a u32 to store them for for all + * choices of y < 1-2^(-32)*1024. + */ + u32 runnable_avg_sum, runnable_avg_period; + u64 last_runnable_update; +}; + #ifdef CONFIG_SCHEDSTATS struct sched_statistics { u64 wait_start; @@ -1155,6 +1165,9 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif +#ifdef CONFIG_SMP + struct sched_avg avg; +#endif }; struct sched_rt_entity { -- cgit v1.2.3-59-g8ed1b From 2dac754e10a5d41d94d2d2365c0345d4f215a266 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Aggregate load contributed by task entities on parenting cfs_rq For a given task t, we can compute its contribution to load as: task_load(t) = runnable_avg(t) * weight(t) On a parenting cfs_rq we can then aggregate: runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t Maintain this bottom up, with task entities adding their contributed load to the parenting cfs_rq sum. When a task entity's load changes we add the same delta to the maintained sum. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.514678907@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/debug.c | 3 +++ kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 10 +++++++++- 4 files changed, 60 insertions(+), 5 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 418fc6d8a4da..81d8b1ba4100 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1103,6 +1103,7 @@ struct sched_avg { */ u32 runnable_avg_sum, runnable_avg_period; u64 last_runnable_update; + unsigned long load_avg_contrib; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4240abce4116..c953a89f94aa 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); + P(se->avg.load_avg_contrib); #endif #undef PN #undef P @@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->load_contribution); SEQ_printf(m, " .%-30s: %d\n", "load_tg", atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5468fcf10d..77af759e5675 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1081,20 +1081,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (!entity_is_task(se)) + return 0; + + se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum * + se->load.weight, + se->avg.runnable_avg_period + 1); + + return se->avg.load_avg_contrib - old_contrib; +} + /* Update a sched_entity's runnable average */ static inline void update_entity_load_avg(struct sched_entity *se) { - __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, - se->on_rq); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + + if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg, + se->on_rq)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); } + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +} + +/* Remove se's load from this cfs_rq child load-average */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +} #else static inline void update_entity_load_avg(struct sched_entity *se) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1223,7 +1266,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_entity_load_avg(se); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1298,7 +1341,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_entity_load_avg(se); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14b571968713..e6539736af58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -222,6 +222,15 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP + /* + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). + */ + u64 runnable_load_avg; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -1214,4 +1223,3 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -- cgit v1.2.3-59-g8ed1b From 9ee474f55664ff63111c843099d365e7ecffb56f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Maintain the load contribution of blocked entities We are currently maintaining: runnable_load(cfs_rq) = \Sum task_load(t) For all running children t of cfs_rq. While this can be naturally updated for tasks in a runnable state (as they are scheduled); this does not account for the load contributed by blocked task entities. This can be solved by introducing a separate accounting for blocked load: blocked_load(cfs_rq) = \Sum runnable(b) * weight(b) Obviously we do not want to iterate over all blocked entities to account for their decay, we instead observe that: runnable_load(t) = \Sum p_i*y^i and that to account for an additional idle period we only need to compute: y*runnable_load(t). This means that we can compute all blocked entities at once by evaluating: blocked_load(cfs_rq)` = y * blocked_load(cfs_rq) Finally we maintain a decay counter so that when a sleeping entity re-awakens we can determine how much of its load should be removed from the blocked sum. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 1 - kernel/sched/debug.c | 3 ++ kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 4 +- 5 files changed, 122 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 81d8b1ba4100..b1831accfd89 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1103,6 +1103,7 @@ struct sched_avg { */ u32 runnable_avg_sum, runnable_avg_period; u64 last_runnable_update; + s64 decay_count; unsigned long load_avg_contrib; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd9d0859350a..00898f1fb69e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif - #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c953a89f94aa..2d2e2b3c1bef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); P(se->avg.load_avg_contrib); + P(se->avg.decay_count); #endif #undef PN #undef P @@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->load_weight)); SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77af759e5675..83194175e841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); + static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. */ + update_cfs_rq_blocked_load(cfs_rq); } } @@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline void __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; +} + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { @@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + /* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se) +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; @@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se) return; contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + if (se->on_rq) cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) +{ + u64 now = rq_of(cfs_rq)->clock_task >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays) + return; + + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + + cfs_rq->last_decay = now; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable) /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int wakeup) { - update_entity_load_avg(se); + /* we track migrations using entity decay_count == 0 */ + if (unlikely(!se->avg.decay_count)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + if (wakeup) + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + + update_entity_load_avg(se, 0); cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + update_cfs_rq_blocked_load(cfs_rq); } -/* Remove se's load from this cfs_rq child load-average */ +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int sleep) { - update_entity_load_avg(se); + update_entity_load_avg(se, 1); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } #else -static inline void update_entity_load_avg(struct sched_entity *se) {} +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int wakeup) {} static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - enqueue_entity_load_avg(cfs_rq, se); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev); + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * Update share accounting for long-running entities. @@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) update_rq_clock(rq); update_cfs_load(cfs_rq, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * We need to update shares after updating tg->load_weight in @@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif } /* @@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6539736af58..664ff39195f7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -229,7 +229,9 @@ struct cfs_rq { * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg; + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; + u64 last_decay; #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ -- cgit v1.2.3-59-g8ed1b From 0a74bef8bed18dc6889e9bc37ea1050a50c86c89 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Add an rq migration call-back to sched_class Since we are now doing bottom up load accumulation we need explicit notification when a task has been re-parented so that the old hierarchy can be updated. Adds: migrate_task_rq(struct task_struct *p, int next_cpu) (The alternative is to do this out of __set_task_cpu, but it was suggested that this would be a cleaner encapsulation.) Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.660023400@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 12 ++++++++++++ 3 files changed, 15 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b1831accfd89..e483ccb08ce6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1061,6 +1061,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00898f1fb69e..f26860074ef2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83194175e841..5e602e6ba0c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3047,6 +3047,17 @@ unlock: return new_cpu; } + +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ +} #endif /* CONFIG_SMP */ static unsigned long @@ -5607,6 +5618,7 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, -- cgit v1.2.3-59-g8ed1b From f4e26b120b9de84cb627bc7361ba43cfdc51341f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:32 +0200 Subject: sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking While per-entity load-tracking is generally useful, beyond computing shares distribution, e.g. runnable based load-balance (in progress), governors, power-management, etc. These facilities are not yet consumers of this data. This may be trivially reverted when the information is required; but avoid paying the overhead for calculations we will not use until then. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.422162369@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 13 +++++++++++-- kernel/sched/sched.h | 9 ++++++++- 4 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e483ccb08ce6..e1581a029e3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1168,7 +1168,13 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif -#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + /* Per-entity load-tracking */ struct sched_avg avg; #endif }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f26860074ef2..5dae0d252ff7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1526,7 +1526,12 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ecf455fd95b..3e6a3531fa90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3173,6 +3174,12 @@ unlock: return new_cpu; } +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3196,6 +3203,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } +#endif #endif /* CONFIG_SMP */ static unsigned long @@ -5773,8 +5781,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, +#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, - +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0a75a430ca77..5eca173b563f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -225,6 +225,12 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. @@ -234,7 +240,8 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; - +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib; u64 tg_load_contrib; -- cgit v1.2.3-59-g8ed1b From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- fs/proc/array.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 4 ++-- kernel/sched/cputime.c | 8 ++++---- kernel/sys.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..d3696708fc1a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } diff --git a/include/linux/sched.h b/include/linux/sched.h index e1581a029e3d..e75cab5820ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1751,8 +1751,8 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..618f7ee56003 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. */ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e56f138a23c7..7dc155371b95 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -445,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -516,7 +516,7 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -543,7 +543,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Must be called with siglock held. */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct signal_struct *sig = p->signal; struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) cputime_t tgutime, tgstime, cutime, cstime; spin_lock_irq(¤t->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); @@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = 0; if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); + task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; -- cgit v1.2.3-59-g8ed1b From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Nov 2012 00:58:35 +0100 Subject: cputime: Consolidate cputime adjustment code task_cputime_adjusted() and thread_group_cputime_adjusted() essentially share the same code. They just don't use the same source: * The first function uses the cputime in the task struct and the previous adjusted snapshot that ensures monotonicity. * The second adds the cputime of all tasks in the group and the previous adjusted snapshot of the whole group from the signal structure. Just consolidate the common code that does the adjustment. These functions just need to fetch the values from the appropriate source. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/sched.h | 23 +++++++++++++++++++---- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 +++++++++++++++++++++++----------------------- 3 files changed, 43 insertions(+), 28 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e75cab5820ab..5dafac366811 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -433,14 +433,29 @@ struct cpu_itimer { u32 incr_error; }; +/** + * struct cputime - snaphsot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * + * Gathers a generic snapshot of user and system time. + */ +struct cputime { + cputime_t utime; + cputime_t stime; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This structure groups together three kinds of CPU time that are - * tracked for threads and thread groups. Most things considering + * This is an extension of struct cputime that includes the total runtime + * spent by the task from the scheduler point of view. + * + * As a result, this structure groups together three kinds of CPU time + * that are tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three * of them in parallel. */ @@ -581,7 +596,7 @@ struct signal_struct { cputime_t gtime; cputime_t cgtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1340,7 +1355,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..0e7cdb90476f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7dc155371b95..220fdc4db770 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + utime = curr->utime; + total = utime + curr->stime; /* * Use CFS's precise accounting: */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -533,11 +537,22 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Compare with previous values, to keep monotonicity: */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} - *ut = p->prev_utime; - *st = p->prev_stime; +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* @@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif -- cgit v1.2.3-59-g8ed1b