From 1158ddb55416855fd17abe3214298f736f00426a Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 23 Nov 2012 00:02:15 +0400
Subject: sched/rt: Add reschedule check to switched_from_rt()

Reschedule rq->curr if the first RT task has just been pulled to the rq.

Signed-off-by: Kirill V Tkhai
Acked-by: Steven Rostedt
Cc: Peter Zijlstra
Cc: Tkhai Kirill
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/118761353614535@web28f.yandex.ru
Signed-off-by: Ingo Molnar
---
 kernel/sched/rt.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/rt.c')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..29bda5bdf2a5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1889,8 +1889,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (p->on_rq && !rq->rt.rt_nr_running)
-		pull_rt_task(rq);
+	if (!p->on_rq || rq->rt.rt_nr_running)
+		return;
+
+	if (pull_rt_task(rq))
+		resched_task(rq->curr);
 }
 
 void init_sched_rt_class(void)
--
cgit v1.2.3-59-g8ed1b
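switched_from_rt() runs when a task leaves the RT scheduling class, for
example when its policy is switched back to SCHED_OTHER; that is the moment
where the pull (and, with this patch, the reschedule of rq->curr) happens.
For orientation, a minimal userspace sketch of such a class switch follows;
it is not part of the patch. sched_setscheduler() and the policy constants
are standard Linux APIs, while the priority value 10 is an arbitrary example.

/*
 * Illustrative sketch: enter and leave SCHED_FIFO. Leaving the RT
 * class is what ends up in switched_from_rt(), where the kernel may
 * now pull an RT task onto this runqueue and reschedule rq->curr.
 * Needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO.
 */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Enter the RT class (switched_to_rt() on the kernel side). */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler(SCHED_FIFO)");
		return EXIT_FAILURE;
	}

	/* ... latency-sensitive work would run here ... */

	/* Leave the RT class: the kernel calls switched_from_rt(). */
	sp.sched_priority = 0;
	if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1) {
		perror("sched_setscheduler(SCHED_OTHER)");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}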
From 57d2aa00dcec67afa52478730f2b524521af14fb Mon Sep 17 00:00:00 2001
From: Ying Xue
Date: Tue, 17 Jul 2012 15:03:43 +0800
Subject: sched/rt: Avoid updating RT entry timeout twice within one tick period

The issue below was found in 2.6.34-rt rather than in the mainline rt
kernel, but it still exists upstream as well. So please let me describe
how it was noticed on 2.6.34-rt:

On this version, each softirq has its own thread, so there is at least
one RT FIFO task per cpu. The priority of these tasks is set to 49 by
default. If a user launches an RT FIFO task with a priority lower than
the softirq RT tasks' priority of 49, it is possible for two RT FIFO
tasks to be enqueued on one cpu runqueue at the same moment. Under the
current strategy of balancing RT tasks, we really want to put an RT
task off to a CPU that it can run on as soon as possible. Even if it
means a bit of cache line flushing, we want RT tasks to be run with the
least latency.

When the user RT FIFO task launched before is running, the sched timer
tick of the current cpu happens. In this tick period, the timeout value
of the user RT task is updated once. Subsequently, we try to wake up one
softirq RT task on its local cpu. As the priority of the current user RT
task is lower than that of the softirq RT task, the current task is
preempted by the higher-priority softirq RT task. Before the preemption,
we check whether current can readily move to a different cpu. If so, we
reschedule to allow the RT push logic to try to move current somewhere
else.

Whenever the woken softirq RT task runs, it first tries to migrate the
user FIFO RT task over to a cpu that is running a task of lesser
priority. If the migration is done, it sends a reschedule request to the
found cpu via an IPI. Once the target cpu responds to the IPI, it picks
the migrated user RT task to preempt its current task.

When the user RT task is running on the new cpu, the sched timer tick of
that cpu fires, so it ticks the user RT task again. This also means the
RT task timeout value is updated again. As the migration may complete
within one tick period, the user RT task timeout value may be updated
twice within one tick.

If we set a limit on the amount of cpu time for the user RT task with
setrlimit(RLIMIT_RTTIME), the SIGXCPU signal should be posted upon
reaching the soft limit. But exactly when the SIGXCPU signal should be
sent depends on the RT task timeout value. In fact the timeout mechanism
of sending the SIGXCPU signal assumes the RT task timeout is increased
once every tick. However, currently the timeout value may be added twice
per tick, which results in the SIGXCPU signal being sent earlier than
expected.

To solve this issue, we prevent the timeout value from increasing twice
within one tick by remembering the jiffies value of the last timeout
update. As long as the RT task's stored value differs from the global
jiffies value, we allow its timeout to be updated.

Signed-off-by: Ying Xue
Signed-off-by: Fan Du
Reviewed-by: Yong Zhang
Acked-by: Steven Rostedt
Cc:
Link: http://lkml.kernel.org/r/1342508623-2887-1-git-send-email-ying.xue@windriver.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 1 +
 kernel/sched/rt.c     | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/rt.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2112477ff5e..924e42a8df58 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1208,6 +1208,7 @@ struct sched_entity {
 struct sched_rt_entity {
 	struct list_head run_list;
 	unsigned long timeout;
+	unsigned long watchdog_stamp;
 	unsigned int time_slice;
 
 	struct sched_rt_entity *back;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 29bda5bdf2a5..2f69ca997826 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1988,7 +1988,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	if (soft != RLIM_INFINITY) {
 		unsigned long next;
 
-		p->rt.timeout++;
+		if (p->rt.watchdog_stamp != jiffies) {
+			p->rt.timeout++;
+			p->rt.watchdog_stamp = jiffies;
+		}
+
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
 		if (p->rt.timeout > next)
 			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
--
cgit v1.2.3-59-g8ed1b
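The watchdog() path patched above is what enforces RLIMIT_RTTIME. As a
rough userspace sketch of the contract the fix restores (setrlimit(),
SIGXCPU and SCHED_FIFO are standard Linux interfaces; the priority and the
limit values are arbitrary examples, not taken from the patch), a spinning
RT task should now receive SIGXCPU only after the full soft limit of rt cpu
time, rather than early:

/*
 * Illustrative sketch: arm an RLIMIT_RTTIME soft limit and observe
 * SIGXCPU. RLIMIT_RTTIME is accounted in microseconds of cpu time
 * spent without sleeping; watchdog() ticks p->rt.timeout toward it.
 */
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

static volatile sig_atomic_t got_sigxcpu;

static void on_sigxcpu(int sig)
{
	(void)sig;
	got_sigxcpu = 1;	/* soft limit reached */
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft: 0.5s of rt cpu time */
		.rlim_max = 1000000,	/* hard: 1s, then SIGKILL */
	};

	signal(SIGXCPU, on_sigxcpu);

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return EXIT_FAILURE;
	}
	if (setrlimit(RLIMIT_RTTIME, &rl) == -1) {
		perror("setrlimit(RLIMIT_RTTIME)");
		return EXIT_FAILURE;
	}

	while (!got_sigxcpu)
		;	/* burn rt cpu time without sleeping */

	printf("SIGXCPU delivered at the soft limit\n");
	return EXIT_SUCCESS;
}

Before this fix, a task that was migrated during a tick could have its
timeout bumped twice in that tick, so the signal could arrive noticeably
before the configured soft limit.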
From fc79e240be5aa379dd36a62158be5a5ee0e4aec7 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Wed, 30 Jan 2013 16:50:36 +0400
Subject: sched/rt: Do not account zero delta_exec in update_curr_rt()

There are several places in the scheduler with consecutive calls of
dequeue_task_rt() and put_prev_task_rt(). For example,
rt_mutex_setprio() does this. Both calls lead to update_curr_rt(), and
the second of them receives a zeroed delta_exec. The only effective
action in that case is the call of sched_rt_avg_update(), which can
change rq->age_stamp and rq->rt_avg. But that is only possible with a
"floating" rq->clock, and such a delta is not reasonable to account.
The other actions do nothing.

Signed-off-by: Kirill V Tkhai
Acked-by: Steven Rostedt
Cc: Peter Zijlstra
CC: linux-rt-users
Link: http://lkml.kernel.org/r/931541359550236@web1g.yandex.ru
Signed-off-by: Ingo Molnar
---
 kernel/sched/rt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/rt.c')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2f69ca997826..94abca4d9cf5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -925,8 +925,8 @@ static void update_curr_rt(struct rq *rq)
 		return;
 
 	delta_exec = rq->clock_task - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
+	if (unlikely((s64)delta_exec <= 0))
+		return;
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
--
cgit v1.2.3-59-g8ed1b

From 60334caf37dc7c59120b21faa625534a6fffead0 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Thu, 31 Jan 2013 18:56:17 +0400
Subject: sched/rt: Further simplify pick_rt_task()

Function next_prio() has been removed, and pull_rt_task() is the only
user of pick_next_highest_task_rt() at the moment.

pull_rt_task() is not interested in p->nr_cpus_allowed; its only
interest is the fact that cpu is allowed to execute p. If
nr_cpus_allowed == 1, cpu != task_cpu(p) and cpu is allowed, then task
p is in the middle of a migration: it waits until it is moved by the
migration thread. So, let's pull it earlier.

Signed-off-by: Kirill V Tkhai
Acked-by: Steven Rostedt
Cc: Peter Zijlstra
CC: linux-rt-users
Link: http://lkml.kernel.org/r/70871359644177@web16d.yandex.ru
Signed-off-by: Ingo Molnar
---
 kernel/sched/rt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/sched/rt.c')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 94abca4d9cf5..c25de141c576 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1427,8 +1427,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
-	    (p->nr_cpus_allowed > 1))
+	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 		return 1;
 	return 0;
 }
--
cgit v1.2.3-59-g8ed1b
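The nr_cpus_allowed == 1 case described above arises when a task's affinity
mask has been narrowed to a single cpu. A rough sketch, not part of the
patch, of how a task gets into that state (sched_setaffinity() and the
CPU_* macros are standard Linux/glibc interfaces; cpu 1 is an arbitrary
example and may not exist on every machine):

/*
 * Illustrative sketch: pin the current thread to one cpu, so the
 * kernel sees p->nr_cpus_allowed == 1. If such a task is caught in
 * the middle of being moved, pick_rt_task() may now still select it
 * for pulling, since the target cpu only has to be in the allowed
 * mask.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(1, &set);	/* arbitrary example: allow cpu 1 only */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return EXIT_FAILURE;
	}
	printf("now restricted to a single cpu\n");
	return EXIT_SUCCESS;
}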
From ce0dbbbb30aee6a835511d5be446462388ba9eee Mon Sep 17 00:00:00 2001
From: Clark Williams
Date: Thu, 7 Feb 2013 09:47:04 -0600
Subject: sched/rt: Add a tuning knob to allow changing SCHED_RR timeslice

Add a /proc/sys/kernel scheduler knob named sched_rr_timeslice_ms that
allows global changing of the SCHED_RR timeslice value. The
user-visible value is in milliseconds, but it is stored as jiffies.
Setting it to 0 (zero) resets it to the default (currently 100ms).

Signed-off-by: Clark Williams
Cc: Peter Zijlstra
Cc: Steven Rostedt
Link: http://lkml.kernel.org/r/20130207094704.13751796@riff.lan
Signed-off-by: Ingo Molnar
---
 include/linux/sched/sysctl.h | 15 ++++++++++++++-
 kernel/sched/core.c          | 19 +++++++++++++++++++
 kernel/sched/rt.c            |  6 ++++--
 kernel/sysctl.c              |  7 +++++++
 4 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/rt.c')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index bac914e458ca..d2bb0ae979d0 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -73,6 +73,13 @@ static inline unsigned int get_sysctl_timer_migration(void)
 	return 1;
 }
 #endif
+
+/*
+ * control realtime throttling:
+ *
+ * /proc/sys/kernel/sched_rt_period_us
+ * /proc/sys/kernel/sched_rt_runtime_us
+ */
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
@@ -90,7 +97,13 @@ extern unsigned int sysctl_sched_autogroup_enabled;
  */
 #define RR_TIMESLICE		(100 * HZ / 1000)
 
-int sched_rt_handler(struct ctl_table *table, int write,
+extern int sched_rr_timeslice;
+
+extern int sched_rr_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
+extern int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1dff78a9e2ab..4a88f1d51563 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7509,6 +7509,25 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int sched_rr_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	/* make sure that internally we keep jiffies */
+	/* also, writing zero resets timeslice to default */
+	if (!ret && write) {
+		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+	}
+	mutex_unlock(&mutex);
+	return ret;
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c25de141c576..fb0f77e402b6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
 
 #include <linux/slab.h>
 
+int sched_rr_timeslice = RR_TIMESLICE;
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
@@ -2016,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	if (--p->rt.time_slice)
 		return;
 
-	p->rt.time_slice = RR_TIMESLICE;
+	p->rt.time_slice = sched_rr_timeslice;
 
 	/*
 	 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2047,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 	 * Time slice is 0 for SCHED_FIFO tasks
 	 */
 	if (task->policy == SCHED_RR)
-		return RR_TIMESLICE;
+		return sched_rr_timeslice;
 	else
 		return 0;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7357e23aaf68..4fc9be955c71 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -404,6 +404,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rt_handler,
 	},
+	{
+		.procname	= "sched_rr_timeslice_ms",
+		.data		= &sched_rr_timeslice,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= sched_rr_handler,
+	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
--
cgit v1.2.3-59-g8ed1b
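Since get_rr_interval_rt() now returns sched_rr_timeslice, a change written
to /proc/sys/kernel/sched_rr_timeslice_ms (with 0 restoring the 100ms
default) is visible to applications through the standard
sched_rr_get_interval() syscall. A rough sketch of reading it back, not
part of the patch; priority 10 is an arbitrary example:

/*
 * Illustrative sketch: read the effective SCHED_RR timeslice of the
 * current task. After this patch the value tracks the new sysctl
 * knob instead of the compile-time RR_TIMESLICE constant.
 */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct timespec ts;

	/* Must be SCHED_RR; for SCHED_FIFO the interval reads as 0. */
	if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
		perror("sched_setscheduler(SCHED_RR)");
		return EXIT_FAILURE;
	}
	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return EXIT_FAILURE;
	}
	printf("RR timeslice: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return EXIT_SUCCESS;
}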