Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c  119
1 file changed, 113 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e44d83f3e0e6..12e1f3a2cabc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. On battery powered devices in particular, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above; the effective value is still constrained by it.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
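
As a rough userspace illustration of the new knob, the sketch below lowers the
default RT boost system-wide. It assumes the knob is exposed as
/proc/sys/kernel/sched_util_clamp_min_rt_default; the sysctl table entry lives
outside kernel/sched and is therefore not part of this diff.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Assumed procfs path; the sysctl registration is not shown here */
	FILE *f = fopen("/proc/sys/kernel/sched_util_clamp_min_rt_default", "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}

	/* 128 out of SCHED_CAPACITY_SCALE (1024) ~= 12.5% of max capacity */
	fprintf(f, "128\n");
	fclose(f);
	return EXIT_SUCCESS;
}

Values above SCHED_CAPACITY_SCALE are rejected with -EINVAL by the sysctl
handler changed further down in this patch.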
@@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ rq = task_rq_lock(p, &rf);
+ __uclamp_update_util_min_rt_default(p);
+ task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ *   copy_process()                      sysctl_uclamp
+ *                                         uclamp_min_rt = X;
+ *   write_lock(&tasklist_lock)            read_lock(&tasklist_lock)
+ *   // link thread                        smp_mb__after_spinlock()
+ *   write_unlock(&tasklist_lock)          read_unlock(&tasklist_lock);
+ *   sched_post_fork()                     for_each_process_thread()
+ *     __uclamp_sync_rt()                    __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
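
The helpers above lean on uclamp_se_set() to record the new request while
keeping user_defined == false. That helper is defined elsewhere in core.c and
is not part of this diff; roughly, it does no more than:

static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->value = value;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->user_defined = user_defined;
}

So syncing the RT default only rewrites the requested clamp; the effective
value is still computed against uclamp_default[] (and any task group limits)
at enqueue time.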
@@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
- int old_min, old_max;
+ int old_min, old_max, old_min_rt;
int result;
mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
@@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
goto done;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
result = -EINVAL;
goto undo;
}
@@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
uclamp_update_root_tg();
}
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ static_branch_enable(&sched_uclamp_used);
+ uclamp_sync_util_min_rt_default();
+ }
+
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
@@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
mutex_unlock(&uclamp_mutex);
@@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
*/
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int clamp_value = uclamp_none(clamp_id);
/* Keep using defined clamps across class changes */
if (uc_se->user_defined)
continue;
- /* By default, RT tasks always get 100% boost */
+ /*
+ * RT tasks by default have a 100% boost value that can be modified
+ * at runtime.
+ */
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
+ __uclamp_update_util_min_rt_default(p);
+ else
+ uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
- uclamp_se_set(uc_se, clamp_value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
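
The user_defined check above is what lets a task opt out of the new default:
once userspace sets a clamp explicitly through sched_setattr(), the knob stops
touching that task. A hedged userspace sketch, assuming the UAPI headers that
ship struct sched_attr and SCHED_FLAG_UTIL_CLAMP_MIN (glibc has no wrapper, so
the raw syscall is used):

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>	/* SCHED_FIFO, SCHED_FLAG_UTIL_CLAMP_MIN */
#include <linux/sched/types.h>	/* struct sched_attr */

/* Make the calling task RT and request a minimum utilization of util_min */
static int set_rt_util_min(uint32_t util_min)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_FIFO;
	attr.sched_priority = 1;
	attr.sched_flags = SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = util_min;	/* 0..SCHED_CAPACITY_SCALE */

	return syscall(SYS_sched_setattr, 0, &attr, 0);
}

After this call uclamp_req[UCLAMP_MIN].user_defined is true, and later writes
to the RT default knob leave this task alone.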
@@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still in the early stages of fork.
+ */
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p)
}
}
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
+}
+
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
@@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
@@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
+void sched_post_fork(struct task_struct *p)
+{
+ uclamp_post_fork(p);
+}
+
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
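
sched_post_fork() has no caller inside kernel/sched; per the ordering diagram
in uclamp_sync_util_min_rt_default(), the intent is for copy_process() in
kernel/fork.c to invoke it after the new task has been linked under
tasklist_lock. A rough sketch of that call site, which is outside the scope of
this diff:

	/* kernel/fork.c: copy_process(), late in the success path */
	write_lock_irq(&tasklist_lock);
	/* ... link the new thread into the tasklist ... */
	write_unlock_irq(&tasklist_lock);

	sched_post_fork(p);	/* pick up the current RT uclamp default */

Running it only after the task is visible on the tasklist is what makes the
read_lock()/smp_mb__after_spinlock() pairing above sufficient: either the
forked task observes the new default itself, or the syncing thread observes
the forked task.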
@@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
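
Readers on the other side of that race use the same UAPI. Reusing the headers
from the sched_setattr() sketch above, the requested clamps can be read back
via the raw syscall (glibc has no sched_getattr() wrapper either):

	struct sched_attr attr;

	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == 0)
		printf("util_min=%u util_max=%u\n",
		       attr.sched_util_min, attr.sched_util_max);

Whichever of the concurrently written values is returned, it is a consistent
one; the comment above only promises the absence of garbage reads, not a
particular winner of the race.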