author     Jakub Kicinski <kuba@kernel.org>  2025-02-20 10:36:34 -0800
committer  Jakub Kicinski <kuba@kernel.org>  2025-02-20 10:37:30 -0800
commit     5d6ba5ab8582aa35c1ee98e47af28e6f6772596c
tree       a4a3a2d311773c19909c9928273e0bcae01a39ce /kernel
parent     Merge tag 'linux-can-next-for-6.15-20250219' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next
parent     Merge tag 'net-6.14-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.14-rc4). No conflicts or adjacent changes.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c                134
-rw-r--r--  kernel/cgroup/cgroup.c        20
-rw-r--r--  kernel/cgroup/rstat.c          1
-rw-r--r--  kernel/irq/Kconfig             4
-rw-r--r--  kernel/sched/autogroup.c       4
-rw-r--r--  kernel/sched/core.c           12
-rw-r--r--  kernel/sched/ext.c           113
-rw-r--r--  kernel/sched/ext.h             4
-rw-r--r--  kernel/sched/sched.h           2
-rw-r--r--  kernel/trace/ring_buffer.c    28
-rw-r--r--  kernel/trace/trace.c          12
-rw-r--r--  kernel/workqueue.c            12
12 files changed, 221 insertions(+), 125 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 31222e8cd534..6520baa13669 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -103,48 +103,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
- int active;
+ bool active;
+ bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
+ acct_t ac;
};
-static void do_acct_process(struct bsd_acct_struct *acct);
+static void fill_ac(struct bsd_acct_struct *acct);
+static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct)
+static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_after_jiffies(acct->needcheck))
- goto out;
+ if (!acct->check_space)
+ return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
- goto out;
+ return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
- acct->active = 0;
+ acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
- acct->active = 1;
+ acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-out:
return acct->active;
}
@@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
- do_acct_process(acct);
+ /*
+ * Fill the accounting struct with the exiting task's info
+ * before punting to the workqueue.
+ */
+ fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
+
+ /* We were fired by acct_pin_kill() which holds acct->lock. */
+ acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
@@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
+ /* Exclude kernel internal filesystems. */
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
+ /* Exclude procfs and sysfs. */
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
@@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-static void fill_ac(acct_t *ac)
+static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
+ lockdep_assert_held(&acct->lock);
+
+ if (time_is_after_jiffies(acct->needcheck)) {
+ acct->check_space = false;
+
+ /* Don't fill in @ac if nothing will be written. */
+ if (!acct->active)
+ return;
+ } else {
+ acct->check_space = true;
+ }
+
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
-}
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct)
-{
- acct_t ac;
- unsigned long flim;
- const struct cred *orig_cred;
- struct file *file = acct->file;
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = rlimit(RLIMIT_FSIZE);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct))
- goto out;
-
- fill_ac(&ac);
/* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
+ ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
+ ac->ac_uid16 = ac->ac_uid;
+ ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
- ac.ac_pid = task_tgid_nr_ns(current, ns);
+ ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
- ns);
+ ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
+}
+
+static void acct_write_process(struct bsd_acct_struct *acct)
+{
+ struct file *file = acct->file;
+ const struct cred *cred;
+ acct_t *ac = &acct->ac;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ cred = override_creds(file->f_cred);
+
/*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
*/
- if (file_start_write_trylock(file)) {
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, &ac, sizeof(acct_t), &pos);
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
file_end_write(file);
}
-out:
+
+ revert_creds(cred);
+}
+
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ unsigned long flim;
+
+ /* Accounting records are not subject to resource limits. */
+ flim = rlimit(RLIMIT_FSIZE);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ fill_ac(acct);
+ acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- revert_creds(orig_cred);
}
/**
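A minimal userspace sketch of the check_free_space() hysteresis above: accounting pauses when free blocks fall to SUSPEND percent of the filesystem and resumes at RESUME percent. The 2%/4% values and the update_active() helper are assumptions for illustration, not taken from this diff.

/*
 * Sketch of the suspend/resume hysteresis in kernel/acct.c's
 * check_free_space().  SUSPEND/RESUME percentages are assumed values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SUSPEND 2	/* pause accounting at or below this % free */
#define RESUME  4	/* resume accounting at or above this % free */

static bool update_active(bool active, uint64_t f_blocks, uint64_t f_bavail)
{
	if (active) {
		if (f_bavail <= f_blocks * SUSPEND / 100) {
			puts("Process accounting paused");
			return false;
		}
	} else {
		if (f_bavail >= f_blocks * RESUME / 100) {
			puts("Process accounting resumed");
			return true;
		}
	}
	return active;
}

int main(void)
{
	bool active = true;

	active = update_active(active, 1000000, 15000); /* 1.5% free: pause */
	active = update_active(active, 1000000, 30000); /* 3.0% free: stays paused */
	active = update_active(active, 1000000, 50000); /* 5.0% free: resume */
	return 0;
}
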
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9061bd55436..afc665b7b1fe 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
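The flag-to-counter switch above closes a race where a kill that starts and completes between the fork-time snapshot and the post-fork check would be missed entirely. A simplified sketch of the snapshot/compare idea; the group struct and helper names are hypothetical, not the cgroup code:

/*
 * With a boolean flag, a kill that is set and cleared between the two reads
 * is lost; with a counter, any intervening kill changes the value.
 */
#include <stdbool.h>

struct group {
	unsigned int kill_seq;	/* bumped once per kill operation */
};

/* Taken when the child is being set up (fork time). */
static unsigned int kill_seq_snapshot(const struct group *grp)
{
	return grp->kill_seq;
}

/* There is no "clear" step, so the event cannot be missed. */
static void group_kill(struct group *grp)
{
	grp->kill_seq++;
}

/* Checked in the post-fork path: did anyone kill the group in between? */
static bool kill_happened_since(const struct group *grp, unsigned int snap)
{
	return grp->kill_seq != snap;
}
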
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 5877974ece92..aac91466279f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5432418c0fea..875f25ed6f71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK
config GENERIC_PENDING_IRQ
bool
-# Deduce delayed migration from top-level interrupt chip flags
-config GENERIC_PENDING_IRQ_CHIPFLAGS
- bool
-
# Support for generic irq migrating off cpu before the cpu is offline.
config GENERIC_IRQ_MIGRATION
bool
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 83d46b9b8ec8..2b331822c7e7 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
* see this thread after that: we can no longer use signal->autogroup.
* See the PF_EXITING check in task_wants_autogroup().
*/
- sched_move_task(p);
+ sched_move_task(p, true);
}
static void
@@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* sched_autogroup_exit_task().
*/
for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, true);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea..9aecd914ac69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1063,9 +1063,10 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- /* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
/*
* wake_up_process() executes a full barrier, which pairs with
@@ -9050,7 +9051,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -9079,7 +9080,8 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
- scx_move_task(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9180,7 +9182,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, false);
scx_cgroup_finish_attach();
}
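The wake_up_q() hunk above clears wake_q.next with WRITE_ONCE() so the store pairs with the cmpxchg_relaxed() that claims a task in __wake_q_add(). A C11-atomics analogue of that claim/release pattern, using a poison sentinel in place of WAKE_Q_TAIL; an illustration under those assumptions, not the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct wake_node {
	_Atomic(struct wake_node *) next;
};

static struct wake_node poison;	/* stands in for the WAKE_Q_TAIL marker */

/* Enqueue side: returns true if we claimed the node (it was not queued). */
static bool node_claim(struct wake_node *n)
{
	struct wake_node *expected = NULL;

	return atomic_compare_exchange_strong_explicit(&n->next, &expected,
						       &poison,
						       memory_order_relaxed,
						       memory_order_relaxed);
}

/* Dequeue side: the release must be an atomic store, not a plain write. */
static void node_release(struct wake_node *n)
{
	atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
}
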
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8857c0709bdd..5a81d9a1e31f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -123,6 +123,19 @@ enum scx_ops_flags {
SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
* CPU cgroup support flags
*/
SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -416,7 +430,7 @@ struct sched_ext_ops {
/**
* @update_idle: Update the idle state of a CPU
- * @cpu: CPU to udpate the idle state for
+ * @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
* This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
* @cur: current position, %NULL to start iteration
* @rev: walk backwards
*
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
unlikely(p->flags & PF_EXITING))
goto local;
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ is_migration_disabled(p))
+ goto local;
+
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
/*
* list_add_tail() must be used. scx_ops_bypass() depends on tasks being
- * appened to the runnable_list.
+ * appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* - The BPF scheduler is bypassed while the rq is offline and we can always say
* no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
bool trigger_error)
{
int cpu = cpu_of(rq);
+ SCHED_WARN_ON(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ in between, leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (trigger_error)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
- cpu_of(rq), p->comm, p->pid);
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
- if (unlikely(is_migration_disabled(p)))
- return false;
-
if (!scx_rq_online(rq))
return false;
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
/*
* A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
* banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+ struct rq *locked_rq = rq;
+#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
#ifdef CONFIG_SMP
- if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dispatch_enqueue(find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
/* switch to @src_rq lock */
- if (rq != src_rq) {
- raw_spin_rq_unlock(rq);
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
raw_spin_rq_lock(src_rq);
}
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
}
/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
/* switch back to @rq lock */
- if (rq != dst_rq) {
- raw_spin_rq_unlock(dst_rq);
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
#else /* CONFIG_SMP */
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*
* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
* to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
* behavior.
*
* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
}
if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -4323,25 +4371,12 @@ err:
return ops_sanitize_err("cgroup_prep_move", ret);
}
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
{
if (!scx_cgroup_enabled)
return;
/*
- * We're called from sched_move_task() which handles both cgroup and
- * autogroup moves. Ignore the latter.
- *
- * Also ignore exiting tasks, because in the exit path tasks transition
- * from the autogroup to the root group, so task_group_is_autogroup()
- * alone isn't able to catch exiting autogroup tasks. This is safe for
- * cgroup_move(), because cgroup migrations never happen for PF_EXITING
- * tasks.
- */
- if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
- return;
-
- /*
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
cgroup_warned_missing_idle = false;
/*
- * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
*/
rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable(&scx_has_op[i]);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
static_branch_disable(&scx_builtin_idle_enabled);
synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
- dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
- p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
- p->scx.dsq_vtime, p->scx.slice);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
+ if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+ static_branch_enable(&scx_ops_enq_migration_disabled);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
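Within the ext.c changes above, dispatch_to_local_dsq() now records the currently held rq lock in a local locked_rq variable because the lock it holds can change partway through the function. A generic pthread analogue of that bookkeeping, with made-up names; not the scheduler code:

#include <pthread.h>

/* Called with @entry_lock held; must return with @entry_lock held again. */
static void move_between(pthread_mutex_t *entry_lock, pthread_mutex_t *src,
			 pthread_mutex_t *dst)
{
	pthread_mutex_t *locked = entry_lock;	/* the lock we hold right now */

	/* switch to the source lock if it differs from what we hold */
	if (locked != src) {
		pthread_mutex_unlock(locked);
		locked = src;
		pthread_mutex_lock(locked);
	}

	/* ... work that may hand the object over to the destination ... */
	if (src != dst) {
		pthread_mutex_unlock(locked);
		locked = dst;
		pthread_mutex_lock(locked);
	}

	/* switch back to the lock the caller expects on return */
	if (locked != entry_lock) {
		pthread_mutex_unlock(locked);
		pthread_mutex_lock(entry_lock);
	}
}
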
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 4d022d17ac7d..1079b56b0f7a 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
-void scx_move_task(struct task_struct *p);
+void scx_cgroup_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
@@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle);
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
-static inline void scx_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 38e0e323dda2..b93c8c3dc05a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg,
extern void sched_destroy_group(struct task_group *tg);
extern void sched_release_group(struct task_group *tg);
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8e0ae15ca5b..bb6089c2951e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
* must be the same.
*/
static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages)
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
unsigned long buffers_end;
int i;
+ if (!subbuf_mask)
+ return false;
+
/* Check the meta magic and meta struct size */
if (meta->magic != RING_BUFFER_META_MAGIC ||
meta->struct_size != sizeof(*meta)) {
@@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
subbuf = rb_subbufs_from_meta(meta);
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
/* Is the meta buffers and the subbufs themselves have correct data? */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
@@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return false;
}
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
subbuf = (void *)subbuf + subbuf_size;
}
@@ -1838,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->cpu);
goto invalid;
}
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
local_set(&cpu_buffer->head_page->entries, ret);
@@ -1889,17 +1906,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
{
struct ring_buffer_meta *meta;
+ unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
int cpu;
int i;
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
@@ -1943,6 +1965,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf += meta->subbuf_size;
}
}
+ bitmap_free(subbuf_mask);
}
static void *rbm_start(struct seq_file *m, loff_t *pos)
@@ -7126,6 +7149,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
kfree(cpu_buffer->subbuf_ids);
cpu_buffer->subbuf_ids = NULL;
rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
}
unlock:
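rb_meta_valid() above now rejects a persisted subbuffer index array that contains out-of-range or duplicate entries, using one bitmap pass. A userspace sketch of the same check; indices_valid() and its parameters are invented for illustration:

#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>

#define BITS_PER_WORD	(CHAR_BIT * sizeof(long))

/* Every entry of @buffers must be in [0, nr_valid) and appear at most once. */
static bool indices_valid(const int *buffers, int nr, int nr_valid)
{
	unsigned long *mask;
	size_t words;
	bool ok = true;
	int i;

	if (nr_valid <= 0)
		return false;

	words = (nr_valid + BITS_PER_WORD - 1) / BITS_PER_WORD;
	mask = calloc(words, sizeof(long));
	if (!mask)		/* like a failed bitmap_alloc(): treat as invalid */
		return false;

	for (i = 0; i < nr; i++) {
		int idx = buffers[i];
		unsigned long bit;

		if (idx < 0 || idx >= nr_valid) {	/* out of range */
			ok = false;
			break;
		}
		bit = 1UL << (idx % BITS_PER_WORD);
		if (mask[idx / BITS_PER_WORD] & bit) {	/* duplicate */
			ok = false;
			break;
		}
		mask[idx / BITS_PER_WORD] |= bit;
	}

	free(mask);
	return ok;
}
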
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1496a5ac33ae..0e6d517e74e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5977,8 +5977,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
@@ -5987,11 +5985,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
-
- return ret;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
}
static void update_last_data(struct trace_array *tr)
@@ -8285,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
+ /* Currently the boot mapped buffer is not supported for mmap */
+ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
+ return -ENODEV;
+
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c2c45313c88..97152f2250fe 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3517,12 +3517,6 @@ repeat:
}
/*
- * Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're still attached to it.
- */
- put_pwq(pwq);
-
- /*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
*/
@@ -3532,6 +3526,12 @@ repeat:
worker_detach_from_pool(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool might
+ * go away any time after it.
+ */
+ put_pwq_unlocked(pwq);
+
raw_spin_lock_irq(&wq_mayday_lock);
}