aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/include/linux/sched
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/sched')
-rw-r--r--include/linux/sched/affinity.h1
-rw-r--r--include/linux/sched/clock.h25
-rw-r--r--include/linux/sched/cond_resched.h1
-rw-r--r--include/linux/sched/coredump.h53
-rw-r--r--include/linux/sched/cpufreq.h7
-rw-r--r--include/linux/sched/cputime.h14
-rw-r--r--include/linux/sched/deadline.h22
-rw-r--r--include/linux/sched/debug.h6
-rw-r--r--include/linux/sched/ext.h217
-rw-r--r--include/linux/sched/hotplug.h6
-rw-r--r--include/linux/sched/idle.h69
-rw-r--r--include/linux/sched/isolation.h62
-rw-r--r--include/linux/sched/jobctl.h12
-rw-r--r--include/linux/sched/mm.h298
-rw-r--r--include/linux/sched/numa_balancing.h16
-rw-r--r--include/linux/sched/posix-timers.h1
-rw-r--r--include/linux/sched/prio.h19
-rw-r--r--include/linux/sched/rseq_api.h1
-rw-r--r--include/linux/sched/rt.h45
-rw-r--r--include/linux/sched/sd_flags.h26
-rw-r--r--include/linux/sched/signal.h217
-rw-r--r--include/linux/sched/smt.h4
-rw-r--r--include/linux/sched/stat.h16
-rw-r--r--include/linux/sched/sysctl.h85
-rw-r--r--include/linux/sched/task.h84
-rw-r--r--include/linux/sched/task_flags.h1
-rw-r--r--include/linux/sched/task_stack.h35
-rw-r--r--include/linux/sched/thread_info_api.h1
-rw-r--r--include/linux/sched/topology.h90
-rw-r--r--include/linux/sched/types.h2
-rw-r--r--include/linux/sched/user.h16
-rw-r--r--include/linux/sched/vhost_task.h14
-rw-r--r--include/linux/sched/wake_q.h41
33 files changed, 1048 insertions, 459 deletions
diff --git a/include/linux/sched/affinity.h b/include/linux/sched/affinity.h
new file mode 100644
index 000000000000..227f5be81bcd
--- /dev/null
+++ b/include/linux/sched/affinity.h
@@ -0,0 +1 @@
+#include <linux/sched.h>
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 867d588314e0..196f0ca351a2 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -12,7 +12,16 @@
*
* Please use one of the three interfaces below.
*/
-extern unsigned long long notrace sched_clock(void);
+extern u64 sched_clock(void);
+
+#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
+extern u64 sched_clock_noinstr(void);
+#else
+static __always_inline u64 sched_clock_noinstr(void)
+{
+ return sched_clock();
+}
+#endif
/*
* See the comment in kernel/sched/clock.c
@@ -45,7 +54,12 @@ static inline u64 cpu_clock(int cpu)
return sched_clock();
}
-static inline u64 local_clock(void)
+static __always_inline u64 local_clock_noinstr(void)
+{
+ return sched_clock_noinstr();
+}
+
+static __always_inline u64 local_clock(void)
{
return sched_clock();
}
@@ -79,10 +93,9 @@ static inline u64 cpu_clock(int cpu)
return sched_clock_cpu(cpu);
}
-static inline u64 local_clock(void)
-{
- return sched_clock_cpu(raw_smp_processor_id());
-}
+extern u64 local_clock_noinstr(void);
+extern u64 local_clock(void);
+
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/include/linux/sched/cond_resched.h b/include/linux/sched/cond_resched.h
new file mode 100644
index 000000000000..227f5be81bcd
--- /dev/null
+++ b/include/linux/sched/cond_resched.h
@@ -0,0 +1 @@
+#include <linux/sched.h>
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index dfd82eab2902..6eb65ceed213 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,12 +8,6 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
-/* mm flags */
-
-/* for SUID_DUMP_* above */
-#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
-
extern void set_dumpable(struct mm_struct *mm, int value);
/*
* This returns the actual value of the suid_dumpable flag. For things
@@ -31,51 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm)
return __get_dumpable(mm->flags);
}
-/* coredump filter bits */
-#define MMF_DUMP_ANON_PRIVATE 2
-#define MMF_DUMP_ANON_SHARED 3
-#define MMF_DUMP_MAPPED_PRIVATE 4
-#define MMF_DUMP_MAPPED_SHARED 5
-#define MMF_DUMP_ELF_HEADERS 6
-#define MMF_DUMP_HUGETLB_PRIVATE 7
-#define MMF_DUMP_HUGETLB_SHARED 8
-#define MMF_DUMP_DAX_PRIVATE 9
-#define MMF_DUMP_DAX_SHARED 10
-
-#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS 9
-#define MMF_DUMP_FILTER_MASK \
- (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
-#define MMF_DUMP_FILTER_DEFAULT \
- ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
-
-#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
-#else
-# define MMF_DUMP_MASK_DEFAULT_ELF 0
-#endif
- /* leave room for more dump flags */
-#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
-#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
-
-#define MMF_HAS_UPROBES 19 /* has uprobes */
-#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
-#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_OOM_VICTIM 25 /* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS 27 /* mm is shared between processes */
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
-
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
- MMF_DISABLE_THP_MASK)
-
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 3ed5aa18593f..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -26,7 +26,12 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
static inline unsigned long map_util_freq(unsigned long util,
unsigned long freq, unsigned long cap)
{
- return (freq + (freq >> 2)) * util / cap;
+ return freq * util / cap;
+}
+
+static inline unsigned long map_util_perf(unsigned long util)
+{
+ return util + (util >> 2);
}
#endif /* CONFIG_CPU_FREQ */
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 6c9f19a33865..5f8fd5b24a2e 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -8,25 +8,17 @@
* cputime accounting APIs:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm/cputime.h>
-
-#ifndef cputime_to_nsecs
-# define cputime_to_nsecs(__ct) \
- (cputime_to_usecs(__ct) * NSEC_PER_USEC)
-#endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void task_cputime(struct task_struct *t,
+extern bool task_cputime(struct task_struct *t,
u64 *utime, u64 *stime);
extern u64 task_gtime(struct task_struct *t);
#else
-static inline void task_cputime(struct task_struct *t,
+static inline bool task_cputime(struct task_struct *t,
u64 *utime, u64 *stime)
{
*utime = t->utime;
*stime = t->stime;
+ return false;
}
static inline u64 task_gtime(struct task_struct *t)
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 1aff00b65f3c..f9aabbc9d22e 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -6,16 +8,18 @@
* NORMAL/BATCH tasks.
*/
-#define MAX_DL_PRIO 0
+#include <linux/sched.h>
-static inline int dl_prio(int prio)
+static inline bool dl_prio(int prio)
{
- if (unlikely(prio < MAX_DL_PRIO))
- return 1;
- return 0;
+ return unlikely(prio < MAX_DL_PRIO);
}
-static inline int dl_task(struct task_struct *p)
+/*
+ * Returns true if a task has a priority that belongs to DL class. PI-boosted
+ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
+ */
+static inline bool dl_task(struct task_struct *p)
{
return dl_prio(p->prio);
}
@@ -30,5 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b)
struct root_domain;
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
+extern void dl_clear_root_domain_cpu(int cpu);
#endif /* CONFIG_SMP */
+
+extern u64 dl_cookie;
+extern bool dl_bw_visited(int cpu, u64 cookie);
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index 00c45a0e6abe..35ed4577a6cc 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -14,7 +14,7 @@ extern void dump_cpu_task(int cpu);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
-extern void show_state_filter(unsigned long state_filter);
+extern void show_state_filter(unsigned int state_filter);
static inline void show_state(void)
{
@@ -35,15 +35,13 @@ extern void show_stack(struct task_struct *task, unsigned long *sp,
extern void sched_show_task(struct task_struct *p);
-#ifdef CONFIG_SCHED_DEBUG
struct seq_file;
extern void proc_sched_show_task(struct task_struct *p,
struct pid_namespace *ns, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
-#endif
/* Attach to any functions which should be ignored in wchan output. */
-#define __sched __attribute__((__section__(".sched.text")))
+#define __sched __section(".sched.text")
/* Linker adds these: start and end of __sched functions */
extern char __sched_text_start[], __sched_text_end[];
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
new file mode 100644
index 000000000000..f7545430a548
--- /dev/null
+++ b/include/linux/sched/ext.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef _LINUX_SCHED_EXT_H
+#define _LINUX_SCHED_EXT_H
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+
+#include <linux/llist.h>
+#include <linux/rhashtable-types.h>
+
+enum scx_public_consts {
+ SCX_OPS_NAME_LEN = 128,
+
+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
+};
+
+/*
+ * DSQ (dispatch queue) IDs are 64bit of the format:
+ *
+ * Bits: [63] [62 .. 0]
+ * [ B] [ ID ]
+ *
+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
+ * ID: 63 bit ID
+ *
+ * Built-in IDs:
+ *
+ * Bits: [63] [62] [61..32] [31 .. 0]
+ * [ 1] [ L] [ R ] [ V ]
+ *
+ * 1: 1 for built-in DSQs.
+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others
+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
+ */
+enum scx_dsq_id_flags {
+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63,
+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62,
+
+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
+};
+
+/*
+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
+ * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
+ * buffer between the scheduler core and the BPF scheduler. See the
+ * documentation for more details.
+ */
+struct scx_dispatch_q {
+ raw_spinlock_t lock;
+ struct list_head list; /* tasks in dispatch order */
+ struct rb_root priq; /* used to order by p->scx.dsq_vtime */
+ u32 nr;
+ u32 seq; /* used by BPF iter */
+ u64 id;
+ struct rhash_head hash_node;
+ struct llist_node free_node;
+ struct rcu_head rcu;
+};
+
+/* scx_entity.flags */
+enum scx_ent_flags {
+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
+
+ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */
+ SCX_TASK_STATE_BITS = 2,
+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
+
+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
+};
+
+/* scx_entity.flags & SCX_TASK_STATE_MASK */
+enum scx_task_state {
+ SCX_TASK_NONE, /* ops.init_task() not called yet */
+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */
+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */
+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */
+
+ SCX_TASK_NR_STATES,
+};
+
+/* scx_entity.dsq_flags */
+enum scx_ent_dsq_flags {
+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */
+};
+
+/*
+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
+ * everywhere and the following bits track which kfunc sets are currently
+ * allowed for %current. This simple per-task tracking works because SCX ops
+ * nest in a limited way. BPF will likely implement a way to allow and disallow
+ * kfuncs depending on the calling context which will replace this manual
+ * mechanism. See scx_kf_allow().
+ */
+enum scx_kf_mask {
+ SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
+ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
+ /* ops.dequeue (in REST) may be nested inside DISPATCH */
+ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
+ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
+ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
+ SCX_KF_REST = 1 << 4, /* other rq-locked operations */
+
+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
+enum scx_dsq_lnode_flags {
+ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
+
+ /* high 16 bits can be for iter cursor flags */
+ __SCX_DSQ_LNODE_PRIV_SHIFT = 16,
+};
+
+struct scx_dsq_list_node {
+ struct list_head node;
+ u32 flags;
+ u32 priv; /* can be used by iter cursor */
+};
+
+/*
+ * The following is embedded in task_struct and contains all fields necessary
+ * for a task to be scheduled by SCX.
+ */
+struct sched_ext_entity {
+ struct scx_dispatch_q *dsq;
+ struct scx_dsq_list_node dsq_list; /* dispatch order */
+ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */
+ u32 dsq_seq;
+ u32 dsq_flags; /* protected by DSQ lock */
+ u32 flags; /* protected by rq lock */
+ u32 weight;
+ s32 sticky_cpu;
+ s32 holding_cpu;
+ s32 selected_cpu;
+ u32 kf_mask; /* see scx_kf_mask above */
+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
+ atomic_long_t ops_state;
+
+ struct list_head runnable_node; /* rq->scx.runnable_list */
+ unsigned long runnable_at;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 core_sched_at; /* see scx_prio_less() */
+#endif
+ u64 ddsp_dsq_id;
+ u64 ddsp_enq_flags;
+
+ /* BPF scheduler modifiable fields */
+
+ /*
+ * Runtime budget in nsecs. This is usually set through
+ * scx_bpf_dispatch() but can also be modified directly by the BPF
+ * scheduler. Automatically decreased by SCX as the task executes. On
+ * depletion, a scheduling event is triggered.
+ *
+ * This value is cleared to zero if the task is preempted by
+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
+ * task ran. Use p->se.sum_exec_runtime instead.
+ */
+ u64 slice;
+
+ /*
+ * Used to order tasks when dispatching to the vtime-ordered priority
+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
+ * but can also be modified directly by the BPF scheduler. Modifying it
+ * while a task is queued on a dsq may mangle the ordering and is not
+ * recommended.
+ */
+ u64 dsq_vtime;
+
+ /*
+ * If set, reject future sched_setscheduler(2) calls updating the policy
+ * to %SCHED_EXT with -%EACCES.
+ *
+ * Can be set from ops.init_task() while the BPF scheduler is being
+ * loaded (!scx_init_task_args->fork). If set and the task's policy is
+ * already %SCHED_EXT, the task's policy is rejected and forcefully
+ * reverted to %SCHED_NORMAL. The number of such events are reported
+ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
+ * during fork is not allowed.
+ */
+ bool disallow; /* reject switching into SCX */
+
+ /* cold fields */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ struct cgroup *cgrp_moving_from;
+#endif
+ struct list_head tasks_node;
+};
+
+void sched_ext_free(struct task_struct *p);
+void print_scx_info(const char *log_lvl, struct task_struct *p);
+void scx_softlockup(u32 dur_s);
+
+#else /* !CONFIG_SCHED_CLASS_EXT */
+
+static inline void sched_ext_free(struct task_struct *p) {}
+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
+static inline void scx_softlockup(u32 dur_s) {}
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* _LINUX_SCHED_EXT_H */
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 9a62ffdd296f..17e04859b9a4 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,15 +11,13 @@ extern int sched_cpu_activate(unsigned int cpu);
extern int sched_cpu_deactivate(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
extern int sched_cpu_dying(unsigned int cpu);
#else
+# define sched_cpu_wait_empty NULL
# define sched_cpu_dying NULL
#endif
-#ifdef CONFIG_HOTPLUG_CPU
-extern void idle_task_exit(void);
-#else
static inline void idle_task_exit(void) {}
-#endif
#endif /* _LINUX_SCHED_HOTPLUG_H */
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 22873d276be6..439f6029d3b9 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -5,13 +5,17 @@
#include <linux/sched.h>
enum cpu_idle_type {
+ __CPU_NOT_IDLE = 0,
CPU_IDLE,
- CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
CPU_MAX_IDLE_TYPES
};
+#ifdef CONFIG_SMP
extern void wake_up_if_idle(int cpu);
+#else
+static inline void wake_up_if_idle(int cpu) { }
+#endif
/*
* Idle thread specific functions to determine the need_resched
@@ -19,12 +23,37 @@ extern void wake_up_if_idle(int cpu);
*/
#ifdef TIF_POLLING_NRFLAG
-static inline void __current_set_polling(void)
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
+
+static __always_inline void __current_set_polling(void)
{
- set_thread_flag(TIF_POLLING_NRFLAG);
+ arch_set_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
}
-static inline bool __must_check current_set_polling_and_test(void)
+static __always_inline void __current_clr_polling(void)
+{
+ arch_clear_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#else
+
+static __always_inline void __current_set_polling(void)
+{
+ set_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline void __current_clr_polling(void)
+{
+ clear_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */
+
+static __always_inline bool __must_check current_set_polling_and_test(void)
{
__current_set_polling();
@@ -37,12 +66,7 @@ static inline bool __must_check current_set_polling_and_test(void)
return unlikely(tif_need_resched());
}
-static inline void __current_clr_polling(void)
-{
- clear_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
+static __always_inline bool __must_check current_clr_polling_and_test(void)
{
__current_clr_polling();
@@ -55,6 +79,21 @@ static inline bool __must_check current_clr_polling_and_test(void)
return unlikely(tif_need_resched());
}
+static __always_inline void current_clr_polling(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+ * Once the bit is cleared, we'll get IPIs with every new
+ * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+ * fold.
+ */
+ smp_mb__after_atomic(); /* paired with resched_curr() */
+
+ preempt_fold_need_resched();
+}
+
#else
static inline void __current_set_polling(void) { }
static inline void __current_clr_polling(void) { }
@@ -67,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
{
return unlikely(tif_need_resched());
}
-#endif
-static inline void current_clr_polling(void)
+static __always_inline void current_clr_polling(void)
{
__current_clr_polling();
- /*
- * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
- * Once the bit is cleared, we'll get IPIs with every new
- * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
- * fold.
- */
smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched();
}
+#endif
#endif /* _LINUX_SCHED_IDLE_H */
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index cc9f393e2a70..d8501f4709b5 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -2,59 +2,79 @@
#define _LINUX_SCHED_ISOLATION_H
#include <linux/cpumask.h>
+#include <linux/cpuset.h>
#include <linux/init.h>
#include <linux/tick.h>
-enum hk_flags {
- HK_FLAG_TIMER = 1,
- HK_FLAG_RCU = (1 << 1),
- HK_FLAG_MISC = (1 << 2),
- HK_FLAG_SCHED = (1 << 3),
- HK_FLAG_TICK = (1 << 4),
- HK_FLAG_DOMAIN = (1 << 5),
- HK_FLAG_WQ = (1 << 6),
- HK_FLAG_MANAGED_IRQ = (1 << 7),
- HK_FLAG_KTHREAD = (1 << 8),
+enum hk_type {
+ HK_TYPE_DOMAIN,
+ HK_TYPE_MANAGED_IRQ,
+ HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_MAX,
+
+ /*
+ * The following housekeeping types are only set by the nohz_full
+ * boot commandline option. So they can share the same value.
+ */
+ HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE,
+ HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE
};
#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
-extern int housekeeping_any_cpu(enum hk_flags flags);
-extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
-extern bool housekeeping_enabled(enum hk_flags flags);
-extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
-extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
+extern int housekeeping_any_cpu(enum hk_type type);
+extern const struct cpumask *housekeeping_cpumask(enum hk_type type);
+extern bool housekeeping_enabled(enum hk_type type);
+extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
+extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
extern void __init housekeeping_init(void);
#else
-static inline int housekeeping_any_cpu(enum hk_flags flags)
+static inline int housekeeping_any_cpu(enum hk_type type)
{
return smp_processor_id();
}
-static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+static inline const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
return cpu_possible_mask;
}
-static inline bool housekeeping_enabled(enum hk_flags flags)
+static inline bool housekeeping_enabled(enum hk_type type)
{
return false;
}
static inline void housekeeping_affine(struct task_struct *t,
- enum hk_flags flags) { }
+ enum hk_type type) { }
+
+static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
+{
+ return true;
+}
+
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
-static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
+static inline bool housekeeping_cpu(int cpu, enum hk_type type)
{
#ifdef CONFIG_CPU_ISOLATION
if (static_branch_unlikely(&housekeeping_overridden))
- return housekeeping_test_cpu(cpu, flags);
+ return housekeeping_test_cpu(cpu, type);
#endif
return true;
}
+static inline bool cpu_is_isolated(int cpu)
+{
+ return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
+ !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
+ cpuset_cpu_is_isolated(cpu);
+}
+
#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index d2b4204ba4d3..68876d0a7ef9 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -19,7 +19,10 @@ struct task_struct;
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
-#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */
+#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */
+
+#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */
+#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
@@ -29,10 +32,13 @@ struct task_struct;
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
-#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT)
+#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT)
+
+#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
-#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK)
+#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask);
extern void task_clear_jobctl_trapping(struct task_struct *task);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..b13474825130 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -8,6 +8,7 @@
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
+#include <linux/sched/coredump.h>
/*
* Routines for handling mm_structs
@@ -28,7 +29,7 @@ extern struct mm_struct *mm_alloc(void);
*
* Use mmdrop() to release the reference acquired by mmgrab().
*
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmgrab(struct mm_struct *mm)
@@ -36,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
}
+static inline void smp_mb__after_mmgrab(void)
+{
+ smp_mb__after_atomic();
+}
+
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)
@@ -49,6 +55,63 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
+ * by far the least expensive way to do that.
+ */
+static inline void __mmdrop_delayed(struct rcu_head *rhp)
+{
+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+ __mmdrop(mm);
+}
+
+/*
+ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
+ * kernels via RCU.
+ */
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+ /* Provides a full memory barrier. See mmdrop() */
+ if (atomic_dec_and_test(&mm->mm_count))
+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+ mmdrop(mm);
+}
+#endif
+
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier, see the
+ * membarrier comment finish_task_switch which relies on this.
+ */
+ smp_mb();
+ }
+}
+
+static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmdrop_sched(mm);
+ else
+ smp_mb(); /* see mmdrop_lazy_tlb() above */
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
@@ -62,7 +125,7 @@ static inline void mmdrop(struct mm_struct *mm)
*
* Use mmput() to release the reference acquired by mmget().
*
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmget(struct mm_struct *mm)
@@ -106,15 +169,46 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
#endif /* CONFIG_MEMCG */
#ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr, len, flags) (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
extern void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack);
-extern unsigned long
-arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
- unsigned long, unsigned long);
-extern unsigned long
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags);
+unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t);
+
+unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags);
+
+unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
+ struct file *filp,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags,
+ vm_flags_t vm_flags);
+
+unsigned long
+generic_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags);
+ unsigned long flags, vm_flags_t vm_flags);
+unsigned long
+generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack) {}
@@ -140,7 +234,8 @@ static inline bool in_vfork(struct task_struct *tsk)
* another oom-unkillable task does this it should blame itself.
*/
rcu_read_lock();
- ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
+ ret = tsk->vfork_done &&
+ rcu_dereference(tsk->real_parent)->mm == tsk->mm;
rcu_read_unlock();
return ret;
@@ -150,12 +245,13 @@ static inline bool in_vfork(struct task_struct *tsk)
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_PIN implies !GFP_MOVABLE
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);
- if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
+ if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
@@ -164,22 +260,85 @@ static inline gfp_t current_gfp_context(gfp_t flags)
flags &= ~(__GFP_IO | __GFP_FS);
else if (pflags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
+
+ if (pflags & PF_MEMALLOC_PIN)
+ flags &= ~__GFP_MOVABLE;
}
return flags;
}
#ifdef CONFIG_LOCKDEP
-extern void __fs_reclaim_acquire(void);
-extern void __fs_reclaim_release(void);
+extern void __fs_reclaim_acquire(unsigned long ip);
+extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
-static inline void __fs_reclaim_acquire(void) { }
-static inline void __fs_reclaim_release(void) { }
+static inline void __fs_reclaim_acquire(unsigned long ip) { }
+static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif
+/* Any memory-allocation retry loop should use
+ * memalloc_retry_wait(), and pass the flags for the most
+ * constrained allocation attempt that might have failed.
+ * This provides useful documentation of where loops are,
+ * and a central place to fine tune the waiting as the MM
+ * implementation changes.
+ */
+static inline void memalloc_retry_wait(gfp_t gfp_flags)
+{
+ /* We use io_schedule_timeout because waiting for memory
+ * typically included waiting for dirty pages to be
+ * written out, which requires IO.
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ gfp_flags = current_gfp_context(gfp_flags);
+ if (gfpflags_allow_blocking(gfp_flags) &&
+ !(gfp_flags & __GFP_NORETRY))
+ /* Probably waited already, no need for much more */
+ io_schedule_timeout(1);
+ else
+ /* Probably didn't wait, and has now released a lock,
+ * so now is a good time to wait
+ */
+ io_schedule_timeout(HZ/50);
+}
+
+/**
+ * might_alloc - Mark possible allocation sites
+ * @gfp_mask: gfp_t flags that would be used to allocate
+ *
+ * Similar to might_sleep() and other annotations, this can be used in functions
+ * that might allocate, but often don't. Compiles to nothing without
+ * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking.
+ */
+static inline void might_alloc(gfp_t gfp_mask)
+{
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
+
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+}
+
+/**
+ * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
+ *
+ * This allows PF_* flags to be conveniently added, irrespective of current
+ * value, and then the old version restored with memalloc_flags_restore().
+ */
+static inline unsigned memalloc_flags_save(unsigned flags)
+{
+ unsigned oldflags = ~current->flags & flags;
+ current->flags |= flags;
+ return oldflags;
+}
+
+static inline void memalloc_flags_restore(unsigned flags)
+{
+ current->flags &= ~flags;
+}
+
/**
* memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
*
@@ -189,13 +348,12 @@ static inline void fs_reclaim_release(gfp_t gfp_mask) { }
* point of view. Use memalloc_noio_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_noio_restore.
*/
static inline unsigned int memalloc_noio_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
- current->flags |= PF_MEMALLOC_NOIO;
- return flags;
+ return memalloc_flags_save(PF_MEMALLOC_NOIO);
}
/**
@@ -208,7 +366,7 @@ static inline unsigned int memalloc_noio_save(void)
*/
static inline void memalloc_noio_restore(unsigned int flags)
{
- current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
+ memalloc_flags_restore(flags);
}
/**
@@ -220,13 +378,12 @@ static inline void memalloc_noio_restore(unsigned int flags)
* point of view. Use memalloc_nofs_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_nofs_restore.
*/
static inline unsigned int memalloc_nofs_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
- current->flags |= PF_MEMALLOC_NOFS;
- return flags;
+ return memalloc_flags_save(PF_MEMALLOC_NOFS);
}
/**
@@ -239,44 +396,77 @@ static inline unsigned int memalloc_nofs_save(void)
*/
static inline void memalloc_nofs_restore(unsigned int flags)
{
- current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+ memalloc_flags_restore(flags);
}
+/**
+ * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
+ *
+ * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
+ * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
+ * prevents entering reclaim and allows access to all memory reserves. This
+ * should only be used when the caller guarantees the allocation will allow more
+ * memory to be freed very shortly, i.e. it needs to allocate some memory in
+ * the process of freeing memory, and cannot reclaim due to potential recursion.
+ *
+ * Users of this scope have to be extremely careful to not deplete the reserves
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory. Usage of a
+ * pre-allocated pool (e.g. mempool) should be always considered before using
+ * this scope.
+ *
+ * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
+ *
+ * Context: This function should not be used in an interrupt context as that one
+ * does not give PF_MEMALLOC access to reserves.
+ * See __gfp_pfmemalloc_flags().
+ * Return: The saved flags to be passed to memalloc_noreclaim_restore.
+ */
static inline unsigned int memalloc_noreclaim_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC;
- current->flags |= PF_MEMALLOC;
- return flags;
+ return memalloc_flags_save(PF_MEMALLOC);
}
+/**
+ * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
+ * function. Always make sure that the given flags is the return value from the
+ * pairing memalloc_noreclaim_save call.
+ */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
- current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+ memalloc_flags_restore(flags);
}
-#ifdef CONFIG_CMA
-static inline unsigned int memalloc_nocma_save(void)
-{
- unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
-
- current->flags |= PF_MEMALLOC_NOCMA;
- return flags;
-}
-
-static inline void memalloc_nocma_restore(unsigned int flags)
-{
- current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
-}
-#else
-static inline unsigned int memalloc_nocma_save(void)
+/**
+ * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
+ *
+ * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
+ * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
+ * will constraint the allocations to zones that allow long term pinning, i.e.
+ * not ZONE_MOVABLE zones.
+ *
+ * Return: The saved flags to be passed to memalloc_pin_restore.
+ */
+static inline unsigned int memalloc_pin_save(void)
{
- return 0;
+ return memalloc_flags_save(PF_MEMALLOC_PIN);
}
-static inline void memalloc_nocma_restore(unsigned int flags)
+/**
+ * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function.
+ * Always make sure that the given flags is the return value from the pairing
+ * memalloc_pin_save call.
+ */
+static inline void memalloc_pin_restore(unsigned int flags)
{
+ memalloc_flags_restore(flags);
}
-#endif
#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -288,6 +478,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
* __GFP_ACCOUNT allocations till the end of the scope will be charged to the
* given memcg.
*
+ * Please, make sure that caller has a reference to the passed memcg structure,
+ * so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
* NOTE: This function can nest. Users must save the return value and
* reset the previous value after their own charging scope is over.
*/
@@ -296,7 +490,7 @@ set_active_memcg(struct mem_cgroup *memcg)
{
struct mem_cgroup *old;
- if (in_interrupt()) {
+ if (!in_task()) {
old = this_cpu_read(int_active_memcg);
this_cpu_write(int_active_memcg, memcg);
} else {
@@ -337,6 +531,13 @@ enum {
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
+ /*
+ * The atomic_read() below prevents CSE. The following should
+ * help the compiler generate more efficient code on architectures
+ * where sync_core_before_usermode() is a no-op.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
+ return;
if (current->mm != mm)
return;
if (likely(!(atomic_read(&mm->membarrier_state) &
@@ -347,6 +548,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
extern void membarrier_exec_mmap(struct mm_struct *mm);
+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +564,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..52b22c5c396d 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,13 +15,23 @@
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
+enum numa_vmaskip_reason {
+ NUMAB_SKIP_UNSUITABLE,
+ NUMAB_SKIP_SHARED_RO,
+ NUMAB_SKIP_INACCESSIBLE,
+ NUMAB_SKIP_SCAN_DELAY,
+ NUMAB_SKIP_PID_INACTIVE,
+ NUMAB_SKIP_IGNORE_PID,
+ NUMAB_SKIP_SEQ_COMPLETED,
+};
+
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+ int src_nid, int dst_cpu);
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -38,7 +48,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
{
}
static inline bool should_numa_migrate_memory(struct task_struct *p,
- struct page *page, int src_nid, int dst_cpu)
+ struct folio *folio, int src_nid, int dst_cpu)
{
return true;
}
diff --git a/include/linux/sched/posix-timers.h b/include/linux/sched/posix-timers.h
new file mode 100644
index 000000000000..523a381d6c88
--- /dev/null
+++ b/include/linux/sched/posix-timers.h
@@ -0,0 +1 @@
+#include <linux/posix-timers.h>
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index 7d64feafc408..6ab43b4f72f9 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -11,16 +11,10 @@
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#define MAX_RT_PRIO 100
+#define MAX_DL_PRIO 0
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
@@ -34,15 +28,6 @@
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
diff --git a/include/linux/sched/rseq_api.h b/include/linux/sched/rseq_api.h
new file mode 100644
index 000000000000..cf2af72693e1
--- /dev/null
+++ b/include/linux/sched/rseq_api.h
@@ -0,0 +1 @@
+#include <linux/rseq.h>
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index e5af028c08b4..4e3338103654 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -6,19 +6,40 @@
struct task_struct;
-static inline int rt_prio(int prio)
+static inline bool rt_prio(int prio)
{
- if (unlikely(prio < MAX_RT_PRIO))
- return 1;
- return 0;
+ return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
}
-static inline int rt_task(struct task_struct *p)
+static inline bool rt_or_dl_prio(int prio)
+{
+ return unlikely(prio < MAX_RT_PRIO);
+}
+
+/*
+ * Returns true if a task has a priority that belongs to RT class. PI-boosted
+ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
+ */
+static inline bool rt_task(struct task_struct *p)
{
return rt_prio(p->prio);
}
-static inline bool task_is_realtime(struct task_struct *tsk)
+/*
+ * Returns true if a task has a priority that belongs to RT or DL classes.
+ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
+ * PI-boosted tasks.
+ */
+static inline bool rt_or_dl_task(struct task_struct *p)
+{
+ return rt_or_dl_prio(p->prio);
+}
+
+/*
+ * Returns true if a task has a policy that belongs to RT or DL classes.
+ * PI-boosted tasks will return false.
+ */
+static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
{
int policy = tsk->policy;
@@ -30,6 +51,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
}
#ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
*/
@@ -39,20 +64,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return tsk->pi_blocked_on != NULL;
-}
#else
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return false;
-}
#endif
extern void normalize_rt_tasks(void);
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77..b04a5d04dee9 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -91,6 +91,16 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
+ * Domain members have different CPU capacities spanning all unique CPU
+ * capacity values.
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ * all available CPU capacities are visible
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
* Domain members share CPU capacity (i.e. SMT)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
@@ -100,13 +110,20 @@ SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
- * Domain members share CPU package resources (i.e. caches)
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members share CPU Last Level Caches
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* the same cache(s).
* NEEDS_GROUPS: Caches are shared between groups.
*/
-SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Only a single load balancing instance
@@ -122,12 +139,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- * up, but currently assumed to be set from the base domain
- * upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 1bad18a1d8ba..1ef1edbaaf79 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
+#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>
@@ -72,6 +73,17 @@ struct multiprocess_signals {
struct hlist_node node;
};
+struct core_thread {
+ struct task_struct *task;
+ struct core_thread *next;
+};
+
+struct core_state {
+ atomic_t nr_threads;
+ struct core_thread dumper;
+ struct completion startup;
+};
+
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
@@ -83,6 +95,7 @@ struct signal_struct {
refcount_t sigcnt;
atomic_t live;
int nr_threads;
+ int quick_threads;
struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
@@ -98,18 +111,16 @@ struct signal_struct {
/* thread group exit support */
int group_exit_code;
- /* overloaded:
- * - notify group_exit_task when ->count is equal to notify_count
- * - everyone except group_exit_task is stopped during signal delivery
- * of fatal signals, group_exit_task processes the signal.
- */
+ /* notify group_exec_task when notify_count is less or equal to 0 */
int notify_count;
- struct task_struct *group_exit_task;
+ struct task_struct *group_exec_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
+ struct core_state *core_state; /* coredumping support */
+
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
@@ -125,8 +136,10 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
- struct list_head posix_timers;
+ unsigned int timer_create_restore_ids:1;
+ atomic_t next_posix_timer_id;
+ struct hlist_head posix_timers;
+ struct hlist_head ignored_posix_timers;
/* ITIMER_REAL timer for the process */
struct hrtimer real_timer;
@@ -228,12 +241,13 @@ struct signal_struct {
* credential calculations
* (notably. ptrace)
* Deprecated do not use in new code.
- * Use exec_update_mutex instead.
- */
- struct mutex exec_update_mutex; /* Held while task_struct is being
- * updated during exec, and may have
- * inconsistent permissions.
+ * Use exec_update_lock instead.
*/
+ struct rw_semaphore exec_update_lock; /* Held while task_struct is
+ * being updated during exec,
+ * and may have inconsistent
+ * permissions.
+ */
} __randomize_layout;
/*
@@ -242,7 +256,6 @@ struct signal_struct {
#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
-#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
/*
* Pending notifications to parent.
*/
@@ -258,31 +271,24 @@ struct signal_struct {
static inline void signal_set_stop_flags(struct signal_struct *sig,
unsigned int flags)
{
- WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
+ WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);
sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
}
-/* If true, all threads except ->group_exit_task have pending SIGKILL */
-static inline int signal_group_exit(const struct signal_struct *sig)
-{
- return (sig->flags & SIGNAL_GROUP_EXIT) ||
- (sig->group_exit_task != NULL);
-}
-
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *task,
- sigset_t *mask, kernel_siginfo_t *info);
+extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);
static inline int kernel_dequeue_signal(void)
{
struct task_struct *task = current;
kernel_siginfo_t __info;
+ enum pid_type __type;
int ret;
spin_lock_irq(&task->sighand->siglock);
- ret = dequeue_signal(task, &task->blocked, &__info);
+ ret = dequeue_signal(&task->blocked, &__info, &__type);
spin_unlock_irq(&task->sighand->siglock);
return ret;
@@ -291,42 +297,32 @@ static inline int kernel_dequeue_signal(void)
static inline void kernel_signal_stop(void)
{
spin_lock_irq(&current->sighand->siglock);
- if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+ if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
+ }
spin_unlock_irq(&current->sighand->siglock);
schedule();
}
-#ifdef __ARCH_SI_TRAPNO
-# define ___ARCH_SI_TRAPNO(_a1) , _a1
-#else
-# define ___ARCH_SI_TRAPNO(_a1)
-#endif
-#ifdef __ia64__
-# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
-#else
-# define ___ARCH_SI_IA64(_a1, _a2, _a3)
-#endif
-int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t);
-int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
-int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t);
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ struct task_struct *t);
+int force_sig_fault(int sig, int code, void __user *addr);
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
+int send_sig_perf(void __user *addr, u32 type, u64 sig_data);
int force_sig_ptrace_errno_trap(int errno, void __user *addr);
+int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
+int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
+ struct task_struct *t);
+int force_sig_seccomp(int syscall, int reason, bool force_coredump);
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
@@ -340,24 +336,61 @@ extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
+extern void force_fatal_sig(int);
+extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
-extern struct sigqueue *sigqueue_alloc(void);
-extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
+static inline void clear_notify_signal(void)
+{
+ clear_thread_flag(TIF_NOTIFY_SIGNAL);
+ smp_mb__after_atomic();
+}
+
+/*
+ * Returns 'true' if kick_process() is needed to force a transition from
+ * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
+ */
+static inline bool __set_notify_signal(struct task_struct *task)
+{
+ return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+ !wake_up_state(task, TASK_INTERRUPTIBLE);
+}
+
+/*
+ * Called to break out of interruptible wait loops, and enter the
+ * exit_to_user_mode_loop().
+ */
+static inline void set_notify_signal(struct task_struct *task)
+{
+ if (__set_notify_signal(task))
+ kick_process(task);
+}
+
static inline int restart_syscall(void)
{
set_tsk_thread_flag(current, TIF_SIGPENDING);
return -ERESTARTNOINTR;
}
-static inline int signal_pending(struct task_struct *p)
+static inline int task_sigpending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
+static inline int signal_pending(struct task_struct *p)
+{
+ /*
+ * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
+ * behavior in terms of ensuring that we break out of wait loops
+ * so that notify signal callbacks can be processed.
+ */
+ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
+ return 1;
+ return task_sigpending(p);
+}
+
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
@@ -365,10 +398,10 @@ static inline int __fatal_signal_pending(struct task_struct *p)
static inline int fatal_signal_pending(struct task_struct *p)
{
- return signal_pending(p) && __fatal_signal_pending(p);
+ return task_sigpending(p) && __fatal_signal_pending(p);
}
-static inline int signal_pending_state(long state, struct task_struct *p)
+static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
return 0;
@@ -398,19 +431,28 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags,
* This is required every time the blocked sigset_t changes.
* callers must hold sighand->siglock.
*/
-extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
-static inline void signal_wake_up(struct task_struct *t, bool resume)
+static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
- signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
+ unsigned int state = 0;
+ if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) {
+ t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+ state = TASK_WAKEKILL | __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
- signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+ unsigned int state = 0;
+ if (resume) {
+ t->jobctl &= ~JOBCTL_TRACED;
+ state = __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}
void task_join_group_stop(struct task_struct *task);
@@ -502,7 +544,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+ WARN_ON(!signal_pending(current));
else
restore_saved_sigmask();
}
@@ -524,6 +566,17 @@ static inline int kill_cad_pid(int sig, int priv)
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV ((struct kernel_siginfo *) 1)
+static inline int __on_sig_stack(unsigned long sp)
+{
+#ifdef CONFIG_STACK_GROWSUP
+ return sp >= current->sas_ss_sp &&
+ sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+ return sp > current->sas_ss_sp &&
+ sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
+}
+
/*
* True if we are on the alternate signal stack.
*/
@@ -541,13 +594,7 @@ static inline int on_sig_stack(unsigned long sp)
if (current->sas_ss_flags & SS_AUTODISARM)
return 0;
-#ifdef CONFIG_STACK_GROWSUP
- return sp >= current->sas_ss_sp &&
- sp - current->sas_ss_sp < current->sas_ss_size;
-#else
- return sp > current->sas_ss_sp &&
- sp - current->sas_ss_sp <= current->sas_ss_size;
-#endif
+ return __on_sig_stack(sp);
}
static inline int sas_ss_flags(unsigned long sp)
@@ -591,17 +638,18 @@ extern void flush_itimer_signals(void);
extern bool current_is_single_threaded(void);
/*
- * Careful: do_each_thread/while_each_thread is a double loop so
- * 'break' will not work as expected - use goto instead.
+ * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
+ * otherwise next_thread(t) will never reach g after list_del_rcu(g).
*/
-#define do_each_thread(g, t) \
- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
-
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define for_other_threads(p, t) \
+ for (t = p; (t = next_thread(t)) != p; )
+
#define __for_each_thread(signal, t) \
- list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+ lockdep_is_held(&tasklist_lock))
#define for_each_thread(p, t) \
__for_each_thread((p)->signal, t)
@@ -660,22 +708,31 @@ bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
return p1->signal == p2->signal;
}
-static inline struct task_struct *next_thread(const struct task_struct *p)
+/*
+ * returns NULL if p is the last thread in the thread group
+ */
+static inline struct task_struct *__next_thread(struct task_struct *p)
+{
+ return list_next_or_null_rcu(&p->signal->thread_head,
+ &p->thread_node,
+ struct task_struct,
+ thread_node);
+}
+
+static inline struct task_struct *next_thread(struct task_struct *p)
{
- return list_entry_rcu(p->thread_group.next,
- struct task_struct, thread_group);
+ return __next_thread(p) ?: p->group_leader;
}
static inline int thread_group_empty(struct task_struct *p)
{
- return list_empty(&p->thread_group);
+ return thread_group_leader(p) &&
+ list_is_last(&p->thread_node, &p->signal->thread_head);
}
#define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
-extern bool thread_group_exited(struct pid *pid);
-
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
unsigned long *flags);
@@ -695,6 +752,12 @@ static inline void unlock_task_sighand(struct task_struct *task,
spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}
+#ifdef CONFIG_LOCKDEP
+extern void lockdep_assert_task_sighand_held(struct task_struct *task);
+#else
+static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
+#endif
+
static inline unsigned long task_rlimit(const struct task_struct *task,
unsigned int limit)
{
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..166b19af956f 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -12,9 +12,9 @@ static __always_inline bool sched_smt_active(void)
return static_branch_likely(&sched_smt_present);
}
#else
-static inline bool sched_smt_active(void) { return false; }
+static __always_inline bool sched_smt_active(void) { return false; }
#endif
void arch_smt_update(void);
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..0108a38bb64d 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,6 +3,7 @@
#define _LINUX_SCHED_STAT_H
#include <linux/percpu.h>
+#include <linux/kconfig.h>
/*
* Various counters maintained by the scheduler and fork(),
@@ -16,21 +17,14 @@ extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
-extern unsigned long nr_running(void);
+extern unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait(void);
+extern unsigned int nr_iowait_cpu(int cpu);
static inline int sched_info_on(void)
{
-#ifdef CONFIG_SCHEDSTATS
- return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
- extern int delayacct_on;
- return delayacct_on;
-#else
- return 0;
-#endif
+ return IS_ENABLED(CONFIG_SCHED_INFO);
}
#ifdef CONFIG_SCHEDSTATS
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..5a64582b086b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -4,98 +4,29 @@
#include <linux/types.h>
-struct ctl_table;
-
#ifdef CONFIG_DETECT_HUNG_TASK
-
-#ifdef CONFIG_SMP
-extern unsigned int sysctl_hung_task_all_cpu_backtrace;
-#else
-#define sysctl_hung_task_all_cpu_backtrace 0
-#endif /* CONFIG_SMP */
-
-extern int sysctl_hung_task_check_count;
-extern unsigned int sysctl_hung_task_panic;
+/* used for hung_task and block/ */
extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_check_interval_secs;
-extern int sysctl_hung_task_warnings;
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
#else
/* Avoid need for ifdefs elsewhere in the code */
enum { sysctl_hung_task_timeout_secs = 0 };
#endif
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
-
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
SCHED_TUNABLESCALING_LOG,
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
};
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
-
-extern unsigned int sysctl_numa_balancing_scan_delay;
-extern unsigned int sysctl_numa_balancing_scan_period_min;
-extern unsigned int sysctl_numa_balancing_scan_period_max;
-extern unsigned int sysctl_numa_balancing_scan_size;
-
-#ifdef CONFIG_SCHED_DEBUG
-extern __read_mostly unsigned int sysctl_sched_migration_cost;
-extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos);
-#endif
-
-/*
- * control realtime throttling:
- *
- * /proc/sys/kernel/sched_rt_period_us
- * /proc/sys/kernel/sched_rt_runtime_us
- */
-extern unsigned int sysctl_sched_rt_period;
-extern int sysctl_sched_rt_runtime;
-extern unsigned int sysctl_sched_dl_period_max;
-extern unsigned int sysctl_sched_dl_period_min;
+#define NUMA_BALANCING_DISABLED 0x0
+#define NUMA_BALANCING_NORMAL 0x1
+#define NUMA_BALANCING_MEMORY_TIERING 0x2
-#ifdef CONFIG_UCLAMP_TASK
-extern unsigned int sysctl_sched_uclamp_util_min;
-extern unsigned int sysctl_sched_uclamp_util_max;
-extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
-#endif
-
-#ifdef CONFIG_CFS_BANDWIDTH
-extern unsigned int sysctl_sched_cfs_bandwidth_slice;
-#endif
-
-#ifdef CONFIG_SCHED_AUTOGROUP
-extern unsigned int sysctl_sched_autogroup_enabled;
-#endif
-
-extern int sysctl_sched_rr_timeslice;
-extern int sched_rr_timeslice;
-
-int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos);
-int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos);
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos);
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos);
-
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-extern unsigned int sysctl_sched_energy_aware;
-int sched_energy_aware_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_NUMA_BALANCING
+extern int sysctl_numa_balancing_mode;
+#else
+#define sysctl_numa_balancing_mode 0
#endif
#endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 85fb2f34c59b..ca1db4b92c32 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,8 @@
* functionality:
*/
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
@@ -23,7 +25,12 @@ struct kernel_clone_args {
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
+ const char *name;
int exit_signal;
+ u32 kthread:1;
+ u32 io_thread:1;
+ u32 user_worker:1;
+ u32 no_files:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
@@ -31,8 +38,12 @@ struct kernel_clone_args {
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
+ int idle;
+ int (*fn)(void *);
+ void *fn_arg;
struct cgroup *cgrp;
struct css_set *cset;
+ unsigned int kill_seq;
};
/*
@@ -47,27 +58,28 @@ extern spinlock_t mmlist_lock;
extern union thread_union init_thread_union;
extern struct task_struct init_task;
-#ifdef CONFIG_PROVE_RCU
extern int lockdep_tasklist_lock_is_held(void);
-#endif /* #ifdef CONFIG_PROVE_RCU */
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
+void __noreturn make_task_dead(int signr);
+extern void mm_cache_init(void);
extern void proc_caches_init(void);
extern void fork_init(void);
extern void release_task(struct task_struct * p);
-extern int copy_thread(unsigned long, unsigned long, unsigned long,
- struct task_struct *, unsigned long);
+extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);
extern void flush_thread(void);
@@ -78,15 +90,19 @@ static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
-extern void do_group_exit(int);
+extern __noreturn void do_group_exit(int);
extern void exit_files(struct task_struct *);
-extern void exit_itimers(struct signal_struct *);
+extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *copy_process(struct pid *pid, int trace, int node,
+ struct kernel_clone_args *args);
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
-struct mm_struct *copy_init_mm(void);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags);
+extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);
@@ -105,14 +121,57 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
return t;
}
+static inline struct task_struct *tryget_task_struct(struct task_struct *t)
+{
+ return refcount_inc_not_zero(&t->usage) ? t : NULL;
+}
+
extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
static inline void put_task_struct(struct task_struct *t)
{
- if (refcount_dec_and_test(&t->usage))
+ if (!refcount_dec_and_test(&t->usage))
+ return;
+
+ /*
+ * In !RT, it is always safe to call __put_task_struct().
+ * Under RT, we can only call it in preemptible context.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
+
+ lock_map_acquire_try(&put_task_map);
__put_task_struct(t);
+ lock_map_release(&put_task_map);
+ return;
+ }
+
+ /*
+ * under PREEMPT_RT, we can't call put_task_struct
+ * in atomic context because it will indirectly
+ * acquire sleeping locks.
+ *
+ * call_rcu() will schedule delayed_put_task_struct_rcu()
+ * to be called in process context.
+ *
+ * __put_task_struct() is called when
+ * refcount_dec_and_test(&t->usage) succeeds.
+ *
+ * This means that it can't "conflict" with
+ * put_task_struct_rcu_user() which abuses ->rcu the same
+ * way; rcu_users has a reference so task->usage can't be
+ * zero after rcu_users 1 -> 0 transition.
+ *
+ * delayed_free_task() also uses ->rcu, but it is only called
+ * when it fails to fork a process. Therefore, there is no
+ * way it can conflict with put_task_struct().
+ */
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}
+DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
+
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
if (refcount_sub_and_test(nr, &t->usage))
@@ -121,6 +180,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr)
void put_task_struct_rcu_user(struct task_struct *task);
+/* Free all architecture-specific resources held by a thread. */
+void release_thread(struct task_struct *dead_task);
+
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
@@ -157,7 +219,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
@@ -173,4 +235,6 @@ static inline void task_unlock(struct task_struct *p)
spin_unlock(&p->alloc_lock);
}
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
#endif /* _LINUX_SCHED_TASK_H */
diff --git a/include/linux/sched/task_flags.h b/include/linux/sched/task_flags.h
new file mode 100644
index 000000000000..227f5be81bcd
--- /dev/null
+++ b/include/linux/sched/task_flags.h
@@ -0,0 +1 @@
+#include <linux/sched.h>
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 2413427e439c..85c5a6392e02 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -8,6 +8,8 @@
#include <linux/sched.h>
#include <linux/magic.h>
+#include <linux/refcount.h>
+#include <linux/kasan.h>
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -16,19 +18,23 @@
* try_get_task_stack() instead. task_stack_page will return a pointer
* that could get freed out from under you.
*/
-static inline void *task_stack_page(const struct task_struct *task)
+static __always_inline void *task_stack_page(const struct task_struct *task)
{
return task->stack;
}
#define setup_thread_stack(new,old) do { } while(0)
-static inline unsigned long *end_of_stack(const struct task_struct *task)
+static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
{
+#ifdef CONFIG_STACK_GROWSUP
+ return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
+#else
return task->stack;
+#endif
}
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
+#else
#define task_stack_page(task) ((void *)(task)->stack)
@@ -75,6 +81,8 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
static inline void put_task_stack(struct task_struct *tsk) {}
#endif
+void exit_task_stack_account(struct task_struct *tsk);
+
#define task_stack_end_corrupted(task) \
(*(end_of_stack(task)) != STACK_END_MAGIC)
@@ -82,34 +90,22 @@ static inline int object_is_on_stack(const void *obj)
{
void *stack = task_stack_page(current);
+ obj = kasan_reset_tag(obj);
return (obj >= stack) && (obj < (stack + THREAD_SIZE));
}
extern void thread_stack_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
+unsigned long stack_not_used(struct task_struct *p);
+#else
static inline unsigned long stack_not_used(struct task_struct *p)
{
- unsigned long *n = end_of_stack(p);
-
- do { /* Skip over canary */
-# ifdef CONFIG_STACK_GROWSUP
- n--;
-# else
- n++;
-# endif
- } while (!*n);
-
-# ifdef CONFIG_STACK_GROWSUP
- return (unsigned long)end_of_stack(p) - (unsigned long)n;
-# else
- return (unsigned long)n - (unsigned long)end_of_stack(p);
-# endif
+ return 0;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);
-#ifndef __HAVE_ARCH_KSTACK_END
static inline int kstack_end(void *addr)
{
/* Reliable end of stack detection:
@@ -117,6 +113,5 @@ static inline int kstack_end(void *addr)
*/
return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
}
-#endif
#endif /* _LINUX_SCHED_TASK_STACK_H */
diff --git a/include/linux/sched/thread_info_api.h b/include/linux/sched/thread_info_api.h
new file mode 100644
index 000000000000..2c60fbc16c08
--- /dev/null
+++ b/include/linux/sched/thread_info_api.h
@@ -0,0 +1 @@
+#include <linux/thread_info.h>
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 9ef7bf686a9f..198bb5cc1774 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -25,27 +25,30 @@ enum {
};
#undef SD_FLAG
-#ifdef CONFIG_SCHED_DEBUG
-
struct sd_flag_debug {
unsigned int meta_flags;
char *name;
};
extern const struct sd_flag_debug sd_flag_debug[];
-#endif
-
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
- return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
}
#endif
#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
- return SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_LLC;
}
#endif
@@ -74,6 +77,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+ int nr_idle_scan;
};
struct sched_domain {
@@ -86,6 +90,7 @@ struct sched_domain {
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
+ unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
@@ -98,16 +103,17 @@ struct sched_domain {
/* idle_balance() stats */
u64 max_newidle_lb_cost;
- unsigned long next_decay_max_lb_cost;
-
- u64 avg_scan_cost; /* select_idle_sibling */
+ unsigned long last_decay_max_lb_cost;
#ifdef CONFIG_SCHEDSTATS
- /* load_balance() stats */
+ /* sched_balance_rq() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
- unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
@@ -133,9 +139,7 @@ struct sched_domain {
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
-#ifdef CONFIG_SCHED_DEBUG
char *name;
-#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
@@ -158,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
return to_cpumask(sd->span);
}
-extern void partition_sched_domains_locked(int ndoms_new,
- cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new);
-
extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new);
@@ -169,7 +169,9 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+bool cpus_equal_capacity(int this_cpu, int that_cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
@@ -189,33 +191,28 @@ struct sched_domain_topology_level {
int flags;
int numa_level;
struct sd_data data;
-#ifdef CONFIG_SCHED_DEBUG
char *name;
-#endif
};
-extern void set_sched_topology(struct sched_domain_topology_level *tl);
+extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
+extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
+
-#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(type) .name = #type
-#else
-# define SD_INIT_NAME(type)
-#endif
#else /* CONFIG_SMP */
struct sched_domain_attr;
static inline void
-partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
{
}
-static inline void
-partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+static inline bool cpus_equal_capacity(int this_cpu, int that_cpu)
{
+ return true;
}
static inline bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -223,8 +220,25 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
return true;
}
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ return true;
+}
+
+static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+}
+
#endif /* !CONFIG_SMP */
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern void rebuild_sched_domains_energy(void);
+#else
+static inline void rebuild_sched_domains_energy(void)
+{
+}
+#endif
+
#ifndef arch_scale_cpu_capacity
/**
* arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
@@ -243,21 +257,29 @@ unsigned long arch_scale_cpu_capacity(int cpu)
}
#endif
-#ifndef arch_scale_thermal_pressure
+#ifndef arch_scale_hw_pressure
static __always_inline
-unsigned long arch_scale_thermal_pressure(int cpu)
+unsigned long arch_scale_hw_pressure(int cpu)
{
return 0;
}
#endif
-#ifndef arch_set_thermal_pressure
+#ifndef arch_update_hw_pressure
static __always_inline
-void arch_set_thermal_pressure(const struct cpumask *cpus,
- unsigned long th_pressure)
+void arch_update_hw_pressure(const struct cpumask *cpus,
+ unsigned long capped_frequency)
{ }
#endif
+#ifndef arch_scale_freq_ref
+static __always_inline
+unsigned int arch_scale_freq_ref(int cpu)
+{
+ return 0;
+}
+#endif
+
static inline int task_node(const struct task_struct *p)
{
return cpu_to_node(task_cpu(p));
diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
unsigned long long sum_exec_runtime;
};
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index a8ec3b6093fc..4cc52698e214 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -4,6 +4,7 @@
#include <linux/uidgid.h>
#include <linux/atomic.h>
+#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>
@@ -12,19 +13,9 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t processes; /* How many processes does this user have? */
- atomic_t sigpending; /* How many pending signals does this user have? */
-#ifdef CONFIG_FANOTIFY
- atomic_t fanotify_listeners;
-#endif
#ifdef CONFIG_EPOLL
- atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
-#endif
-#ifdef CONFIG_POSIX_MQUEUE
- /* protected by mq_lock */
- unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
+ struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
- unsigned long locked_shm; /* How many pages of mlocked shm ? */
unsigned long unix_inflight; /* How many files in flight in unix sockets */
atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */
@@ -33,7 +24,8 @@ struct user_struct {
kuid_t uid;
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
- defined(CONFIG_NET) || defined(CONFIG_IO_URING)
+ defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
+ defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
new file mode 100644
index 000000000000..25446c5d3508
--- /dev/null
+++ b/include/linux/sched/vhost_task.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
+
+struct vhost_task;
+
+struct vhost_task *vhost_task_create(bool (*fn)(void *),
+ void (*handle_kill)(void *), void *arg,
+ const char *name);
+void vhost_task_start(struct vhost_task *vtsk);
+void vhost_task_stop(struct vhost_task *vtsk);
+void vhost_task_wake(struct vhost_task *vtsk);
+
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 26a2013ac39c..0f28b4623ad4 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -42,8 +42,11 @@ struct wake_q_head {
#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
-#define DEFINE_WAKE_Q(name) \
- struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+#define WAKE_Q_HEAD_INITIALIZER(name) \
+ { WAKE_Q_TAIL, &name.first }
+
+#define DEFINE_WAKE_Q(name) \
+ struct wake_q_head name = WAKE_Q_HEAD_INITIALIZER(name)
static inline void wake_q_init(struct wake_q_head *head)
{
@@ -60,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);
+/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */
+static inline
+void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irq(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags,
+ struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irqrestore(lock, flags);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
#endif /* _LINUX_SCHED_WAKE_Q_H */