diff options
Diffstat (limited to 'include/linux/sched')
33 files changed, 1048 insertions, 459 deletions
diff --git a/include/linux/sched/affinity.h b/include/linux/sched/affinity.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/affinity.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e0..196f0ca351a2 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -12,7 +12,16 @@ * * Please use one of the three interfaces below. */ -extern unsigned long long notrace sched_clock(void); +extern u64 sched_clock(void); + +#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK) +extern u64 sched_clock_noinstr(void); +#else +static __always_inline u64 sched_clock_noinstr(void) +{ + return sched_clock(); +} +#endif /* * See the comment in kernel/sched/clock.c @@ -45,7 +54,12 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } -static inline u64 local_clock(void) +static __always_inline u64 local_clock_noinstr(void) +{ + return sched_clock_noinstr(); +} + +static __always_inline u64 local_clock(void) { return sched_clock(); } @@ -79,10 +93,9 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } -static inline u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} +extern u64 local_clock_noinstr(void); +extern u64 local_clock(void); + #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/include/linux/sched/cond_resched.h b/include/linux/sched/cond_resched.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/cond_resched.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index dfd82eab2902..6eb65ceed213 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,12 +8,6 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -/* mm flags */ - -/* for SUID_DUMP_* above */ -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) - extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -31,51 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 -#define MMF_DUMP_DAX_PRIVATE 9 -#define MMF_DUMP_DAX_SHARED 10 - -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 9 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) - -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ - -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ -#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) - -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) - #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..bdd31ab93bc5 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -26,7 +26,12 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy); static inline unsigned long map_util_freq(unsigned long util, unsigned long freq, unsigned long cap) { - return (freq + (freq >> 2)) * util / cap; + return freq * util / cap; +} + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); } #endif /* CONFIG_CPU_FREQ */ diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..5f8fd5b24a2e 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -8,25 +8,17 @@ * cputime accounting APIs: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -#include <asm/cputime.h> - -#ifndef cputime_to_nsecs -# define cputime_to_nsecs(__ct) \ - (cputime_to_usecs(__ct) * NSEC_PER_USEC) -#endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1aff00b65f3c..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEADLINE_H +#define _LINUX_SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -6,16 +8,18 @@ * NORMAL/BATCH tasks. */ -#define MAX_DL_PRIO 0 +#include <linux/sched.h> -static inline int dl_prio(int prio) +static inline bool dl_prio(int prio) { - if (unlikely(prio < MAX_DL_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_DL_PRIO); } -static inline int dl_task(struct task_struct *p) +/* + * Returns true if a task has a priority that belongs to DL class. PI-boosted + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. + */ +static inline bool dl_task(struct task_struct *p) { return dl_prio(p->prio); } @@ -30,5 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ + +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + +#endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 00c45a0e6abe..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -14,7 +14,7 @@ extern void dump_cpu_task(int cpu); /* * Only dump TASK_* tasks. (0 for all tasks) */ -extern void show_state_filter(unsigned long state_filter); +extern void show_state_filter(unsigned int state_filter); static inline void show_state(void) { @@ -35,15 +35,13 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. */ -#define __sched __attribute__((__section__(".sched.text"))) +#define __sched __section(".sched.text") /* Linker adds these: start and end of __sched functions */ extern char __sched_text_start[], __sched_text_end[]; diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000000..f7545430a548 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include <linux/llist.h> +#include <linux/rhashtable-types.h> + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128, + + SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +/* + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ + u32 nr; + u32 seq; /* used by BPF iter */ + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 4, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +enum scx_dsq_lnode_flags { + SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, + + /* high 16 bits can be for iter cursor flags */ + __SCX_DSQ_LNODE_PRIV_SHIFT = 16, +}; + +struct scx_dsq_list_node { + struct list_head node; + u32 flags; + u32 priv; /* can be used by iter cursor */ +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct scx_dsq_list_node dsq_list; /* dispatch order */ + struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; + u32 dsq_flags; /* protected by DSQ lock */ + u32 flags; /* protected by rq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + s32 selected_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * Can be set from ops.init_task() while the BPF scheduler is being + * loaded (!scx_init_task_args->fork). If set and the task's policy is + * already %SCHED_EXT, the task's policy is rejected and forcefully + * reverted to %SCHED_NORMAL. The number of such events are reported + * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag + * during fork is not allowed. + */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif + struct list_head tasks_node; +}; + +void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 9a62ffdd296f..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -11,15 +11,13 @@ extern int sched_cpu_activate(unsigned int cpu); extern int sched_cpu_deactivate(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); extern int sched_cpu_dying(unsigned int cpu); #else +# define sched_cpu_wait_empty NULL # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 22873d276be6..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -5,13 +5,17 @@ #include <linux/sched.h> enum cpu_idle_type { + __CPU_NOT_IDLE = 0, CPU_IDLE, - CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES }; +#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); +#else +static inline void wake_up_if_idle(int cpu) { } +#endif /* * Idle thread specific functions to determine the need_resched @@ -19,12 +23,37 @@ extern void wake_up_if_idle(int cpu); */ #ifdef TIF_POLLING_NRFLAG -static inline void __current_set_polling(void) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H + +static __always_inline void __current_set_polling(void) { - set_thread_flag(TIF_POLLING_NRFLAG); + arch_set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); } -static inline bool __must_check current_set_polling_and_test(void) +static __always_inline void __current_clr_polling(void) +{ + arch_clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline void __current_set_polling(void) +{ + set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */ + +static __always_inline bool __must_check current_set_polling_and_test(void) { __current_set_polling(); @@ -37,12 +66,7 @@ static inline bool __must_check current_set_polling_and_test(void) return unlikely(tif_need_resched()); } -static inline void __current_clr_polling(void) -{ - clear_thread_flag(TIF_POLLING_NRFLAG); -} - -static inline bool __must_check current_clr_polling_and_test(void) +static __always_inline bool __must_check current_clr_polling_and_test(void) { __current_clr_polling(); @@ -55,6 +79,21 @@ static inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -67,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif -static inline void current_clr_polling(void) +static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -2,59 +2,79 @@ #define _LINUX_SCHED_ISOLATION_H #include <linux/cpumask.h> +#include <linux/cpuset.h> #include <linux/init.h> #include <linux/tick.h> -enum hk_flags { - HK_FLAG_TIMER = 1, - HK_FLAG_RCU = (1 << 1), - HK_FLAG_MISC = (1 << 2), - HK_FLAG_SCHED = (1 << 3), - HK_FLAG_TICK = (1 << 4), - HK_FLAG_DOMAIN = (1 << 5), - HK_FLAG_WQ = (1 << 6), - HK_FLAG_MANAGED_IRQ = (1 << 7), - HK_FLAG_KTHREAD = (1 << 8), +enum hk_type { + HK_TYPE_DOMAIN, + HK_TYPE_MANAGED_IRQ, + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); -extern int housekeeping_any_cpu(enum hk_flags flags); -extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); -extern bool housekeeping_enabled(enum hk_flags flags); -extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); -extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); +extern int housekeeping_any_cpu(enum hk_type type); +extern const struct cpumask *housekeeping_cpumask(enum hk_type type); +extern bool housekeeping_enabled(enum hk_type type); +extern void housekeeping_affine(struct task_struct *t, enum hk_type type); +extern bool housekeeping_test_cpu(int cpu, enum hk_type type); extern void __init housekeeping_init(void); #else -static inline int housekeeping_any_cpu(enum hk_flags flags) +static inline int housekeeping_any_cpu(enum hk_type type) { return smp_processor_id(); } -static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +static inline const struct cpumask *housekeeping_cpumask(enum hk_type type) { return cpu_possible_mask; } -static inline bool housekeeping_enabled(enum hk_flags flags) +static inline bool housekeeping_enabled(enum hk_type type) { return false; } static inline void housekeeping_affine(struct task_struct *t, - enum hk_flags flags) { } + enum hk_type type) { } + +static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) +{ + return true; +} + static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ -static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) +static inline bool housekeeping_cpu(int cpu, enum hk_type type) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overridden)) - return housekeeping_test_cpu(cpu, flags); + return housekeeping_test_cpu(cpu, type); #endif return true; } +static inline bool cpu_is_isolated(int cpu) +{ + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); +} + #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d2b4204ba4d3..68876d0a7ef9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,7 +19,10 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ -#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */ +#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ + +#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */ +#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -29,10 +32,13 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) -#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT) +#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) + +#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT) +#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> +#include <linux/sched/coredump.h> /* * Routines for handling mm_structs @@ -28,7 +29,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). * - * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -36,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } +static inline void smp_mb__after_mmgrab(void) +{ + smp_mb__after_atomic(); +} + extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) @@ -49,6 +55,63 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +static inline void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + +/* Helpers for lazy TLB mm refcounting */ +static inline void mmgrab_lazy_tlb(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmgrab(mm); +} + +static inline void mmdrop_lazy_tlb(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { + mmdrop(mm); + } else { + /* + * mmdrop_lazy_tlb must provide a full memory barrier, see the + * membarrier comment finish_task_switch which relies on this. + */ + smp_mb(); + } +} + +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmdrop_sched(mm); + else + smp_mb(); /* see mmdrop_lazy_tlb() above */ +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. @@ -62,7 +125,7 @@ static inline void mmdrop(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) @@ -106,15 +169,46 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); + +unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); + +unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, + struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags, + vm_flags_t vm_flags); + +unsigned long +generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); +unsigned long +generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} @@ -140,7 +234,8 @@ static inline bool in_vfork(struct task_struct *tsk) * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); - ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; + ret = tsk->vfork_done && + rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; @@ -150,12 +245,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -164,22 +260,85 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + +/** + * might_alloc - Mark possible allocation sites + * @gfp_mask: gfp_t flags that would be used to allocate + * + * Similar to might_sleep() and other annotations, this can be used in functions + * that might allocate, but often don't. Compiles to nothing without + * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. + */ +static inline void might_alloc(gfp_t gfp_mask) +{ + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); +} + +/** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). + */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * @@ -189,13 +348,12 @@ static inline void fs_reclaim_release(gfp_t gfp_mask) { } * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** @@ -208,7 +366,7 @@ static inline unsigned int memalloc_noio_save(void) */ static inline void memalloc_noio_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; + memalloc_flags_restore(flags); } /** @@ -220,13 +378,12 @@ static inline void memalloc_noio_restore(unsigned int flags) * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; - current->flags |= PF_MEMALLOC_NOFS; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** @@ -239,44 +396,77 @@ static inline unsigned int memalloc_nofs_save(void) */ static inline void memalloc_nofs_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; + memalloc_flags_restore(flags); } +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. + */ static inline unsigned int memalloc_noreclaim_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC; - current->flags |= PF_MEMALLOC; - return flags; + return memalloc_flags_save(PF_MEMALLOC); } +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. + */ static inline void memalloc_noreclaim_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC) | flags; + memalloc_flags_restore(flags); } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) -{ - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; - - current->flags |= PF_MEMALLOC_NOCMA; - return flags; -} - -static inline void memalloc_nocma_restore(unsigned int flags) -{ - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; -} -#else -static inline unsigned int memalloc_nocma_save(void) +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. + */ +static inline unsigned int memalloc_pin_save(void) { - return 0; + return memalloc_flags_save(PF_MEMALLOC_PIN); } -static inline void memalloc_nocma_restore(unsigned int flags) +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. + */ +static inline void memalloc_pin_restore(unsigned int flags) { + memalloc_flags_restore(flags); } -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -288,6 +478,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * + * Please, make sure that caller has a reference to the passed memcg structure, + * so its lifetime is guaranteed to exceed the scope between two + * set_active_memcg() calls. + * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ @@ -296,7 +490,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { @@ -337,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & @@ -347,6 +548,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) extern void membarrier_exec_mmap(struct mm_struct *mm); +extern void membarrier_update_current_mm(struct mm_struct *next_mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -361,6 +564,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm) static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } +static inline void membarrier_update_current_mm(struct mm_struct *next_mm) +{ +} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..52b22c5c396d 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,13 +15,23 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +enum numa_vmaskip_reason { + NUMAB_SKIP_UNSUITABLE, + NUMAB_SKIP_SHARED_RO, + NUMAB_SKIP_INACCESSIBLE, + NUMAB_SKIP_SCAN_DELAY, + NUMAB_SKIP_PID_INACTIVE, + NUMAB_SKIP_IGNORE_PID, + NUMAB_SKIP_SEQ_COMPLETED, +}; + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p, bool final); -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, - int src_nid, int dst_cpu); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -38,7 +48,7 @@ static inline void task_numa_free(struct task_struct *p, bool final) { } static inline bool should_numa_migrate_memory(struct task_struct *p, - struct page *page, int src_nid, int dst_cpu) + struct folio *folio, int src_nid, int dst_cpu) { return true; } diff --git a/include/linux/sched/posix-timers.h b/include/linux/sched/posix-timers.h new file mode 100644 index 000000000000..523a381d6c88 --- /dev/null +++ b/include/linux/sched/posix-timers.h @@ -0,0 +1 @@ +#include <linux/posix-timers.h> diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 7d64feafc408..6ab43b4f72f9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -11,16 +11,10 @@ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. */ -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#define MAX_RT_PRIO 100 +#define MAX_DL_PRIO 0 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) @@ -34,15 +28,6 @@ #define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) /* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* * Convert nice value [19,-20] to rlimit style value [1,40]. */ static inline long nice_to_rlimit(long nice) diff --git a/include/linux/sched/rseq_api.h b/include/linux/sched/rseq_api.h new file mode 100644 index 000000000000..cf2af72693e1 --- /dev/null +++ b/include/linux/sched/rseq_api.h @@ -0,0 +1 @@ +#include <linux/rseq.h> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index e5af028c08b4..4e3338103654 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -6,19 +6,40 @@ struct task_struct; -static inline int rt_prio(int prio) +static inline bool rt_prio(int prio) { - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); } -static inline int rt_task(struct task_struct *p) +static inline bool rt_or_dl_prio(int prio) +{ + return unlikely(prio < MAX_RT_PRIO); +} + +/* + * Returns true if a task has a priority that belongs to RT class. PI-boosted + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. + */ +static inline bool rt_task(struct task_struct *p) { return rt_prio(p->prio); } -static inline bool task_is_realtime(struct task_struct *tsk) +/* + * Returns true if a task has a priority that belongs to RT or DL classes. + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore + * PI-boosted tasks. + */ +static inline bool rt_or_dl_task(struct task_struct *p) +{ + return rt_or_dl_prio(p->prio); +} + +/* + * Returns true if a task has a policy that belongs to RT or DL classes. + * PI-boosted tasks will return false. + */ +static inline bool rt_or_dl_task_policy(struct task_struct *tsk) { int policy = tsk->policy; @@ -30,6 +51,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) } #ifdef CONFIG_RT_MUTEXES +extern void rt_mutex_pre_schedule(void); +extern void rt_mutex_schedule(void); +extern void rt_mutex_post_schedule(void); + /* * Must hold either p->pi_lock or task_rq(p)->lock. */ @@ -39,20 +64,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern void normalize_rt_tasks(void); diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 34b21e971d77..b04a5d04dee9 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -91,6 +91,16 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD) SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) /* + * Domain members have different CPU capacities spanning all unique CPU + * capacity values. + * + * SHARED_PARENT: Set from the topmost domain down to the first domain where + * all available CPU capacities are visible + * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups. + */ +SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + +/* * Domain members share CPU capacity (i.e. SMT) * * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share @@ -100,13 +110,20 @@ SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) /* - * Domain members share CPU package resources (i.e. caches) + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + +/* + * Domain members share CPU Last Level Caches * * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share * the same cache(s). * NEEDS_GROUPS: Caches are shared between groups. */ -SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) /* * Only a single load balancing instance @@ -122,12 +139,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) /* * Place busy tasks earlier in the domain * - * SHARED_CHILD: Usually set on the SMT level. Technically could be set further - * up, but currently assumed to be set from the base domain - * upwards (see update_top_cache_domain()). * NEEDS_GROUPS: Load balancing flag. */ -SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) /* * Prefer to place tasks in a sibling domain diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..1ef1edbaaf79 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -9,6 +9,7 @@ #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> +#include <linux/pid.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> @@ -72,6 +73,17 @@ struct multiprocess_signals { struct hlist_node node; }; +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + +struct core_state { + atomic_t nr_threads; + struct core_thread dumper; + struct completion startup; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always @@ -83,6 +95,7 @@ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; + int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -98,18 +111,16 @@ struct signal_struct { /* thread group exit support */ int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. - */ + /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; - struct task_struct *group_exit_task; + struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + struct core_state *core_state; /* coredumping support */ + /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes @@ -125,8 +136,10 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - int posix_timer_id; - struct list_head posix_timers; + unsigned int timer_create_restore_ids:1; + atomic_t next_posix_timer_id; + struct hlist_head posix_timers; + struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; @@ -228,12 +241,13 @@ struct signal_struct { * credential calculations * (notably. ptrace) * Deprecated do not use in new code. - * Use exec_update_mutex instead. - */ - struct mutex exec_update_mutex; /* Held while task_struct is being - * updated during exec, and may have - * inconsistent permissions. + * Use exec_update_lock instead. */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. + */ } __randomize_layout; /* @@ -242,7 +256,6 @@ struct signal_struct { #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent. */ @@ -258,31 +271,24 @@ struct signal_struct { static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { - WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -/* If true, all threads except ->group_exit_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); -} - extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *task, - sigset_t *mask, kernel_siginfo_t *info); +extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; + enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); - ret = dequeue_signal(task, &task->blocked, &__info); + ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; @@ -291,42 +297,32 @@ static inline int kernel_dequeue_signal(void) static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) + if (current->jobctl & JOBCTL_STOP_DEQUEUED) { + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); + } spin_unlock_irq(¤t->sighand->siglock); schedule(); } -#ifdef __ARCH_SI_TRAPNO -# define ___ARCH_SI_TRAPNO(_a1) , _a1 -#else -# define ___ARCH_SI_TRAPNO(_a1) -#endif -#ifdef __ia64__ -# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 -#else -# define ___ARCH_SI_IA64(_a1, _a2, _a3) -#endif -int force_sig_fault_to_task(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) - , struct task_struct *t); -int force_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)); -int send_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) - , struct task_struct *t); +int force_sig_fault_to_task(int sig, int code, void __user *addr, + struct task_struct *t); +int force_sig_fault(int sig, int code, void __user *addr); +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); +int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); +int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); +int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, + struct task_struct *t); +int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); @@ -340,24 +336,61 @@ extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); +extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); +static inline void clear_notify_signal(void) +{ + clear_thread_flag(TIF_NOTIFY_SIGNAL); + smp_mb__after_atomic(); +} + +/* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + +/* + * Called to break out of interruptible wait loops, and enter the + * exit_to_user_mode_loop(). + */ +static inline void set_notify_signal(struct task_struct *task) +{ + if (__set_notify_signal(task)) + kick_process(task); +} + static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } -static inline int signal_pending(struct task_struct *p) +static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } +static inline int signal_pending(struct task_struct *p) +{ + /* + * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same + * behavior in terms of ensuring that we break out of wait loops + * so that notify signal callbacks can be processed. + */ + if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) + return 1; + return task_sigpending(p); +} + static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); @@ -365,10 +398,10 @@ static inline int __fatal_signal_pending(struct task_struct *p) static inline int fatal_signal_pending(struct task_struct *p) { - return signal_pending(p) && __fatal_signal_pending(p); + return task_sigpending(p) && __fatal_signal_pending(p); } -static inline int signal_pending_state(long state, struct task_struct *p) +static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; @@ -398,19 +431,28 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); -static inline void signal_wake_up(struct task_struct *t, bool resume) +static inline void signal_wake_up(struct task_struct *t, bool fatal) { - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); + unsigned int state = 0; + if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { + t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); + state = TASK_WAKEKILL | __TASK_TRACED; + } + signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { - signal_wake_up_state(t, resume ? __TASK_TRACED : 0); + unsigned int state = 0; + if (resume) { + t->jobctl &= ~JOBCTL_TRACED; + state = __TASK_TRACED; + } + signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); @@ -502,7 +544,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); + WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } @@ -524,6 +566,17 @@ static inline int kill_cad_pid(int sig, int priv) #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) +static inline int __on_sig_stack(unsigned long sp) +{ +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + /* * True if we are on the alternate signal stack. */ @@ -541,13 +594,7 @@ static inline int on_sig_stack(unsigned long sp) if (current->sas_ss_flags & SS_AUTODISARM) return 0; -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif + return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) @@ -591,17 +638,18 @@ extern void flush_itimer_signals(void); extern bool current_is_single_threaded(void); /* - * Careful: do_each_thread/while_each_thread is a double loop so - * 'break' will not work as expected - use goto instead. + * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, + * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do - #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ + lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) @@ -660,22 +708,31 @@ bool same_thread_group(struct task_struct *p1, struct task_struct *p2) return p1->signal == p2->signal; } -static inline struct task_struct *next_thread(const struct task_struct *p) +/* + * returns NULL if p is the last thread in the thread group + */ +static inline struct task_struct *__next_thread(struct task_struct *p) +{ + return list_next_or_null_rcu(&p->signal->thread_head, + &p->thread_node, + struct task_struct, + thread_node); +} + +static inline struct task_struct *next_thread(struct task_struct *p) { - return list_entry_rcu(p->thread_group.next, - struct task_struct, thread_group); + return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { - return list_empty(&p->thread_group); + return thread_group_leader(p) && + list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -extern bool thread_group_exited(struct pid *pid); - extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); @@ -695,6 +752,12 @@ static inline void unlock_task_sighand(struct task_struct *task, spin_unlock_irqrestore(&task->sighand->siglock, *flags); } +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_task_sighand_held(struct task_struct *task); +#else +static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } +#endif + static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 59d3736c454c..166b19af956f 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -12,9 +12,9 @@ static __always_inline bool sched_smt_active(void) return static_branch_likely(&sched_smt_present); } #else -static inline bool sched_smt_active(void) { return false; } +static __always_inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); -#endif +#endif /* _LINUX_SCHED_SMT_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..0108a38bb64d 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -3,6 +3,7 @@ #define _LINUX_SCHED_STAT_H #include <linux/percpu.h> +#include <linux/kconfig.h> /* * Various counters maintained by the scheduler and fork(), @@ -16,21 +17,14 @@ extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); -extern unsigned long nr_running(void); +extern unsigned int nr_running(void); extern bool single_task_running(void); -extern unsigned long nr_iowait(void); -extern unsigned long nr_iowait_cpu(int cpu); +extern unsigned int nr_iowait(void); +extern unsigned int nr_iowait_cpu(int cpu); static inline int sched_info_on(void) { -#ifdef CONFIG_SCHEDSTATS - return 1; -#elif defined(CONFIG_TASK_DELAY_ACCT) - extern int delayacct_on; - return delayacct_on; -#else - return 0; -#endif + return IS_ENABLED(CONFIG_SCHED_INFO); } #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca5..5a64582b086b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -4,98 +4,29 @@ #include <linux/types.h> -struct ctl_table; - #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; #endif -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; - -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_size; - -#ifdef CONFIG_SCHED_DEBUG -extern __read_mostly unsigned int sysctl_sched_migration_cost; -extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); -#endif - -/* - * control realtime throttling: - * - * /proc/sys/kernel/sched_rt_period_us - * /proc/sys/kernel/sched_rt_runtime_us - */ -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; -extern unsigned int sysctl_sched_dl_period_max; -extern unsigned int sysctl_sched_dl_period_min; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 -#ifdef CONFIG_UCLAMP_TASK -extern unsigned int sysctl_sched_uclamp_util_min; -extern unsigned int sysctl_sched_uclamp_util_max; -extern unsigned int sysctl_sched_uclamp_util_min_rt_default; -#endif - -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - -#ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; -#endif - -extern int sysctl_sched_rr_timeslice; -extern int sched_rr_timeslice; - -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -extern unsigned int sysctl_sched_energy_aware; -int sched_energy_aware_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; +#else +#define sysctl_numa_balancing_mode 0 #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 85fb2f34c59b..ca1db4b92c32 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -7,6 +7,8 @@ * functionality: */ +#include <linux/rcupdate.h> +#include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> @@ -23,7 +25,12 @@ struct kernel_clone_args { int __user *pidfd; int __user *child_tid; int __user *parent_tid; + const char *name; int exit_signal; + u32 kthread:1; + u32 io_thread:1; + u32 user_worker:1; + u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; @@ -31,8 +38,12 @@ struct kernel_clone_args { /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; + int idle; + int (*fn)(void *); + void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* @@ -47,27 +58,28 @@ extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; -#ifdef CONFIG_PROVE_RCU extern int lockdep_tasklist_lock_is_held(void); -#endif /* #ifdef CONFIG_PROVE_RCU */ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); +void __noreturn make_task_dead(int signr); +extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); -extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *, unsigned long); +extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); @@ -78,15 +90,19 @@ static inline void exit_thread(struct task_struct *tsk) { } #endif -extern void do_group_exit(int); +extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); -extern void exit_itimers(struct signal_struct *); +extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *copy_process(struct pid *pid, int trace, int node, + struct kernel_clone_args *args); +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); -struct mm_struct *copy_init_mm(void); -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags); +extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); @@ -105,14 +121,57 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) return t; } +static inline struct task_struct *tryget_task_struct(struct task_struct *t) +{ + return refcount_inc_not_zero(&t->usage) ? t : NULL; +} + extern void __put_task_struct(struct task_struct *t); +extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { - if (refcount_dec_and_test(&t->usage)) + if (!refcount_dec_and_test(&t->usage)) + return; + + /* + * In !RT, it is always safe to call __put_task_struct(). + * Under RT, we can only call it in preemptible context. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { + static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); + + lock_map_acquire_try(&put_task_map); __put_task_struct(t); + lock_map_release(&put_task_map); + return; + } + + /* + * under PREEMPT_RT, we can't call put_task_struct + * in atomic context because it will indirectly + * acquire sleeping locks. + * + * call_rcu() will schedule delayed_put_task_struct_rcu() + * to be called in process context. + * + * __put_task_struct() is called when + * refcount_dec_and_test(&t->usage) succeeds. + * + * This means that it can't "conflict" with + * put_task_struct_rcu_user() which abuses ->rcu the same + * way; rcu_users has a reference so task->usage can't be + * zero after rcu_users 1 -> 0 transition. + * + * delayed_free_task() also uses ->rcu, but it is only called + * when it fails to fork a process. Therefore, there is no + * way it can conflict with put_task_struct(). + */ + call_rcu(&t->rcu, __put_task_struct_rcu_cb); } +DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) + static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) @@ -121,6 +180,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else @@ -157,7 +219,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), @@ -173,4 +235,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/sched/task_flags.h b/include/linux/sched/task_flags.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/task_flags.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 2413427e439c..85c5a6392e02 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -8,6 +8,8 @@ #include <linux/sched.h> #include <linux/magic.h> +#include <linux/refcount.h> +#include <linux/kasan.h> #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -16,19 +18,23 @@ * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ -static inline void *task_stack_page(const struct task_struct *task) +static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } #define setup_thread_stack(new,old) do { } while(0) -static inline unsigned long *end_of_stack(const struct task_struct *task) +static __always_inline unsigned long *end_of_stack(const struct task_struct *task) { +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; +#else return task->stack; +#endif } -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else #define task_stack_page(task) ((void *)(task)->stack) @@ -75,6 +81,8 @@ static inline void *try_get_task_stack(struct task_struct *tsk) static inline void put_task_stack(struct task_struct *tsk) {} #endif +void exit_task_stack_account(struct task_struct *tsk); + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) @@ -82,34 +90,22 @@ static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); + obj = kasan_reset_tag(obj); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else static inline unsigned long stack_not_used(struct task_struct *p) { - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP - n--; -# else - n++; -# endif - } while (!*n); - -# ifdef CONFIG_STACK_GROWSUP - return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif + return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -#ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: @@ -117,6 +113,5 @@ static inline int kstack_end(void *addr) */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } -#endif #endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/include/linux/sched/thread_info_api.h b/include/linux/sched/thread_info_api.h new file mode 100644 index 000000000000..2c60fbc16c08 --- /dev/null +++ b/include/linux/sched/thread_info_api.h @@ -0,0 +1 @@ +#include <linux/thread_info.h> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f..198bb5cc1774 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,27 +25,30 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { - return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +static inline int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; } #endif #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } #endif @@ -74,6 +77,7 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + int nr_idle_scan; }; struct sched_domain { @@ -86,6 +90,7 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ @@ -98,16 +103,17 @@ struct sched_domain { /* idle_balance() stats */ u64 max_newidle_lb_cost; - unsigned long next_decay_max_lb_cost; - - u64 avg_scan_cost; /* select_idle_sibling */ + unsigned long last_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ + /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; @@ -133,9 +139,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -158,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains_locked(int ndoms_new, - cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -169,7 +169,9 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); +bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -189,33 +191,28 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; -extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ struct sched_domain_attr; static inline void -partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) { } -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) { + return true; } static inline bool cpus_share_cache(int this_cpu, int that_cpu) @@ -223,8 +220,25 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_resources(int this_cpu, int that_cpu) +{ + return true; +} + +static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ +} + #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. @@ -243,21 +257,29 @@ unsigned long arch_scale_cpu_capacity(int cpu) } #endif -#ifndef arch_scale_thermal_pressure +#ifndef arch_scale_hw_pressure static __always_inline -unsigned long arch_scale_thermal_pressure(int cpu) +unsigned long arch_scale_hw_pressure(int cpu) { return 0; } #endif -#ifndef arch_set_thermal_pressure +#ifndef arch_update_hw_pressure static __always_inline -void arch_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) +void arch_update_hw_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h index 3c3e049224ae..969aaf5ef9d6 100644 --- a/include/linux/sched/types.h +++ b/include/linux/sched/types.h @@ -20,4 +20,4 @@ struct task_cputime { unsigned long long sum_exec_runtime; }; -#endif +#endif /* _LINUX_SCHED_TYPES_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index a8ec3b6093fc..4cc52698e214 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include <linux/uidgid.h> #include <linux/atomic.h> +#include <linux/percpu_counter.h> #include <linux/refcount.h> #include <linux/ratelimit.h> @@ -12,19 +13,9 @@ */ struct user_struct { refcount_t __count; /* reference count */ - atomic_t processes; /* How many processes does this user have? */ - atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_FANOTIFY - atomic_t fanotify_listeners; -#endif #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ -#endif -#ifdef CONFIG_POSIX_MQUEUE - /* protected by mq_lock */ - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif - unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ @@ -33,7 +24,8 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) || defined(CONFIG_IO_URING) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ + defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h new file mode 100644 index 000000000000..25446c5d3508 --- /dev/null +++ b/include/linux/sched/vhost_task.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_VHOST_TASK_H +#define _LINUX_SCHED_VHOST_TASK_H + +struct vhost_task; + +struct vhost_task *vhost_task_create(bool (*fn)(void *), + void (*handle_kill)(void *), void *arg, + const char *name); +void vhost_task_start(struct vhost_task *vtsk); +void vhost_task_stop(struct vhost_task *vtsk); +void vhost_task_wake(struct vhost_task *vtsk); + +#endif /* _LINUX_SCHED_VHOST_TASK_H */ diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 26a2013ac39c..0f28b4623ad4 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -42,8 +42,11 @@ struct wake_q_head { #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) -#define DEFINE_WAKE_Q(name) \ - struct wake_q_head name = { WAKE_Q_TAIL, &name.first } +#define WAKE_Q_HEAD_INITIALIZER(name) \ + { WAKE_Q_TAIL, &name.first } + +#define DEFINE_WAKE_Q(name) \ + struct wake_q_head name = WAKE_Q_HEAD_INITIALIZER(name) static inline void wake_q_init(struct wake_q_head *head) { @@ -60,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); +/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */ +static inline +void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irq(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags, + struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irqrestore(lock, flags); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} #endif /* _LINUX_SCHED_WAKE_Q_H */ |