author     Catalin Marinas <catalin.marinas@arm.com>  2021-08-31 09:10:00 +0100
committer  Catalin Marinas <catalin.marinas@arm.com>  2021-08-31 09:10:00 +0100
commit     65266a7c6abfa1ad915a362c41bf38576607f1f9 (patch)
tree       046d86fc88c50fcdeaddb9ab4c709ae0878b1e3f /kernel
parent     Merge branch 'for-next/entry' into for-next/core (diff)
parent     Documentation: arm64: describe asymmetric 32-bit support (diff)
Merge remote-tracking branch 'tip/sched/arm64' into for-next/core
* tip/sched/arm64: (785 commits)
  Documentation: arm64: describe asymmetric 32-bit support
  arm64: Remove logic to kill 32-bit tasks on 64-bit-only cores
  arm64: Hook up cmdline parameter to allow mismatched 32-bit EL0
  arm64: Advertise CPUs capable of running 32-bit applications in sysfs
  arm64: Prevent offlining first CPU with 32-bit EL0 on mismatched system
  arm64: exec: Adjust affinity for compat tasks with mismatched 32-bit EL0
  arm64: Implement task_cpu_possible_mask()
  sched: Introduce dl_task_check_affinity() to check proposed affinity
  sched: Allow task CPU affinity to be restricted on asymmetric systems
  sched: Split the guts of sched_setaffinity() into a helper function
  sched: Introduce task_struct::user_cpus_ptr to track requested affinity
  sched: Reject CPU affinity changes based on task_cpu_possible_mask()
  cpuset: Cleanup cpuset_cpus_allowed_fallback() use in select_fallback_rq()
  cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus()
  cpuset: Don't use the cpu_possible_mask as a last resort for cgroup v1
  sched: Introduce task_cpu_possible_mask() to limit fallback rq selection
  sched: Cgroup SCHED_IDLE support
  sched/topology: Skip updating masks for non-online nodes
  Linux 5.14-rc6
  lib: use PFN_PHYS() in devmem_is_allowed()
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c                 |  26
-rw-r--r--  kernel/bpf/disasm.c               |  16
-rw-r--r--  kernel/bpf/hashtab.c              |   4
-rw-r--r--  kernel/bpf/helpers.c              |   8
-rw-r--r--  kernel/bpf/verifier.c             | 148
-rw-r--r--  kernel/cgroup/cgroup-v1.c         |   4
-rw-r--r--  kernel/cgroup/cpuset.c            |  59
-rw-r--r--  kernel/cgroup/rstat.c             |  19
-rw-r--r--  kernel/events/core.c              |  35
-rw-r--r--  kernel/fork.c                     |   2
-rw-r--r--  kernel/irq/chip.c                 |   5
-rw-r--r--  kernel/irq/msi.c                  |  13
-rw-r--r--  kernel/irq/timings.c              |   5
-rw-r--r--  kernel/locking/rtmutex.c          |   2
-rw-r--r--  kernel/sched/core.c               | 530
-rw-r--r--  kernel/sched/deadline.c           |   8
-rw-r--r--  kernel/sched/debug.c              |  10
-rw-r--r--  kernel/sched/fair.c               | 211
-rw-r--r--  kernel/sched/sched.h              |  26
-rw-r--r--  kernel/sched/topology.c           |  65
-rw-r--r--  kernel/seccomp.c                  |   2
-rw-r--r--  kernel/time/timer.c               |   6
-rw-r--r--  kernel/trace/bpf_trace.c          |  13
-rw-r--r--  kernel/trace/trace.c              |   4
-rw-r--r--  kernel/trace/trace_events_hist.c  |  24
-rw-r--r--  kernel/trace/trace_hwlat.c        |   2
-rw-r--r--  kernel/tracepoint.c               | 155
-rw-r--r--  kernel/ucount.c                   |  29
-rw-r--r--  kernel/workqueue.c                |  20
29 files changed, 1062 insertions(+), 389 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9b1577498373..0a28a8095d3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,8 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -1360,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
@@ -1377,6 +1381,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1621,7 +1626,21 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+#ifdef CONFIG_X86
+ barrier_nospec();
+#endif
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1861,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
* The BPF program will be executed via BPF_PROG_RUN() macro.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
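The hunks above teach the interpreter a non-UAPI BPF_ST | BPF_NOSPEC opcode that compiles to a speculation barrier: barrier_nospec() (lfence) on x86, and a no-op on arm64 where the firmware SSBD mitigation already covers kernel code. A minimal sketch of how such an instruction can be constructed, assuming the 0xc0 mode value noted in the disasm hunk below and the standard struct bpf_insn layout; the in-tree BPF_ST_NOSPEC() helper may differ in detail:

#include <linux/bpf.h>		/* struct bpf_insn, BPF_ST */

#define DEMO_BPF_NOSPEC	0xc0	/* assumed non-UAPI mode bits, see the disasm hunk */

/* Build a speculation-barrier instruction: no registers, no offset, no immediate. */
static inline struct bpf_insn demo_st_nospec(void)
{
	return (struct bpf_insn) {
		.code		= BPF_ST | DEMO_BPF_NOSPEC,
		.dst_reg	= 0,
		.src_reg	= 0,
		.off		= 0,
		.imm		= 0,
	};
}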
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..ca3cd9aaa6ce 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..9c011f3a2687 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1565,8 +1565,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
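The hashtab fix replaces an open-coded key_size * bucket_size multiplication with kvmalloc_array(), which fails the allocation instead of silently wrapping when the product overflows and handing back a short buffer that the later copy_to_user() could overrun. A hedged sketch of the same pattern in isolation (demo_alloc_batch() and its parameters are made up):

#include <linux/mm.h>	/* kvmalloc_array(), kvfree() */

static void *demo_alloc_batch(size_t nr_elems, size_t elem_size)
{
	/*
	 * kvmalloc_array() checks nr_elems * elem_size for overflow and
	 * returns NULL rather than allocating a truncated buffer.
	 */
	return kvmalloc_array(nr_elems, elem_size, GFP_USER | __GFP_NOWARN);
}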
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..7a97b2f4747d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
void *ptr;
int i;
- for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
- if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+ if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
continue;
storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
@@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9de3c9c3267c..f9bda5476ea5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
+
+ for (i = 0; i < size; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -6561,6 +6537,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -9936,8 +9918,8 @@ next:
}
/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
bool equal;
@@ -9963,6 +9945,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
switch (rold->type) {
case SCALAR_VALUE:
+ if (env->explore_alu_limits)
+ return false;
if (rcur->type == SCALAR_VALUE) {
if (!rold->precise && !rcur->precise)
return true;
@@ -10053,9 +10037,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10100,9 +10083,8 @@ static bool stacksafe(struct bpf_func_state *old,
continue;
if (old->stack[spi].slot_type[0] != STACK_SPILL)
continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10159,10 +10141,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ env->idmap_scratch))
return false;
- if (!stacksafe(old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
if (!refsafe(old, cur))
@@ -11906,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ ctx_access = true;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -11948,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
+ if (!ctx_access)
+ continue;
+
switch (env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -12752,37 +12736,6 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- *
- * Note that special handling is needed on !env->bypass_spec_v1 if this is
- * ever called outside of error path with subsequent program rejection.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -12859,9 +12812,6 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
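The verifier hunks replace the old Spectre v4 defence (pre-zeroing a suspicious stack slot via sanitize_stack_off) with a per-instruction sanitize_stack_spill flag that makes convert_ctx_accesses() emit a speculation barrier right after the flagged store. As an illustration only (register choice and stack offset are made up), a pointer spill into a slot that previously held a scalar ends up rewritten roughly like this:

#include <linux/filter.h>	/* BPF_STX_MEM(), BPF_ST_NOSPEC(), BPF_REG_FP */

/* What the patched program contains after convert_ctx_accesses():
 * the original spill, immediately followed by the new barrier insn.
 */
static const struct bpf_insn demo_patched_spill[] = {
	BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -8),	/* fp[-8] = r1 (the spill) */
	BPF_ST_NOSPEC(),				/* blocks speculative store bypass */
};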
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8d6bf56ed77a..de2c432dee20 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1221,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
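The cgroup1 error path now calls the new fc_drop_locked() helper instead of open-coding the dput() + deactivate_locked_super() pair, so fc->root is not left pointing at a dentry that has already been dropped. Roughly what the helper is expected to do (hedged sketch; the actual implementation lands on the VFS side of this release):

void fc_drop_locked(struct fs_context *fc)
{
	struct super_block *sb = fc->root->d_sb;

	dput(fc->root);
	fc->root = NULL;	/* later fs_context teardown must not touch it again */
	deactivate_locked_super(sb);
}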
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..6500cbe0ce16 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void)
}
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
@@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
- rcu_read_unlock();
+ guarantee_online_cpus(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
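All of the cpuset changes pivot on the new task_cpu_possible_mask() hook, which bounds a task's usable CPUs by what the hardware can actually run for that task (e.g. 32-bit tasks on an asymmetric arm64 system). The generic fallback is expected to look roughly like this (hedged; the real definitions are added to include/linux/mmu_context.h by this series), so every architecture other than arm64 keeps today's behaviour:

#ifndef task_cpu_possible_mask
# define task_cpu_possible_mask(p)	cpu_possible_mask
# define task_cpu_possible(cpu, p)	true
#else
# define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
#endif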
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 7f0e58917432..b264ab5652ba 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -347,19 +347,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -367,18 +368,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -394,7 +397,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
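The rstat accounting helpers switch to the irqsave variants of the u64_stats primitives so the seqcount-protected update is safe regardless of whether the caller already disabled interrupts. A minimal sketch of the resulting pattern, using a made-up stats structure:

#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64			bytes;
	struct u64_stats_sync	syncp;
};

static void demo_account(struct demo_stats *s, u64 delta)
{
	unsigned long flags;

	flags = u64_stats_update_begin_irqsave(&s->syncp);
	s->bytes += delta;
	u64_stats_update_end_irqrestore(&s->syncp, flags);
}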
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 464917096e73..1cb1f9b8392e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11917,6 +11917,37 @@ again:
return gctx;
}
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
+
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+ }
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -12163,15 +12194,13 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_file;
/*
- * Preserve ptrace permission check for backwards compatibility.
- *
* We must hold exec_update_lock across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perf_check_permission(&attr, task))
goto err_cred;
}
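perf_check_permission() tightens sys_perf_event_open() for attr.sigtrap: because such an event sends signals to the target task, the opener now needs CAP_KILL in the target's user namespace on top of perfmon_capable(), or ptrace ATTACH rather than mere READ rights. A hedged userspace sketch of such an open (the event configuration itself is only a placeholder; the permission-relevant fields are sigtrap and the target pid):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int demo_open_sigtrap(pid_t target)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.type	    = PERF_TYPE_SOFTWARE;	/* placeholder event */
	attr.config	    = PERF_COUNT_SW_DUMMY;
	attr.sample_period  = 1;
	attr.sigtrap	    = 1;	/* deliver SIGTRAP to @target on samples */
	attr.remove_on_exec = 1;	/* required alongside sigtrap */

	/* Fails with EACCES unless we hold CAP_PERFMON and CAP_KILL over
	 * @target, or have ptrace ATTACH rights to it. */
	return syscall(__NR_perf_event_open, &attr, target, -1, -1, 0);
}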
diff --git a/kernel/fork.c b/kernel/fork.c
index bc94b2cc5995..bd0e165b8397 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+ release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
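The irq_startup() change lets an irqchip request that its affinity be programmed before the interrupt is started up, via the new IRQCHIP_AFFINITY_PRE_STARTUP flag (x86 MSI is the expected user, since the vector must be in place before unmasking). A hedged sketch of a driver opting in; everything except the flag name is made up:

#include <linux/irq.h>

static void demo_msi_mask(struct irq_data *d)	{ /* mask in hardware */ }
static void demo_msi_unmask(struct irq_data *d)	{ /* unmask in hardware */ }

static int demo_msi_set_affinity(struct irq_data *d,
				 const struct cpumask *mask, bool force)
{
	/* program the target CPU/vector here */
	return IRQ_SET_MASK_OK;
}

static struct irq_chip demo_msi_chip = {
	.name			= "demo-msi",
	.irq_mask		= demo_msi_mask,
	.irq_unmask		= demo_msi_unmask,
	.irq_set_affinity	= demo_msi_set_affinity,
	.flags			= IRQCHIP_AFFINITY_PRE_STARTUP,
};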
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..85df3ca03efe 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -476,11 +476,6 @@ skip_activate:
return 0;
cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
msi_domain_free_irqs(domain, dev);
return ret;
}
@@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ int i;
+
+ for_each_msi_vector(desc, i, dev) {
+ irq_data = irq_domain_get_irq_data(domain, i);
+ if (irqd_is_activated(irq_data))
+ irq_domain_deactivate_irq(irq_data);
+ }
for_each_msi_entry(desc, dev) {
/*
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index d309d6fbf5bd..4d2a702d7aa9 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b5d9bb5202c6..ad0db322ed3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,7 +343,7 @@ static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk chwalk)
{
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
return waiter != NULL;
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
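The rtmutex one-liner fixes a misspelled config symbol: IS_ENABLED() silently expands to 0 for any symbol that does not exist, so the deadlock-detection branch was never taken even on CONFIG_DEBUG_RT_MUTEXES=y builds. A tiny illustration of the pitfall:

#include <linux/kconfig.h>

/* 1 or 0 depending on the build, as intended: */
#define DEMO_CHAINWALK_DEBUG	IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)

/* Always 0: no such Kconfig symbol exists, and the preprocessor
 * cannot warn about the typo. */
#define DEMO_CHAINWALK_TYPO	IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)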
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d9ff40f4661..8dc67166aa6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -993,6 +993,7 @@ int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu))
@@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void)
default_cpu = cpu;
}
+ hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
@@ -1619,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
uclamp_update_active(struct task_struct *p)
{
@@ -1642,12 +1661,8 @@ uclamp_update_active(struct task_struct *p)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- for_each_clamp_id(clamp_id) {
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
@@ -1981,12 +1996,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1998,15 +2019,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -2163,7 +2176,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu);
+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -2470,6 +2483,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
__do_set_cpus_allowed(p, new_mask, 0);
}
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ if (!src->user_cpus_ptr)
+ return 0;
+
+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!dst->user_cpus_ptr)
+ return -ENOMEM;
+
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
/*
* This function is wildly self concurrent; here be dragons.
*
@@ -2687,28 +2728,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
+ struct cpumask *user_mask = NULL;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
@@ -2722,6 +2761,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
@@ -2756,20 +2800,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
__do_set_cpus_allowed(p, new_mask, flags);
- return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ if (flags & SCA_USER)
+ user_mask = clear_user_cpus_ptr(p);
+
+ ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+ kfree(user_mask);
+
+ return ret;
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, u32 flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct cpumask *user_mask = NULL;
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ if (!p->user_cpus_ptr) {
+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!user_mask)
+ return -ENOMEM;
+ }
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ /*
+ * We're about to butcher the task affinity, so keep track of what
+ * the user asked for in case we're able to restore it later on.
+ */
+ if (user_mask) {
+ cpumask_copy(user_mask, p->cpus_ptr);
+ p->user_cpus_ptr = user_mask;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ kfree(user_mask);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = p->user_cpus_ptr;
+ unsigned long flags;
+
+ /*
+ * Try to restore the old affinity mask. If this fails, then
+ * we free the mask explicitly to avoid it being inherited across
+ * a subsequent fork().
+ */
+ if (!user_mask || !__sched_setaffinity(p, user_mask))
+ return;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ user_mask = clear_user_cpus_ptr(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ kfree(user_mask);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -3114,9 +3316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -3133,8 +3333,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
@@ -3146,10 +3345,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, cpu_possible_mask);
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -4099,7 +4297,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -5662,11 +5860,9 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
activate_task(dst, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
resched_curr(dst);
@@ -6341,6 +6537,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -6456,22 +6664,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
- p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -6824,35 +7029,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -6873,10 +7049,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
@@ -7074,6 +7248,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -7082,8 +7257,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -7096,7 +7271,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -7320,6 +7498,16 @@ err_size:
return -E2BIG;
}
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -7381,6 +7569,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
rcu_read_unlock();
if (likely(p)) {
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
@@ -7529,12 +7719,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -7555,9 +7741,76 @@ out_unlock:
return retval;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
+ int ret = 0;
+
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ ret = -EBUSY;
+ rcu_read_unlock();
+ return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+{
+ int retval;
cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, mask, cpus_allowed);
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
struct task_struct *p;
int retval;
@@ -7577,68 +7830,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = -EINVAL;
goto out_put_task;
}
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
+
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_free_new_mask;
+ retval = -EPERM;
+ goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_free_new_mask;
-
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ goto out_put_task;
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
+ retval = __sched_setaffinity(p, in_mask);
out_put_task:
put_task_struct(p);
return retval;
@@ -9824,7 +10031,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -9868,7 +10075,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
@@ -10119,6 +10326,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -10126,6 +10347,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10333,6 +10559,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
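Taken together, the core.c hunks let the kernel transiently override a task's affinity (stashing the user-requested mask in p->user_cpus_ptr) and later restore it. The advertised consumer is the arm64 exec path for 32-bit tasks on asymmetric systems; a hedged sketch of that caller pattern, where demo_adjust_compat_task() and its arguments are made up while the two functions it calls come from the hunks above:

#include <linux/sched.h>

static void demo_adjust_compat_task(struct task_struct *p, bool is_32bit)
{
	if (is_32bit)
		/* Clamp affinity to task_cpu_possible_mask(p); the previous
		 * user mask is saved in p->user_cpus_ptr. */
		force_compatible_cpus_allowed_ptr(p);
	else
		/* Execing back into a 64-bit image: try to restore the mask
		 * the user originally asked for, then free user_cpus_ptr. */
		relax_compatible_cpus_allowed_ptr(p);
}

The other addition in this file is cgroup-level SCHED_IDLE support: the new "cpu.idle" control file (legacy and unified hierarchies) maps onto sched_group_set_idle(), so an entire task group can be weighted like SCHED_IDLE tasks.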
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aaacd6cfd42f..e94314633b39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
*/
raw_spin_rq_lock(rq);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
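The deadline changes stop copying every sched_flags bit into dl_se->flags; only the DL-specific bits survive, so unrelated flags such as SCHED_FLAG_RESET_ON_FORK no longer make dl_param_changed() report a spurious change. The new SCHED_DL_FLAGS mask is presumably just the union of those bits (hedged; it is defined in kernel/sched/sched.h by this series):

#define SCHED_DL_FLAGS	(SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)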
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0c5ec2776ddf..49716228efb4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void)
{
int cpu, i;
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
@@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+ cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..5aa3cfd15a2e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
+static int tg_is_idle(struct task_group *tg)
+{
+ return tg->idle > 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->idle > 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
+static int tg_is_idle(struct task_group *tg)
+{
+ return 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ return 0;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu)
if (cpu == sibling)
continue;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(sibling))
return false;
}
#endif
@@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
@@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
}
@@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
if (se->on_rq)
break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (throttled_hierarchy(qcfs_rq))
+ list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -4949,9 +4993,9 @@ unthrottle_throttle:
* assertion below.
*/
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (list_add_leaf_cfs_rq(qcfs_rq))
break;
}
@@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
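
Editorial note: the enqueue_task_fair()/dequeue_task_fair() hunks apply the same idea to a single task. Its contribution to idle_h_nr_running starts out reflecting the task's own SCHED_IDLE policy, but once the walk crosses a SCHED_IDLE cfs_rq the task is charged as idle (idle_h_nr_running = 1) at every level above. A sketch of the enqueue direction with the same toy types (illustrative only, not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct toy_cfs_rq {
        bool idle;                 /* cfs_rq_is_idle()          */
        int h_nr_running;
        int idle_h_nr_running;
        struct toy_cfs_rq *parent;
};

/* Model of the enqueue walk for one task. */
static void toy_enqueue_task(struct toy_cfs_rq *leaf, bool task_is_sched_idle)
{
        int idle_h_nr_running = task_is_sched_idle ? 1 : 0;

        for (struct toy_cfs_rq *q = leaf; q; q = q->parent) {
                q->h_nr_running += 1;
                q->idle_h_nr_running += idle_h_nr_running;

                /* From this level upwards the task lives inside an idle
                 * group, so it must count as idle load for the ancestors. */
                if (q->idle)
                        idle_h_nr_running = 1;
        }
}

int main(void)
{
        struct toy_cfs_rq root = { 0 };
        struct toy_cfs_rq idle_grp = { .idle = true, .parent = &root };

        /* A normal-policy task in an idle group: counts as idle at root. */
        toy_enqueue_task(&idle_grp, false);
        printf("root: h=%d idle_h=%d\n", root.h_nr_running, root.idle_h_nr_running);
        return 0;
}
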
@@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
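
Editorial note: the select_idle_cpu() change starts the wrapped scan at target + 1. By the time this loop runs, target has typically already been tested earlier in select_idle_sibling(), so beginning one CPU later avoids re-testing it first and spreads successive scans a little more evenly. A rough userspace model of the wrap-around visiting order (scan_wrap() is a made-up helper, not the kernel cpumask API):

#include <stdio.h>

#define NR_CPUS 8

/* Crude stand-in for for_each_cpu_wrap(): visit every set bit in @mask,
 * starting at @start and wrapping around exactly once. */
static void scan_wrap(unsigned int mask, int start)
{
        for (int n = 0; n < NR_CPUS; n++) {
                int cpu = (start + n) % NR_CPUS;

                if (mask & (1u << cpu))
                        printf("%d ", cpu);
        }
        printf("\n");
}

int main(void)
{
        unsigned int cpus = 0xff;       /* CPUs 0-7 allowed              */
        int target = 3;

        scan_wrap(cpus, target);        /* old order: 3 4 5 6 7 0 1 2    */
        scan_wrap(cpus, target + 1);    /* new order: 4 5 6 7 0 1 2 3    */
        return 0;
}
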
@@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
@@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
- if (want_affine)
- current->recent_used_cpu = cpu;
}
rcu_read_unlock();
@@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->last = se;
}
}
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
}
}
@@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
@@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
BUG_ON(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle)
+ goto preempt;
+ if (cse_is_idle != pse_is_idle)
+ return;
+
+ update_curr(cfs_rq_of(se));
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
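
Editorial note: after find_matching_se() has brought both entities to a common level, the new check_preempt_wakeup() code compares their idle state before touching vruntime: a running idle entity is preempted by a waking non-idle one, a waking idle entity never preempts a non-idle one, and only when both sides are equally (non-)idle does wakeup_preempt_entity() get to decide. A small decision-table sketch (toy code, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

enum decision { PREEMPT, NO_PREEMPT, CHECK_VRUNTIME };

/* cse: currently running entity, pse: waking entity, at the common level. */
static enum decision idle_preempt_check(bool cse_is_idle, bool pse_is_idle)
{
        if (cse_is_idle && !pse_is_idle)
                return PREEMPT;        /* non-idle wakes up under idle curr   */
        if (cse_is_idle != pse_is_idle)
                return NO_PREEMPT;     /* idle entity never preempts non-idle */
        return CHECK_VRUNTIME;         /* same class: fall back to vruntime   */
}

int main(void)
{
        printf("%d %d %d %d\n",
               idle_preempt_check(true,  false),   /* PREEMPT        */
               idle_preempt_check(false, true),    /* NO_PREEMPT     */
               idle_preempt_check(false, false),   /* CHECK_VRUNTIME */
               idle_preempt_check(true,  true));   /* CHECK_VRUNTIME */
        return 0;
}
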
@@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq)
static inline int find_new_ilb(void)
{
int ilb;
+ const struct cpumask *hk_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- housekeeping_cpumask(HK_FLAG_MISC)) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
if (ilb == smp_processor_id())
continue;
@@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ lockdep_assert_held(&shares_mutex);
+
/*
* We can't change the weight of the root cgroup.
*/
@@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
@@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf);
}
-done:
+ return 0;
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
+}
+
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
+
+ idle_task_delta = grp_cfs_rq->h_nr_running -
+ grp_cfs_rq->idle_h_nr_running;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
mutex_unlock(&shares_mutex);
return 0;
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
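
Editorial note: sched_group_set_idle() above does two things per CPU: it re-accounts the group's tasks that are not themselves SCHED_IDLE (they flip between "idle" and "non-idle" load for the ancestors, hence the h_nr_running - idle_h_nr_running delta, negated when leaving the idle state), and it pins the group's weight. A hedged sketch of just that delta; the constants are the unscaled weights (3 for WEIGHT_IDLEPRIO, 1024 for nice 0), the kernel applies scale_load() on top, and elsewhere in this series the flag is exposed as a cgroup-side idle knob:

#include <stdio.h>
#include <stdbool.h>

#define TOY_WEIGHT_IDLEPRIO     3       /* kernel's WEIGHT_IDLEPRIO          */
#define TOY_NICE_0_WEIGHT       1024    /* nice-0 weight before scale_load() */

/* How many of this group's tasks change their "idle" classification for the
 * ancestors when the group toggles between idle and non-idle? */
static long idle_task_delta(int h_nr_running, int idle_h_nr_running,
                            bool becomes_idle)
{
        /* Tasks that are individually SCHED_IDLE were already counted. */
        long delta = h_nr_running - idle_h_nr_running;

        return becomes_idle ? delta : -delta;
}

int main(void)
{
        /* A group with 5 tasks, 2 of them SCHED_IDLE, being marked idle. */
        printf("delta=%+ld new_weight=%d\n",
               idle_task_delta(5, 2, true), TOY_WEIGHT_IDLEPRIO);
        /* The same group being switched back to normal. */
        printf("delta=%+ld new_weight=%d\n",
               idle_task_delta(5, 2, false), TOY_NICE_0_WEIGHT);
        return 0;
}
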
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14a41a243f7b..a9a660c6e08a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@ -394,6 +396,9 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -599,6 +606,9 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
@@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
#ifdef CONFIG_SMP
@@ -2385,6 +2396,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b77ad49dc14f..4e8698e62f07 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ /*
+ * Distance information can be unreliable for
+ * offline nodes, so defer building a node's
+ * masks until that node is brought up.
+ * This relies on all unique distance values
+ * still being visible at init time.
+ */
+ if (!node_online(j))
+ continue;
+
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
+
+ sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+ if (!sched_numa_onlined_nodes)
+ return;
+
+ bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+ for_each_online_node(i)
+ bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+ int i, j;
+
+ /*
+ * NUMA masks are not built for offline nodes in sched_init_numa().
+ * Thus, when a CPU of a never-onlined-before node gets plugged in,
+ * adding that new CPU to the right NUMA masks is not sufficient: the
+ * masks of that CPU's node must also be updated.
+ */
+ if (test_bit(node, sched_numa_onlined_nodes))
+ return;
+
+ bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j) || node == j)
+ continue;
+
+ if (node_distance(j, node) > sched_domains_numa_distance[i])
+ continue;
+
+ /* Add remote nodes in our masks */
+ cpumask_or(sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[0][j]);
+ }
+ }
+
+ /*
+ * A new node has been brought up, potentially changing the topology
+ * classification.
+ *
+ * Note that this is racy vs any use of sched_numa_topology_type :/
+ */
+ init_numa_topology_type();
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
+ __sched_domains_numa_masks_set(node);
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
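
Editorial note: the topology.c change defers building NUMA masks for nodes that are offline at boot and records which nodes have ever been onlined, so that __sched_domains_numa_masks_set() can fill in the missing masks the first time a CPU of such a node appears. A rough userspace model of the "build lazily, remember what was built" bookkeeping (a plain bitmask instead of the kernel's bitmap/cpumask API, all names invented):

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

static unsigned int onlined_nodes;          /* models sched_numa_onlined_nodes */
static bool masks_built[NR_NODES];          /* "node masks exist" stand-in     */

static void build_masks_for(int node)
{
        masks_built[node] = true;
        printf("built NUMA masks for node %d\n", node);
}

/* Boot: only nodes online at init get their masks built. */
static void toy_sched_init_numa(unsigned int online_at_boot)
{
        for (int n = 0; n < NR_NODES; n++)
                if (online_at_boot & (1u << n))
                        build_masks_for(n);
        onlined_nodes = online_at_boot;
}

/* Hotplug: the first CPU of a never-onlined node triggers the deferred build. */
static void toy_masks_set(int node)
{
        if (onlined_nodes & (1u << node))
                return;                      /* already handled at boot        */
        onlined_nodes |= 1u << node;
        build_masks_for(node);
}

int main(void)
{
        toy_sched_init_numa(0x3);            /* nodes 0 and 1 online at boot   */
        toy_masks_set(2);                    /* node 2 hotplugged later        */
        toy_masks_set(2);                    /* second CPU of node 2: no-op    */
        return 0;
}
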
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 057e17f3215d..6469eca8078c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -602,7 +602,7 @@ static inline void seccomp_sync_threads(unsigned long flags)
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
atomic_set(&thread->seccomp.filter_count,
- atomic_read(&thread->seccomp.filter_count));
+ atomic_read(&caller->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9eb11c2209e5..e3d2c23c413d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1265,8 +1265,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1457,14 +1459,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
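
Editorial note: both timer.c hunks tighten the locking around expiry: timer_sync_wait_running() now drops base->lock (re-enabling interrupts) around the expiry_lock handover, and running_timer is only cleared while base->lock is held again. A loose pthread model of the "drop outer, cycle inner, retake outer" handover; this is illustrative only, the kernel uses raw spinlocks with IRQ control, not mutexes, and the names are stand-ins:

#include <pthread.h>

static pthread_mutex_t base_lock   = PTHREAD_MUTEX_INITIALIZER; /* base->lock    */
static pthread_mutex_t expiry_lock = PTHREAD_MUTEX_INITIALIZER; /* expiry_lock   */
static int timer_waiters;                                       /* timer_waiters */

/* Called with both locks held by the expiry path. */
static void toy_sync_wait_running(void)
{
        if (timer_waiters) {
                pthread_mutex_unlock(&base_lock);   /* new: release the outer lock  */
                pthread_mutex_unlock(&expiry_lock); /* hand expiry_lock to a waiter */
                pthread_mutex_lock(&expiry_lock);   /* ...and take it back          */
                pthread_mutex_lock(&base_lock);     /* reacquire the outer lock     */
        }
}

int main(void)
{
        timer_waiters = 1;
        pthread_mutex_lock(&expiry_lock);
        pthread_mutex_lock(&base_lock);
        toy_sync_wait_running();
        pthread_mutex_unlock(&base_lock);
        pthread_mutex_unlock(&expiry_lock);
        return 0;
}
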
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b4916ef388ad..fdd14072fc3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -990,28 +990,29 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
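
Editorial note: the bpf_trace.c hunk changes when the lockdown check runs and which reason it uses: probe_write_user is now gated at proto-resolution time on LOCKDOWN_BPF_WRITE_USER, and the kernel-reading helpers use LOCKDOWN_BPF_READ_KERNEL instead of the old LOCKDOWN_BPF_READ. The common shape is "return the helper proto only if the LSM allows it, otherwise NULL"; a sketch of that gate in plain C (toy types, not the real BPF or LSM API):

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

struct toy_func_proto { const char *name; };

static const struct toy_func_proto probe_write_user_proto = { "probe_write_user" };

/* Stand-in for security_locked_down(): 0 = allowed, negative = denied. */
static int toy_locked_down(bool lockdown_active)
{
        return lockdown_active ? -1 : 0;
}

/* Resolve the helper proto only when lockdown permits it; a program that
 * does not use the denied helper is unaffected. */
static const struct toy_func_proto *get_write_user_proto(bool lockdown_active)
{
        return toy_locked_down(lockdown_active) < 0 ? NULL
                                                    : &probe_write_user_proto;
}

int main(void)
{
        printf("unlocked: %s\n", get_write_user_proto(false)->name);
        printf("locked:   %p\n", (const void *)get_write_user_proto(true));
        return 0;
}
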
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c59dd35a6da5..33899a71fdc1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9135,8 +9135,10 @@ static int trace_array_create_dir(struct trace_array *tr)
return -EINVAL;
ret = event_trace_add_tracer(tr->dir, tr);
- if (ret)
+ if (ret) {
tracefs_remove(tr->dir);
+ return ret;
+ }
init_tracer_tracefs(tr, tr->dir);
__update_tracer_options(tr);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 34325f41ebc0..949ef09dc537 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -65,7 +65,8 @@
C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
- C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -2156,6 +2157,13 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
ret = PTR_ERR(operand1);
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ /* String type cannot be an operand of a unary operator. */
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ destroy_hist_field(operand1, 0);
+ ret = -EINVAL;
+ goto free;
+ }
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -2257,6 +2265,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand1 = NULL;
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
+ ret = -EINVAL;
+ goto free;
+ }
/* rest of string could be another expression e.g. b+c in a+b+c */
operand_flags = 0;
@@ -2266,6 +2279,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand2 = NULL;
goto free;
}
+ if (operand2->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
ret = check_expr_operands(file->tr, operand1, operand2);
if (ret)
@@ -2287,6 +2305,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operands[1] = operand2;
+
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+
expr->operator = field_op;
expr->name = expr_str(expr, 0);
expr->type = kstrdup(operand1->type, GFP_KERNEL);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index a6c0cdaf4b87..14f46aae1981 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -327,7 +327,7 @@ static void move_to_next_cpu(void)
get_online_cpus();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(smp_processor_id(), current_mask);
+ next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
put_online_cpus();
if (next_cpu >= nr_cpu_ids)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index fc32821f8240..efd14c79fab4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -15,12 +15,57 @@
#include <linux/sched/task.h>
#include <linux/static_key.h>
+enum tp_func_state {
+ TP_FUNC_0,
+ TP_FUNC_1,
+ TP_FUNC_2,
+ TP_FUNC_N,
+};
+
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
+enum tp_transition_sync {
+ TP_TRANSITION_SYNC_1_0_1,
+ TP_TRANSITION_SYNC_N_2_1,
+
+ _NR_TP_TRANSITION_SYNC,
+};
+
+struct tp_transition_snapshot {
+ unsigned long rcu;
+ unsigned long srcu;
+ bool ongoing;
+};
+
+/* Protected by tracepoints_mutex */
+static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
+
+static void tp_rcu_get_state(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ /* Keep the latest get_state snapshot. */
+ snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = true;
+}
+
+static void tp_rcu_cond_sync(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ if (!snapshot->ongoing)
+ return;
+ cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
+ synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = false;
+}
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -246,26 +291,29 @@ static void *func_remove(struct tracepoint_func **funcs,
return old;
}
-static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync)
+/*
+ * Count the number of functions (enum tp_func_state) in a tp_funcs array.
+ */
+static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs)
+{
+ if (!tp_funcs)
+ return TP_FUNC_0;
+ if (!tp_funcs[1].func)
+ return TP_FUNC_1;
+ if (!tp_funcs[2].func)
+ return TP_FUNC_2;
+ return TP_FUNC_N; /* 3 or more */
+}
+
+static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs)
{
void *func = tp->iterator;
/* Synthetic events do not have static call sites */
if (!tp->static_call_key)
return;
-
- if (!tp_funcs[1].func) {
+ if (nr_func_state(tp_funcs) == TP_FUNC_1)
func = tp_funcs[0].func;
- /*
- * If going from the iterator back to a single caller,
- * we need to synchronize with __DO_TRACE to make sure
- * that the data passed to the callback is the one that
- * belongs to that callback.
- */
- if (sync)
- tracepoint_synchronize_unregister();
- }
-
__static_call_update(tp->static_call_key, tp->static_call_tramp, func);
}
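
Editorial note: with the boolean sync parameter gone, tracepoint_update_call() only chooses the static call target, and that choice is driven by nr_func_state(): the static call points straight at the callback when exactly one probe is registered, and at the generic iterator in every other state (including the empty state, where the iterator tolerates NULL tp->funcs). A small sketch of that decision with toy types (not the real static_call API):

#include <stdio.h>
#include <stddef.h>

typedef void (*probe_fn)(void);

static void probe_a(void)  { puts("probe_a"); }
static void iterator(void) { puts("iterate over tp->funcs"); }

struct toy_func { probe_fn func; };

/* Mirrors nr_func_state()/tracepoint_update_call(): exactly one registered
 * probe gets called directly, everything else goes through the iterator. */
static probe_fn static_call_target(const struct toy_func *funcs)
{
        if (funcs && funcs[0].func && !funcs[1].func)
                return funcs[0].func;   /* TP_FUNC_1: direct call            */
        return iterator;                /* TP_FUNC_0, _2, _N: use iterator   */
}

int main(void)
{
        struct toy_func one[]  = { { probe_a }, { NULL } };
        struct toy_func many[] = { { probe_a }, { probe_a }, { NULL } };

        static_call_target(one)();      /* prints "probe_a"                  */
        static_call_target(many)();     /* prints the iterator message       */
        static_call_target(NULL)();     /* TP_FUNC_0 also uses the iterator  */
        return 0;
}
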
@@ -299,9 +347,41 @@ static int tracepoint_add_func(struct tracepoint *tp,
* a pointer to it. This array is referenced by __DO_TRACE from
* include/linux/tracepoint.h using rcu_dereference_sched().
*/
- tracepoint_update_call(tp, tp_funcs, false);
- rcu_assign_pointer(tp->funcs, tp_funcs);
- static_key_enable(&tp->key);
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_1: /* 0->1 */
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ static_key_enable(&tp->key);
+ break;
+ case TP_FUNC_2: /* 1->2 */
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /*
+ * Iterator callback installed before updating tp->funcs.
+ * Requires ordering between RCU assign/dereference and
+ * static call update/call.
+ */
+ fallthrough;
+ case TP_FUNC_N: /* N->N+1 (N>1) */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>1) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
release_probes(old);
return 0;
@@ -328,17 +408,52 @@ static int tracepoint_remove_func(struct tracepoint *tp,
/* Failed allocating new tp_funcs, replaced func with stub */
return 0;
- if (!tp_funcs) {
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_0: /* 1->0 */
/* Removed last function */
if (tp->unregfunc && static_key_enabled(&tp->key))
tp->unregfunc();
static_key_disable(&tp->key);
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, NULL);
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1);
+ break;
+ case TP_FUNC_1: /* 2->1 */
rcu_assign_pointer(tp->funcs, tp_funcs);
- } else {
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence. If the first
+ * element's data has changed, then force the synchronization
+ * to prevent current readers that have loaded the old data
+ * from calling the new function.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ break;
+ case TP_FUNC_2: /* N->N-1 (N>2) */
+ fallthrough;
+ case TP_FUNC_N:
rcu_assign_pointer(tp->funcs, tp_funcs);
- tracepoint_update_call(tp, tp_funcs,
- tp_funcs[0].func != old[0].func);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
release_probes(old);
return 0;
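
Editorial note: the tp_rcu_get_state()/tp_rcu_cond_sync() pairs in these transitions implement "take a grace-period cookie when tearing down, and only wait at the next bring-up if that grace period has not already elapsed", so the common register/unregister paths avoid unconditional synchronize_rcu() calls. Below is a loose userspace analogy with a plain counter standing in for the RCU/SRCU cookies; the kernel uses get_state_synchronize_rcu(), cond_synchronize_rcu() and the SRCU poll API shown in the hunk above, none of which appear here:

#include <stdbool.h>
#include <stdio.h>

static unsigned long gp_counter;        /* models the RCU grace-period state */

struct snapshot {
        unsigned long cookie;
        bool ongoing;
};

static void fake_get_state(struct snapshot *s)   /* ~ tp_rcu_get_state() */
{
        s->cookie = gp_counter;
        s->ongoing = true;
}

static void fake_cond_sync(struct snapshot *s)   /* ~ tp_rcu_cond_sync() */
{
        if (!s->ongoing)
                return;
        if (gp_counter == s->cookie)
                puts("grace period not over: wait for readers to drain");
        else
                puts("grace period already elapsed: nothing to wait for");
        s->ongoing = false;
}

int main(void)
{
        struct snapshot s = { 0 };

        fake_get_state(&s);     /* taken when the last probe goes away (1->0) */
        gp_counter++;           /* time passes; old readers finish            */
        fake_cond_sync(&s);     /* checked when the first probe returns (0->1) */
        return 0;
}
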
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 87799e2379bd..bb51849e6375 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -58,14 +58,17 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-#define UCOUNT_ENTRY(name) \
- { \
- .procname = name, \
- .maxlen = sizeof(int), \
- .mode = 0644, \
- .proc_handler = proc_dointvec_minmax, \
- .extra1 = SYSCTL_ZERO, \
- .extra2 = SYSCTL_INT_MAX, \
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(long), \
+ .mode = 0644, \
+ .proc_handler = proc_doulongvec_minmax, \
+ .extra1 = &ue_zero, \
+ .extra2 = &ue_int_max, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
@@ -160,6 +163,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
+ long overflow;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -184,8 +188,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return new;
}
}
+ overflow = atomic_add_negative(1, &ucounts->count);
spin_unlock_irq(&ucounts_lock);
- ucounts = get_ucounts(ucounts);
+ if (overflow) {
+ put_ucounts(ucounts);
+ return NULL;
+ }
return ucounts;
}
@@ -193,8 +201,7 @@ void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock_irqsave(&ucounts_lock, flags);
+ if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock, flags);
kfree(ucounts);
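
Editorial note: the ucount.c changes reuse two familiar refcount patterns. alloc_ucounts() now takes its reference with atomic_add_negative(), so an increment that pushes the counter into the negative range (a sign-bit overflow) is detected and dropped instead of being handed out, and put_ucounts() uses atomic_dec_and_lock_irqsave() so the hash-list removal only pays for the spinlock on the final put. A hedged C11 sketch of the overflow-checking get (userspace stdatomic, not the kernel atomics API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <limits.h>

/* Take a reference unless the new count would be negative (sign-bit
 * overflow), mirroring the atomic_add_negative() check in alloc_ucounts(). */
static bool toy_get_ref(atomic_int *count)
{
        int old = atomic_fetch_add(count, 1);

        if (old < 0 || old == INT_MAX) {        /* new value is negative */
                atomic_fetch_sub(count, 1);     /* drop the reference    */
                return false;
        }
        return true;
}

int main(void)
{
        atomic_int ok = 1;
        atomic_int saturated = INT_MAX;

        printf("normal get:   %d\n", toy_get_ref(&ok));         /* 1 */
        printf("overflow get: %d\n", toy_get_ref(&saturated));  /* 0 */
        return 0;
}
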
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 50142fc08902..f148eacda55a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3676,15 +3676,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
+ bool is_last = false;
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ /*
+ * When @pwq is not linked, it does not hold any reference to @wq,
+ * so @wq must not be accessed.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);