aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c100
-rw-r--r--kernel/bpf/sockmap.c18
-rw-r--r--kernel/bpf/verifier.c145
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/locking/rwsem-xadd.c19
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/locking/rwsem.h30
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/sched/core.c45
-rw-r--r--kernel/sched/deadline.c10
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h7
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/seccomp.c21
-rw-r--r--kernel/sys.c28
-rw-r--r--kernel/time/tick-broadcast.c8
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace.h11
-rw-r--r--kernel/trace/trace_events_trigger.c15
19 files changed, 362 insertions, 133 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..6ef6746a7871 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
{
+ const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s64 imm = insn->imm;
+
+ if (curr < pos && curr + imm + 1 > pos)
+ imm += delta;
+ else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ imm -= delta;
+ if (imm < imm_min || imm > imm_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->imm = imm;
+ return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
+{
+ const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 off = insn->off;
+
+ if (curr < pos && curr + off + 1 > pos)
+ off += delta;
+ else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->off = off;
+ return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+ const bool probe_pass)
+{
+ u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
struct bpf_insn *insn = prog->insnsi;
- u32 i, insn_cnt = prog->len;
- bool pseudo_call;
- u8 code;
- int off;
+ int ret = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code;
+
+ /* In the probing pass we still operate on the original,
+ * unpatched image in order to check overflows before we
+ * do any other adjustments. Therefore skip the patchlet.
+ */
+ if (probe_pass && i == pos) {
+ i += delta + 1;
+ insn++;
+ }
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP)
- continue;
- if (BPF_OP(code) == BPF_EXIT)
+ if (BPF_CLASS(code) != BPF_JMP ||
+ BPF_OP(code) == BPF_EXIT)
continue;
+ /* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- pseudo_call = true;
- else
+ if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
+ ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+ probe_pass);
} else {
- pseudo_call = false;
+ ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+ probe_pass);
}
- off = pseudo_call ? insn->imm : insn->off;
-
- /* Adjust offset of jmps if we cross boundaries. */
- if (i < pos && i + off + 1 > pos)
- off += delta;
- else if (i > pos + delta && i + off + 1 <= pos + delta)
- off -= delta;
-
- if (pseudo_call)
- insn->imm = off;
- else
- insn->off = off;
+ if (ret)
+ break;
}
+
+ return ret;
}
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
insn_adj_cnt = prog->len + insn_delta;
+ /* Reject anything that would potentially let the insn->off
+ * target overflow when we have excessive program expansions.
+ * We need to probe here before we do any reallocation where
+ * we afterwards may not fail anymore.
+ */
+ if (insn_adj_cnt > cnt_max &&
+ bpf_adj_branches(prog, off, insn_delta, true))
+ return NULL;
+
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
* last page could have large enough tailroom.
@@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
sizeof(*patch) * insn_rest);
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
- bpf_adj_branches(prog_adj, off, insn_delta);
+ /* We are guaranteed to not fail at this point, otherwise
+ * the ship has sailed to reverse to the original state. An
+ * overflow cannot happen at this point.
+ */
+ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
return prog_adj;
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca568c2b..95a84b2f10ce 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1703,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* we increment the refcnt. If this is the case abort with an
* error.
*/
- verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ verdict = bpf_prog_inc_not_zero(verdict);
if (IS_ERR(verdict))
return PTR_ERR(verdict);
- parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ parse = bpf_prog_inc_not_zero(parse);
if (IS_ERR(parse)) {
bpf_prog_put(verdict);
return PTR_ERR(parse);
@@ -1715,12 +1715,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
if (tx_msg) {
- tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+ tx_msg = bpf_prog_inc_not_zero(tx_msg);
if (IS_ERR(tx_msg)) {
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
return PTR_ERR(tx_msg);
}
}
@@ -1805,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
out_free:
smap_release_sock(psock, sock);
out_progs:
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
if (tx_msg)
bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb902bf..1904e814f282 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV 1UL
+#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
+ POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+ const struct bpf_map *map, bool unpriv)
+{
+ BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+ unpriv |= bpf_map_ptr_unpriv(aux);
+ aux->map_state = (unsigned long)map |
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
@@ -978,7 +1000,7 @@ static bool register_is_null(struct bpf_reg_state *reg)
*/
static int check_stack_write(struct bpf_verifier_env *env,
struct bpf_func_state *state, /* func where register points to */
- int off, int size, int value_regno)
+ int off, int size, int value_regno, int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -1017,8 +1039,33 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr = cur->regs[value_regno];
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- for (i = 0; i < BPF_REG_SIZE; i++)
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_MISC &&
+ !env->allow_ptr_leaks) {
+ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+ int soff = (-spi - 1) * BPF_REG_SIZE;
+
+ /* detected reuse of integer stack slot with a pointer
+ * which means either llvm is reusing stack slot or
+ * an attacker is trying to exploit CVE-2018-3639
+ * (speculative store bypass)
+ * Have to sanitize that slot with preemptive
+ * store of zero.
+ */
+ if (*poff && *poff != soff) {
+ /* disallow programs where single insn stores
+ * into two different stack slots, since verifier
+ * cannot sanitize them
+ */
+ verbose(env,
+ "insn %d cannot access two stack slots fp%d and fp%d",
+ insn_idx, *poff, soff);
+ return -EINVAL;
+ }
+ *poff = soff;
+ }
state->stack[spi].slot_type[i] = STACK_SPILL;
+ }
} else {
u8 type = STACK_MISC;
@@ -1694,7 +1741,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
- value_regno);
+ value_regno, insn_idx);
else
err = check_stack_read(env, state, off, size,
value_regno);
@@ -2333,6 +2380,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+ if (func_id != BPF_FUNC_tail_call &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ return 0;
+ if (meta->map_ptr == NULL) {
+ verbose(env, "kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+
+ if (!BPF_MAP_PTR(aux->map_state))
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ meta->map_ptr->unpriv_array);
+ else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+ bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+ meta->map_ptr->unpriv_array);
+ return 0;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
@@ -2387,13 +2457,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
- if (func_id == BPF_FUNC_tail_call) {
- if (meta.map_ptr == NULL) {
- verbose(env, "verifier bug\n");
- return -EINVAL;
- }
- env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
- }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -2404,6 +2467,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
+ err = record_func_map(env, &meta, func_id, insn_idx);
+ if (err)
+ return err;
+
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
@@ -2428,8 +2495,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- struct bpf_insn_aux_data *insn_aux;
-
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2445,11 +2510,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
- insn_aux = &env->insn_aux_data[insn_idx];
- if (!insn_aux->map_ptr)
- insn_aux->map_ptr = meta.map_ptr;
- else if (insn_aux->map_ptr != meta.map_ptr)
- insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -5169,6 +5229,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].sanitize_stack_off) {
+ struct bpf_insn patch[] = {
+ /* Sanitize suspicious stack slot with zero.
+ * There are no memory dependencies for this store,
+ * since it's only using frame pointer and immediate
+ * constant of zero
+ */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ env->insn_aux_data[i + delta].sanitize_stack_off,
+ 0),
+ /* the original STX instruction will immediately
+ * overwrite the same stack slot with appropriate value
+ */
+ *insn,
+ };
+
+ cnt = ARRAY_SIZE(patch);
+ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
@@ -5417,6 +5505,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
+ struct bpf_insn_aux_data *aux;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
@@ -5491,19 +5580,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+ aux = &env->insn_aux_data[i + delta];
+ if (!bpf_map_ptr_unpriv(aux))
+ continue;
+
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
* if (index >= max_entries) goto out;
* index &= array->index_mask;
* to avoid out-of-bounds cpu speculation
*/
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON) {
+ if (bpf_map_ptr_poisoned(aux)) {
verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
- if (!map_ptr->unpriv_array)
- continue;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5527,9 +5619,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_map_lookup_elem) {
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON ||
- !map_ptr->ops->map_gen_lookup)
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
+ if (!map_ptr->ops->map_gen_lookup)
goto patch_call_imm;
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2017a39ab490..481951bf091d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
void kthread_park_complete(struct task_struct *k)
{
- complete(&to_kthread(k)->parked);
+ complete_all(&to_kthread(k)->parked);
}
static int kthread(void *_create)
@@ -459,6 +459,7 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
+ reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
wake_up_state(k, TASK_PARKED);
}
@@ -483,9 +484,6 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
- if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
- return -EBUSY;
-
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..a90336779375 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -352,16 +352,15 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
struct task_struct *owner;
bool ret = true;
+ BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
if (need_resched())
return false;
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
+ if (!owner || !is_rwsem_owner_spinnable(owner)) {
+ ret = !owner; /* !owner is spinnable */
goto done;
}
@@ -382,11 +381,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner))
- goto out;
+ if (!is_rwsem_owner_spinnable(owner))
+ return false;
rcu_read_lock();
- while (sem->owner == owner) {
+ while (owner && (READ_ONCE(sem->owner) == owner)) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
@@ -408,12 +407,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
rcu_read_unlock();
-out:
+
/*
* If there is a new owner or the owner is not set, we continue
* spinning.
*/
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+ return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30465a2f2b6c..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem)
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a17cba8d94bb..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
* the owner field when it unlocks. A reader, on the other hand, will
* not touch the owner field when it unlocks.
*
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
* 1) 0
* - lock is free or the owner hasn't set the field yet
* 2) RWSEM_READER_OWNED
* - lock is currently or previously owned by readers (lock is free
* or not set by owner yet)
- * 3) Other non-zero value
- * - a writer owns the lock
+ * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ * - lock is owned by an anonymous writer, so spinning on the lock
+ * owner should be disabled.
+ * 4) Other non-zero value
+ * - a writer owns the lock and other writers can spin on the lock owner.
*/
-#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
+#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
@@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
}
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
{
- return owner && owner != RWSEM_READER_OWNED;
+ return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
}
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
{
- return owner == RWSEM_READER_OWNED;
+ return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11b4282c2d20..1efcb5b0c3ed 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
+ bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
@@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
+ src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
@@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_RECLAIM |
+ src = (void *)__get_free_page(GFP_NOIO |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
@@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ __get_free_page(GFP_NOIO | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
@@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_RECLAIM | __GFP_HIGH :
- __GFP_RECLAIM | __GFP_NOWARN |
+ GFP_NOIO | __GFP_HIGH :
+ GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (!page[i]) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 092f7c4de903..211890edf37e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -881,6 +881,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return false;
+
+ if (is_per_cpu_kthread(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+}
+
/*
* This is how migration works:
*
@@ -938,16 +965,8 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (p->flags & PF_KTHREAD) {
- if (unlikely(!cpu_online(dest_cpu)))
- return rq;
- } else {
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
- }
-
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (!is_cpu_allowed(p, dest_cpu))
return rq;
update_rq_clock(rq);
@@ -1476,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, &p->cpus_allowed) {
- if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
- continue;
- if (!cpu_online(dest_cpu))
+ if (!is_cpu_allowed(p, dest_cpu))
continue;
+
goto out;
}
@@ -1542,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
- !cpu_online(cpu)))
+ if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e7b3008b85bb..fbfc3f1d368a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
* So, overflow is not an issue here.
*/
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
@@ -1259,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
rq = task_rq_lock(p, &rf);
+ sched_clock_tick();
+ update_rq_clock(rq);
+
if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
@@ -1278,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
if (dl_se->dl_non_contending == 0)
goto unlock;
- sched_clock_tick();
- update_rq_clock(rq);
-
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
@@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu)
#endif
#ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4e885a..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 15750c222ca2..cb467c221b15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -983,7 +983,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
}
/*
- * See rt task throttoling, which is the only time a skip
+ * See rt task throttling, which is the only time a skip
* request is cancelled.
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
@@ -2025,8 +2025,9 @@ extern bool sched_debug_enabled;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 64cc564f5255..61a1125c1ae4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..e691d9a6c58d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,8 @@
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
@@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
return true;
}
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
+
static inline void seccomp_assign_mode(struct task_struct *task,
- unsigned long seccomp_mode)
+ unsigned long seccomp_mode,
+ unsigned long flags)
{
assert_spin_locked(&task->sighand->siglock);
@@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task,
* filter) is set.
*/
smp_mb__before_atomic();
+ /* Assume default seccomp processes want spec flaw mitigation. */
+ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+ arch_seccomp_spec_mitigate(task);
set_tsk_thread_flag(task, TIF_SECCOMP);
}
@@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void)
* without dropping the locks.
*
*/
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
{
struct task_struct *thread, *caller;
@@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void)
* allow one thread to transition the other.
*/
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
- seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+ flags);
}
}
@@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags,
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
- seccomp_sync_threads();
+ seccomp_sync_threads(flags);
return 0;
}
@@ -818,7 +827,7 @@ static long seccomp_set_mode_strict(void)
#ifdef TIF_NOTSC
disable_TSC();
#endif
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, 0);
ret = 0;
out:
@@ -876,7 +885,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
/* Do not free the successfully attached filter. */
prepared = NULL;
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, flags);
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
diff --git a/kernel/sys.c b/kernel/sys.c
index ad692183dfe9..d1b2b8d934bb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
#include <linux/uidgid.h>
#include <linux/cred.h>
+#include <linux/nospec.h>
+
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
@@ -69,6 +71,9 @@
#include <asm/io.h>
#include <asm/unistd.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
#include "uid16.h"
#ifndef SET_UNALIGN_CTL
@@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
x = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
r = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -2242,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
return 1;
}
+int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
+{
+ return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
+ unsigned long ctrl)
+{
+ return -EINVAL;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2450,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_GET_SPECULATION_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_get(me, arg2);
+ break;
+ case PR_SET_SPECULATION_CTRL:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+ /*
+ * Required for !SMP because for_each_cpu() reports
+ * unconditionally CPU0 as set on UP kernels.
+ */
+ if (!IS_ENABLED(CONFIG_SMP) &&
+ cpumask_empty(tick_broadcast_oneshot_mask))
+ break;
+
td = &per_cpu(tick_cpu_device, cpu);
if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 414d7210b2ec..bcd93031d042 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -893,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance(struct trace_array *tr)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -949,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
struct trace_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
-static int alloc_snapshot(struct trace_array *tr)
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
int ret;
@@ -995,7 +995,7 @@ int tracing_alloc_snapshot(void)
struct trace_array *tr = &global_trace;
int ret;
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
WARN_ON(ret < 0);
return ret;
@@ -5408,7 +5408,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
@@ -6451,7 +6451,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
#endif
if (!tr->allocated_snapshot) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
}
@@ -7179,7 +7179,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6fb46a06c9dc..507954b4e058 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1817,6 +1817,17 @@ static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+void tracing_snapshot_instance(struct trace_array *tr);
+int tracing_alloc_snapshot_instance(struct trace_array *tr);
+#else
+static inline void tracing_snapshot_instance(struct trace_array *tr) { }
+static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+ return 0;
+}
+#endif
+
extern struct trace_iterator *tracepoint_print_iter;
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d251cabcf69a..8b5bdcf64871 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -483,9 +483,10 @@ clear_event_triggers(struct trace_array *tr)
struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
- struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ struct event_trigger_data *data, *n;
+ list_for_each_entry_safe(data, n, &file->triggers, list) {
trace_event_trigger_enable_disable(file, 0);
+ list_del_rcu(&data->list);
if (data->ops->free)
data->ops->free(data->ops, data);
}
@@ -642,6 +643,7 @@ event_trigger_callback(struct event_command *cmd_ops,
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
+ trigger_data->private_data = file;
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
@@ -1053,7 +1055,12 @@ static void
snapshot_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
- tracing_snapshot();
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ tracing_snapshot_instance(file->tr);
+ else
+ tracing_snapshot();
}
static void
@@ -1076,7 +1083,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
{
int ret = register_trigger(glob, ops, data, file);
- if (ret > 0 && tracing_alloc_snapshot() != 0) {
+ if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
unregister_trigger(glob, ops, data, file);
ret = 0;
}