aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c100
-rw-r--r--kernel/bpf/sockmap.c18
-rw-r--r--kernel/bpf/verifier.c145
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/locking/rwsem-xadd.c19
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/locking/rwsem.h30
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/suspend.c27
-rw-r--r--kernel/power/user.c5
-rw-r--r--kernel/power/wakelock.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c262
-rw-r--r--kernel/sched/deadline.c4
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/seccomp.c21
-rw-r--r--kernel/sys.c28
-rw-r--r--kernel/time/tick-broadcast.c8
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace.h11
-rw-r--r--kernel/trace/trace_events_trigger.c15
24 files changed, 530 insertions, 203 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..6ef6746a7871 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
{
+ const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s64 imm = insn->imm;
+
+ if (curr < pos && curr + imm + 1 > pos)
+ imm += delta;
+ else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ imm -= delta;
+ if (imm < imm_min || imm > imm_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->imm = imm;
+ return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
+{
+ const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 off = insn->off;
+
+ if (curr < pos && curr + off + 1 > pos)
+ off += delta;
+ else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->off = off;
+ return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+ const bool probe_pass)
+{
+ u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
struct bpf_insn *insn = prog->insnsi;
- u32 i, insn_cnt = prog->len;
- bool pseudo_call;
- u8 code;
- int off;
+ int ret = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code;
+
+ /* In the probing pass we still operate on the original,
+ * unpatched image in order to check overflows before we
+ * do any other adjustments. Therefore skip the patchlet.
+ */
+ if (probe_pass && i == pos) {
+ i += delta + 1;
+ insn++;
+ }
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP)
- continue;
- if (BPF_OP(code) == BPF_EXIT)
+ if (BPF_CLASS(code) != BPF_JMP ||
+ BPF_OP(code) == BPF_EXIT)
continue;
+ /* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- pseudo_call = true;
- else
+ if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
+ ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+ probe_pass);
} else {
- pseudo_call = false;
+ ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+ probe_pass);
}
- off = pseudo_call ? insn->imm : insn->off;
-
- /* Adjust offset of jmps if we cross boundaries. */
- if (i < pos && i + off + 1 > pos)
- off += delta;
- else if (i > pos + delta && i + off + 1 <= pos + delta)
- off -= delta;
-
- if (pseudo_call)
- insn->imm = off;
- else
- insn->off = off;
+ if (ret)
+ break;
}
+
+ return ret;
}
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
insn_adj_cnt = prog->len + insn_delta;
+ /* Reject anything that would potentially let the insn->off
+ * target overflow when we have excessive program expansions.
+ * We need to probe here before we do any reallocation where
+ * we afterwards may not fail anymore.
+ */
+ if (insn_adj_cnt > cnt_max &&
+ bpf_adj_branches(prog, off, insn_delta, true))
+ return NULL;
+
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
* last page could have large enough tailroom.
@@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
sizeof(*patch) * insn_rest);
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
- bpf_adj_branches(prog_adj, off, insn_delta);
+ /* We are guaranteed to not fail at this point, otherwise
+ * the ship has sailed to reverse to the original state. An
+ * overflow cannot happen at this point.
+ */
+ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
return prog_adj;
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca568c2b..95a84b2f10ce 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1703,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* we increment the refcnt. If this is the case abort with an
* error.
*/
- verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ verdict = bpf_prog_inc_not_zero(verdict);
if (IS_ERR(verdict))
return PTR_ERR(verdict);
- parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ parse = bpf_prog_inc_not_zero(parse);
if (IS_ERR(parse)) {
bpf_prog_put(verdict);
return PTR_ERR(parse);
@@ -1715,12 +1715,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
if (tx_msg) {
- tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+ tx_msg = bpf_prog_inc_not_zero(tx_msg);
if (IS_ERR(tx_msg)) {
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
return PTR_ERR(tx_msg);
}
}
@@ -1805,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
out_free:
smap_release_sock(psock, sock);
out_progs:
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
if (tx_msg)
bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb902bf..1904e814f282 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV 1UL
+#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
+ POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+ const struct bpf_map *map, bool unpriv)
+{
+ BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+ unpriv |= bpf_map_ptr_unpriv(aux);
+ aux->map_state = (unsigned long)map |
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
@@ -978,7 +1000,7 @@ static bool register_is_null(struct bpf_reg_state *reg)
*/
static int check_stack_write(struct bpf_verifier_env *env,
struct bpf_func_state *state, /* func where register points to */
- int off, int size, int value_regno)
+ int off, int size, int value_regno, int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -1017,8 +1039,33 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr = cur->regs[value_regno];
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- for (i = 0; i < BPF_REG_SIZE; i++)
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_MISC &&
+ !env->allow_ptr_leaks) {
+ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+ int soff = (-spi - 1) * BPF_REG_SIZE;
+
+ /* detected reuse of integer stack slot with a pointer
+ * which means either llvm is reusing stack slot or
+ * an attacker is trying to exploit CVE-2018-3639
+ * (speculative store bypass)
+ * Have to sanitize that slot with preemptive
+ * store of zero.
+ */
+ if (*poff && *poff != soff) {
+ /* disallow programs where single insn stores
+ * into two different stack slots, since verifier
+ * cannot sanitize them
+ */
+ verbose(env,
+ "insn %d cannot access two stack slots fp%d and fp%d",
+ insn_idx, *poff, soff);
+ return -EINVAL;
+ }
+ *poff = soff;
+ }
state->stack[spi].slot_type[i] = STACK_SPILL;
+ }
} else {
u8 type = STACK_MISC;
@@ -1694,7 +1741,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
- value_regno);
+ value_regno, insn_idx);
else
err = check_stack_read(env, state, off, size,
value_regno);
@@ -2333,6 +2380,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+ if (func_id != BPF_FUNC_tail_call &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ return 0;
+ if (meta->map_ptr == NULL) {
+ verbose(env, "kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+
+ if (!BPF_MAP_PTR(aux->map_state))
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ meta->map_ptr->unpriv_array);
+ else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+ bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+ meta->map_ptr->unpriv_array);
+ return 0;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
@@ -2387,13 +2457,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
- if (func_id == BPF_FUNC_tail_call) {
- if (meta.map_ptr == NULL) {
- verbose(env, "verifier bug\n");
- return -EINVAL;
- }
- env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
- }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -2404,6 +2467,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
+ err = record_func_map(env, &meta, func_id, insn_idx);
+ if (err)
+ return err;
+
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
@@ -2428,8 +2495,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- struct bpf_insn_aux_data *insn_aux;
-
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2445,11 +2510,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
- insn_aux = &env->insn_aux_data[insn_idx];
- if (!insn_aux->map_ptr)
- insn_aux->map_ptr = meta.map_ptr;
- else if (insn_aux->map_ptr != meta.map_ptr)
- insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -5169,6 +5229,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].sanitize_stack_off) {
+ struct bpf_insn patch[] = {
+ /* Sanitize suspicious stack slot with zero.
+ * There are no memory dependencies for this store,
+ * since it's only using frame pointer and immediate
+ * constant of zero
+ */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ env->insn_aux_data[i + delta].sanitize_stack_off,
+ 0),
+ /* the original STX instruction will immediately
+ * overwrite the same stack slot with appropriate value
+ */
+ *insn,
+ };
+
+ cnt = ARRAY_SIZE(patch);
+ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
@@ -5417,6 +5505,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
+ struct bpf_insn_aux_data *aux;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
@@ -5491,19 +5580,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+ aux = &env->insn_aux_data[i + delta];
+ if (!bpf_map_ptr_unpriv(aux))
+ continue;
+
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
* if (index >= max_entries) goto out;
* index &= array->index_mask;
* to avoid out-of-bounds cpu speculation
*/
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON) {
+ if (bpf_map_ptr_poisoned(aux)) {
verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
- if (!map_ptr->unpriv_array)
- continue;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5527,9 +5619,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_map_lookup_elem) {
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON ||
- !map_ptr->ops->map_gen_lookup)
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
+ if (!map_ptr->ops->map_gen_lookup)
goto patch_call_imm;
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2017a39ab490..481951bf091d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
void kthread_park_complete(struct task_struct *k)
{
- complete(&to_kthread(k)->parked);
+ complete_all(&to_kthread(k)->parked);
}
static int kthread(void *_create)
@@ -459,6 +459,7 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
+ reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
wake_up_state(k, TASK_PARKED);
}
@@ -483,9 +484,6 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
- if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
- return -EBUSY;
-
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..a90336779375 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -352,16 +352,15 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
struct task_struct *owner;
bool ret = true;
+ BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
if (need_resched())
return false;
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
+ if (!owner || !is_rwsem_owner_spinnable(owner)) {
+ ret = !owner; /* !owner is spinnable */
goto done;
}
@@ -382,11 +381,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner))
- goto out;
+ if (!is_rwsem_owner_spinnable(owner))
+ return false;
rcu_read_lock();
- while (sem->owner == owner) {
+ while (owner && (READ_ONCE(sem->owner) == owner)) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
@@ -408,12 +407,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
rcu_read_unlock();
-out:
+
/*
* If there is a new owner or the owner is not set, we continue
* spinning.
*/
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+ return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30465a2f2b6c..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem)
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a17cba8d94bb..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
* the owner field when it unlocks. A reader, on the other hand, will
* not touch the owner field when it unlocks.
*
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
* 1) 0
* - lock is free or the owner hasn't set the field yet
* 2) RWSEM_READER_OWNED
* - lock is currently or previously owned by readers (lock is free
* or not set by owner yet)
- * 3) Other non-zero value
- * - a writer owns the lock
+ * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ * - lock is owned by an anonymous writer, so spinning on the lock
+ * owner should be disabled.
+ * 4) Other non-zero value
+ * - a writer owns the lock and other writers can spin on the lock owner.
*/
-#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
+#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
@@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
}
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
{
- return owner && owner != RWSEM_READER_OWNED;
+ return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
}
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
{
- return owner == RWSEM_READER_OWNED;
+ return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5454cc639a8d..9c85c7822383 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (error) {
pr_err("Some system devices failed to power down, aborting hibernation\n");
@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
error = syscore_suspend();
if (error)
@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
syscore_suspend();
if (pm_wakeup_pending()) {
error = -EAGAIN;
@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index fa39092b7aea..86d72ffb811b 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
-static inline int pm_qos_get_value(struct pm_qos_constraints *c);
static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
{
struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4c10be0f4843..87331565e505 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/swait.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
@@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_s2idle_ops *s2idle_ops;
-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
-static DEFINE_SPINLOCK(s2idle_lock);
+static DEFINE_RAW_SPINLOCK(s2idle_lock);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
@@ -78,12 +79,12 @@ static void s2idle_enter(void)
{
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
s2idle_state = S2IDLE_STATE_ENTER;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -91,17 +92,17 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(s2idle_wait_head,
- s2idle_state == S2IDLE_STATE_WAKE);
+ swait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
out:
s2idle_state = S2IDLE_STATE_NONE;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
@@ -156,12 +157,12 @@ void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&s2idle_lock, flags);
+ raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
- wake_up(&s2idle_wait_head);
+ swake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&s2idle_lock, flags);
+ raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
EXPORT_SYMBOL_GPL(s2idle_wake);
@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (!error) {
*wakeup = pm_wakeup_pending();
@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
syscore_resume();
}
+ system_state = SYSTEM_RUNNING;
+
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 75c959de4b29..abd225550271 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
res = PAGE_SIZE - pg_offp;
}
+ if (!data_of(data->handle)) {
+ res = -EINVAL;
+ goto unlock;
+ }
+
res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
buf, count);
if (res > 0)
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index dfba59be190b..4210152e56f0 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
return ERR_PTR(-ENOMEM);
}
wl->ws.name = wl->name;
+ wl->ws.last_time = ktime_get();
wakeup_source_add(&wl->ws);
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df951aca7..fd76497efeb1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -51,7 +51,7 @@ struct sugov_cpu {
bool iowait_boost_pending;
unsigned int iowait_boost;
unsigned int iowait_boost_max;
- u64 last_update;
+ u64 last_update;
/* The fields below are only needed when sharing a policy: */
unsigned long util_cfs;
@@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* schedule the kthread.
*/
if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ !cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (sg_policy->work_in_progress)
- return false;
-
- if (unlikely(sg_policy->need_freq_update)) {
- sg_policy->need_freq_update = false;
- /*
- * This happens when limits change, so forget the previous
- * next_freq value and force an update.
- */
- sg_policy->next_freq = UINT_MAX;
+ if (unlikely(sg_policy->need_freq_update))
return true;
- }
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
-static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
{
- struct cpufreq_policy *policy = sg_policy->policy;
-
if (sg_policy->next_freq == next_freq)
- return;
+ return false;
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
- if (policy->fast_switch_enabled) {
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
+ return true;
+}
- policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
- } else {
+static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (!next_freq)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -201,43 +209,120 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
return min(util, sg_cpu->max);
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from the minimum frequency, which improves
+ * energy efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+ bool set_iowait_boost)
{
- if (flags & SCHED_CPUFREQ_IOWAIT) {
- if (sg_cpu->iowait_boost_pending)
- return;
+ s64 delta_ns = time - sg_cpu->last_update;
- sg_cpu->iowait_boost_pending = true;
+ /* Reset boost only if a tick has elapsed since last request */
+ if (delta_ns <= TICK_NSEC)
+ return false;
- if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
- } else {
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
- }
- } else if (sg_cpu->iowait_boost) {
- s64 delta_ns = time - sg_cpu->last_update;
+ sg_cpu->iowait_boost = set_iowait_boost
+ ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost_pending = set_iowait_boost;
- /* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC) {
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_pending = false;
- }
+ return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from the utilization of the minimum
+ * OPP to the utilization of the maximum OPP.
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sg_cpu->iowait_boost &&
+ sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+ return;
+
+ /* Boost only tasks waking up after IO */
+ if (!set_iowait_boost)
+ return;
+
+ /* Ensure boost doubles only one time at each request */
+ if (sg_cpu->iowait_boost_pending)
+ return;
+ sg_cpu->iowait_boost_pending = true;
+
+ /* Double the boost at each request */
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ return;
}
+
+ /* First wakeup after IO: start with minimum boost */
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
}
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- unsigned long *max)
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @util: the utilization to (eventually) boost
+ * @max: the maximum value the utilization can be boosted to
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long *util, unsigned long *max)
{
unsigned int boost_util, boost_max;
+ /* No boost currently required */
if (!sg_cpu->iowait_boost)
return;
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sugov_iowait_reset(sg_cpu, time, false))
+ return;
+
+ /*
+ * An IO waiting task has just woken up:
+ * allow to further double the boost value
+ */
if (sg_cpu->iowait_boost_pending) {
sg_cpu->iowait_boost_pending = false;
} else {
+ /*
+ * Otherwise: reduce the boost value and disable it when we
+ * reach the minimum.
+ */
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
sg_cpu->iowait_boost = 0;
@@ -245,9 +330,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
}
}
+ /*
+ * Apply the current boost value: a CPU is boosted only if its current
+ * utilization is smaller then the current IO boost level.
+ */
boost_util = sg_cpu->iowait_boost;
boost_max = sg_cpu->iowait_boost_max;
-
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
@@ -286,7 +374,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
@@ -299,21 +387,31 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
- sugov_iowait_boost(sg_cpu, &util, &max);
+ sugov_iowait_apply(sg_cpu, time, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq &&
- sg_policy->next_freq != UINT_MAX) {
+ if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
sg_policy->cached_raw_freq = 0;
}
- sugov_update_commit(sg_policy, time, next_f);
+ /*
+ * This code runs under rq->lock for the target CPU, so it won't run
+ * concurrently on two different CPUs for the same target and it is not
+ * necessary to acquire the lock in the fast switch case.
+ */
+ if (sg_policy->policy->fast_switch_enabled) {
+ sugov_fast_switch(sg_policy, time, next_f);
+ } else {
+ raw_spin_lock(&sg_policy->update_lock);
+ sugov_deferred_update(sg_policy, time, next_f);
+ raw_spin_unlock(&sg_policy->update_lock);
+ }
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
@@ -326,28 +424,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
- s64 delta_ns;
sugov_get_util(j_sg_cpu);
-
- /*
- * If the CFS CPU utilization was last updated before the
- * previous frequency update and the time elapsed between the
- * last update of the CPU utilization and the last frequency
- * update is long enough, reset iowait_boost and util_cfs, as
- * they are now probably stale. However, still consider the
- * CPU contribution if it has some DEADLINE utilization
- * (util_dl).
- */
- delta_ns = time - j_sg_cpu->last_update;
- if (delta_ns > TICK_NSEC) {
- j_sg_cpu->iowait_boost = 0;
- j_sg_cpu->iowait_boost_pending = false;
- }
-
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
- sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
+ sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
@@ -366,14 +448,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
raw_spin_lock(&sg_policy->update_lock);
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
- sugov_update_commit(sg_policy, time, next_f);
+
+ if (sg_policy->policy->fast_switch_enabled)
+ sugov_fast_switch(sg_policy, time, next_f);
+ else
+ sugov_deferred_update(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
@@ -382,13 +468,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock shortly to handle the case where:
+ * incase sg_policy->next_freq is read here, and then updated by
+ * sugov_deferred_update() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work() will just be called again by kthread_work code; and the
+ * request will be proceed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
- __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
- CPUFREQ_RELATION_L);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
-
- sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
@@ -511,11 +611,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
-
- /* Kthread is bound to all CPUs by default */
- if (!policy->dvfs_possible_from_any_cpu)
- kthread_bind_mask(thread, policy->related_cpus);
-
+ kthread_bind_mask(thread, policy->related_cpus);
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -658,7 +754,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
- sg_policy->next_freq = UINT_MAX;
+ sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e7b3008b85bb..1356afd1eeb6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
* So, overflow is not an issue here.
*/
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
@@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu)
#endif
#ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4e885a..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 15750c222ca2..1f0a4bc6a39d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2025,8 +2025,9 @@ extern bool sched_debug_enabled;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 64cc564f5255..61a1125c1ae4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..e691d9a6c58d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,8 @@
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
@@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
return true;
}
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
+
static inline void seccomp_assign_mode(struct task_struct *task,
- unsigned long seccomp_mode)
+ unsigned long seccomp_mode,
+ unsigned long flags)
{
assert_spin_locked(&task->sighand->siglock);
@@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task,
* filter) is set.
*/
smp_mb__before_atomic();
+ /* Assume default seccomp processes want spec flaw mitigation. */
+ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+ arch_seccomp_spec_mitigate(task);
set_tsk_thread_flag(task, TIF_SECCOMP);
}
@@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void)
* without dropping the locks.
*
*/
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
{
struct task_struct *thread, *caller;
@@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void)
* allow one thread to transition the other.
*/
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
- seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+ flags);
}
}
@@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags,
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
- seccomp_sync_threads();
+ seccomp_sync_threads(flags);
return 0;
}
@@ -818,7 +827,7 @@ static long seccomp_set_mode_strict(void)
#ifdef TIF_NOTSC
disable_TSC();
#endif
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, 0);
ret = 0;
out:
@@ -876,7 +885,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
/* Do not free the successfully attached filter. */
prepared = NULL;
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, flags);
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
diff --git a/kernel/sys.c b/kernel/sys.c
index ad692183dfe9..d1b2b8d934bb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
#include <linux/uidgid.h>
#include <linux/cred.h>
+#include <linux/nospec.h>
+
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
@@ -69,6 +71,9 @@
#include <asm/io.h>
#include <asm/unistd.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
#include "uid16.h"
#ifndef SET_UNALIGN_CTL
@@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
x = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
r = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -2242,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
return 1;
}
+int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
+{
+ return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
+ unsigned long ctrl)
+{
+ return -EINVAL;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2450,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_GET_SPECULATION_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_get(me, arg2);
+ break;
+ case PR_SET_SPECULATION_CTRL:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+ /*
+ * Required for !SMP because for_each_cpu() reports
+ * unconditionally CPU0 as set on UP kernels.
+ */
+ if (!IS_ENABLED(CONFIG_SMP) &&
+ cpumask_empty(tick_broadcast_oneshot_mask))
+ break;
+
td = &per_cpu(tick_cpu_device, cpu);
if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 49edc1c4f3e6..14de3727b18e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -490,6 +490,7 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ system_state = SYSTEM_SUSPEND;
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -513,6 +514,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 414d7210b2ec..bcd93031d042 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -893,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance(struct trace_array *tr)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -949,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
struct trace_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
-static int alloc_snapshot(struct trace_array *tr)
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
int ret;
@@ -995,7 +995,7 @@ int tracing_alloc_snapshot(void)
struct trace_array *tr = &global_trace;
int ret;
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
WARN_ON(ret < 0);
return ret;
@@ -5408,7 +5408,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
@@ -6451,7 +6451,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
#endif
if (!tr->allocated_snapshot) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
}
@@ -7179,7 +7179,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6fb46a06c9dc..507954b4e058 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1817,6 +1817,17 @@ static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+void tracing_snapshot_instance(struct trace_array *tr);
+int tracing_alloc_snapshot_instance(struct trace_array *tr);
+#else
+static inline void tracing_snapshot_instance(struct trace_array *tr) { }
+static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+ return 0;
+}
+#endif
+
extern struct trace_iterator *tracepoint_print_iter;
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d251cabcf69a..8b5bdcf64871 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -483,9 +483,10 @@ clear_event_triggers(struct trace_array *tr)
struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
- struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ struct event_trigger_data *data, *n;
+ list_for_each_entry_safe(data, n, &file->triggers, list) {
trace_event_trigger_enable_disable(file, 0);
+ list_del_rcu(&data->list);
if (data->ops->free)
data->ops->free(data->ops, data);
}
@@ -642,6 +643,7 @@ event_trigger_callback(struct event_command *cmd_ops,
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
+ trigger_data->private_data = file;
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
@@ -1053,7 +1055,12 @@ static void
snapshot_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
- tracing_snapshot();
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ tracing_snapshot_instance(file->tr);
+ else
+ tracing_snapshot();
}
static void
@@ -1076,7 +1083,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
{
int ret = register_trigger(glob, ops, data, file);
- if (ret > 0 && tracing_alloc_snapshot() != 0) {
+ if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
unregister_trigger(glob, ops, data, file);
ret = 0;
}