aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c24
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/hashtab.c54
-rw-r--r--kernel/bpf/sockmap.c75
-rw-r--r--kernel/bpf/syscall.c1
-rw-r--r--kernel/bpf/verifier.c218
-rw-r--r--kernel/cpu.c37
-rw-r--r--kernel/dma/direct.c4
-rw-r--r--kernel/events/core.c15
-rw-r--r--kernel/events/hw_breakpoint.c13
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/locking/lockdep.c1
-rw-r--r--kernel/locking/mutex.c3
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/printk/printk.c13
-rw-r--r--kernel/printk/printk_safe.c4
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/fair.c26
-rw-r--r--kernel/sched/topology.c5
-rw-r--r--kernel/time/clocksource.c40
-rw-r--r--kernel/watchdog.c4
-rw-r--r--kernel/watchdog_hld.c2
-rw-r--r--kernel/workqueue.c2
24 files changed, 276 insertions, 280 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0c17aab3ce5f..f9d24121be99 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu;
+
+ rcu_read_lock();
+
+ seq_printf(m, "%u: {\n", *(u32 *)key);
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(pptr, cpu), m);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "}\n");
+
+ rcu_read_unlock();
+}
+
static int array_map_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2590700237c1..138f0302692e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
hdr = &btf->hdr;
cur = btf->nohdr_data + hdr->type_off;
- end = btf->nohdr_data + hdr->type_len;
+ end = cur + hdr->type_len;
env->log_type_id = 1;
while (cur < end) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 04b8eda94e7d..2c1790288138 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -15,6 +15,7 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <uapi/linux/btf.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
@@ -41,6 +42,7 @@ struct bpf_htab {
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
+ u32 hashrnd;
};
/* each htab element is struct htab_elem + key + value */
@@ -371,6 +373,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_htab;
+ htab->hashrnd = get_random_int();
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
@@ -402,9 +405,9 @@ free_htab:
return ERR_PTR(err);
}
-static inline u32 htab_map_hash(const void *key, u32 key_len)
+static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
- return jhash(key, key_len, 0);
+ return jhash(key, key_len, hashrnd);
}
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
@@ -470,7 +473,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
head = select_bucket(htab, hash);
@@ -597,7 +600,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
if (!key)
goto find_first_elem;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
head = select_bucket(htab, hash);
@@ -824,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -880,7 +883,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -945,7 +948,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -998,7 +1001,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -1071,7 +1074,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -1103,7 +1106,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
@@ -1282,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
return ret;
}
+static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int cpu;
+
+ rcu_read_lock();
+
+ l = __htab_map_lookup_elem(map, key);
+ if (!l) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": {\n");
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(pptr, cpu), m);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "}\n");
+
+ rcu_read_unlock();
+}
+
const struct bpf_map_ops htab_percpu_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1290,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_lookup_elem = htab_percpu_map_lookup_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1300,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 98e621a29e8e..488ef9663c01 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk)
}
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
static void bpf_tcp_release(struct sock *sk)
{
@@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk)
goto out;
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
psock->cork = NULL;
}
@@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
close_fun = psock->save_close;
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
psock->cork = NULL;
}
list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
list_del(&md->list);
- free_start_sg(psock->sock, md);
+ free_start_sg(psock->sock, md, true);
kfree(md);
}
@@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
/* If another thread deleted this object skip deletion.
* The refcnt on psock may or may not be zero.
*/
- if (l) {
+ if (l && l == link) {
hlist_del_rcu(&link->hash_node);
smap_release_sock(psock, link->sk);
free_htab_elem(htab, link);
@@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes,
md->sg_start = i;
}
-static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+static int free_sg(struct sock *sk, int start,
+ struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = start, free = 0;
while (sg[i].length) {
free += sg[i].length;
- sk_mem_uncharge(sk, sg[i].length);
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
if (!md->skb)
put_page(sg_page(&sg[i]));
sg[i].length = 0;
@@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
return free;
}
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
{
- int free = free_sg(sk, md->sg_start, md);
+ int free = free_sg(sk, md->sg_start, md, charge);
md->sg_start = md->sg_end;
return free;
@@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
{
- return free_sg(sk, md->sg_curr, md);
+ return free_sg(sk, md->sg_curr, md, true);
}
static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
@@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
list_add_tail(&r->list, &psock->ingress);
sk->sk_data_ready(sk);
} else {
- free_start_sg(sk, r);
+ free_start_sg(sk, r, true);
kfree(r);
}
@@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
release_sock(sk);
}
smap_release_sock(psock, sk);
- if (unlikely(err))
- goto out;
- return 0;
+ return err;
out_rcu:
rcu_read_unlock();
-out:
- free_bytes_sg(NULL, send, md, false);
- return err;
+ return 0;
}
static inline void bpf_md_init(struct smap_psock *psock)
@@ -822,7 +820,7 @@ more_data:
case __SK_PASS:
err = bpf_tcp_push(sk, send, m, flags, true);
if (unlikely(err)) {
- *copied -= free_start_sg(sk, m);
+ *copied -= free_start_sg(sk, m, true);
break;
}
@@ -845,16 +843,17 @@ more_data:
lock_sock(sk);
if (unlikely(err < 0)) {
- free_start_sg(sk, m);
+ int free = free_start_sg(sk, m, false);
+
psock->sg_size = 0;
if (!cork)
- *copied -= send;
+ *copied -= free;
} else {
psock->sg_size -= send;
}
if (cork) {
- free_start_sg(sk, m);
+ free_start_sg(sk, m, true);
psock->sg_size = 0;
kfree(m);
m = NULL;
@@ -912,6 +911,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
rcu_read_lock();
psock = smap_psock_sk(sk);
@@ -922,9 +923,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
rcu_read_unlock();
- if (!skb_queue_empty(&sk->sk_receive_queue))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
lock_sock(sk);
bytes_ready:
while (copied != len) {
@@ -1122,7 +1120,7 @@ wait_for_memory:
err = sk_stream_wait_memory(sk, &timeo);
if (err) {
if (m && m != psock->cork)
- free_start_sg(sk, m);
+ free_start_sg(sk, m, true);
goto out_err;
}
}
@@ -1427,12 +1425,15 @@ out:
static void smap_write_space(struct sock *sk)
{
struct smap_psock *psock;
+ void (*write_space)(struct sock *sk);
rcu_read_lock();
psock = smap_psock_sk(sk);
if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
schedule_work(&psock->tx_work);
+ write_space = psock->save_write_space;
rcu_read_unlock();
+ write_space(sk);
}
static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
@@ -1461,10 +1462,16 @@ static void smap_destroy_psock(struct rcu_head *rcu)
schedule_work(&psock->gc_work);
}
+static bool psock_is_smap_sk(struct sock *sk)
+{
+ return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
+}
+
static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
if (refcount_dec_and_test(&psock->refcnt)) {
- tcp_cleanup_ulp(sock);
+ if (psock_is_smap_sk(sock))
+ tcp_cleanup_ulp(sock);
write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
write_unlock_bh(&sock->sk_callback_lock);
@@ -1578,13 +1585,13 @@ static void smap_gc_work(struct work_struct *w)
bpf_prog_put(psock->bpf_tx_msg);
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
}
list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
list_del(&md->list);
- free_start_sg(psock->sock, md);
+ free_start_sg(psock->sock, md, true);
kfree(md);
}
@@ -1891,6 +1898,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
* doesn't update user data.
*/
if (psock) {
+ if (!psock_is_smap_sk(sock)) {
+ err = -EBUSY;
+ goto out_progs;
+ }
if (READ_ONCE(psock->bpf_parse) && parse) {
err = -EBUSY;
goto out_progs;
@@ -2140,7 +2151,9 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(-EPERM);
/* check sanity of attributes */
- if (attr->max_entries == 0 || attr->value_size != 4 ||
+ if (attr->max_entries == 0 ||
+ attr->key_size == 0 ||
+ attr->value_size != 4 ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -2267,8 +2280,10 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
}
l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
htab->map.numa_node);
- if (!l_new)
+ if (!l_new) {
+ atomic_dec(&htab->count);
return ERR_PTR(-ENOMEM);
+ }
memcpy(l_new->key, key, key_size);
l_new->sk = sk;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8339d81cba1d..3c9636f03bb2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -30,7 +30,6 @@
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
-#include <linux/btf.h>
#include <linux/nospec.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92246117d2b0..8cd4f5306c3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -263,6 +263,13 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
+static char slot_type_char[] = {
+ [STACK_INVALID] = '?',
+ [STACK_SPILL] = 'r',
+ [STACK_MISC] = 'm',
+ [STACK_ZERO] = '0',
+};
+
static void print_liveness(struct bpf_verifier_env *env,
enum bpf_reg_liveness live)
{
@@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
}
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
- verbose(env, " fp%d",
- (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
+ char types_buf[BPF_REG_SIZE + 1];
+ bool valid = false;
+ int j;
+
+ for (j = 0; j < BPF_REG_SIZE; j++) {
+ if (state->stack[i].slot_type[j] != STACK_INVALID)
+ valid = true;
+ types_buf[j] = slot_type_char[
+ state->stack[i].slot_type[j]];
+ }
+ types_buf[BPF_REG_SIZE] = 0;
+ if (!valid)
+ continue;
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, state->stack[i].spilled_ptr.live);
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
verbose(env, "=%s",
reg_type_str[state->stack[i].spilled_ptr.type]);
- }
- if (state->stack[i].slot_type[0] == STACK_ZERO)
- verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, "=%s", types_buf);
}
verbose(env, "\n");
}
@@ -380,9 +398,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
* make it consume minimal amount of memory. check_stack_write() access from
* the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
*/
static int realloc_func_state(struct bpf_func_state *state, int size,
bool copy_old)
@@ -466,7 +484,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->frame[i] = NULL;
}
dst_state->curframe = src->curframe;
- dst_state->parent = src->parent;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -732,6 +749,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
+ regs[i].parent = NULL;
}
/* frame pointer */
@@ -876,74 +894,21 @@ next:
return 0;
}
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- u32 regno)
-{
- struct bpf_verifier_state *tmp = NULL;
-
- /* 'parent' could be a state of caller and
- * 'state' could be a state of callee. In such case
- * parent->curframe < state->curframe
- * and it's ok for r1 - r5 registers
- *
- * 'parent' could be a callee's state after it bpf_exit-ed.
- * In such case parent->curframe > state->curframe
- * and it's ok for r0 only
- */
- if (parent->curframe == state->curframe ||
- (parent->curframe < state->curframe &&
- regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
- (parent->curframe > state->curframe &&
- regno == BPF_REG_0))
- return parent;
-
- if (parent->curframe > state->curframe &&
- regno >= BPF_REG_6) {
- /* for callee saved regs we have to skip the whole chain
- * of states that belong to callee and mark as LIVE_READ
- * the registers before the call
- */
- tmp = parent;
- while (tmp && tmp->curframe != state->curframe) {
- tmp = tmp->parent;
- }
- if (!tmp)
- goto bug;
- parent = tmp;
- } else {
- goto bug;
- }
- return parent;
-bug:
- verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
- verbose(env, "regno %d parent frame %d current frame %d\n",
- regno, parent->curframe, state->curframe);
- return NULL;
-}
-
+/* Parentage chain of this register (or stack slot) should take care of all
+ * issues like callee-saved registers, stack slot allocation time, etc.
+ */
static int mark_reg_read(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- u32 regno)
+ const struct bpf_reg_state *state,
+ struct bpf_reg_state *parent)
{
bool writes = parent == state->parent; /* Observe write marks */
- if (regno == BPF_REG_FP)
- /* We don't need to worry about FP liveness because it's read-only */
- return 0;
-
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
+ if (writes && state->live & REG_LIVE_WRITTEN)
break;
- parent = skip_callee(env, state, parent, regno);
- if (!parent)
- return -EFAULT;
/* ... then we depend on parent's value */
- parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
+ parent->live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
writes = true;
@@ -969,7 +934,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- return mark_reg_read(env, vstate, vstate->parent, regno);
+ /* We don't need to worry about FP liveness because it's read-only */
+ if (regno != BPF_REG_FP)
+ return mark_reg_read(env, &regs[regno],
+ regs[regno].parent);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
@@ -1080,8 +1048,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
} else {
u8 type = STACK_MISC;
- /* regular write of data into stack */
- state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+ /* regular write of data into stack destroys any spilled ptr */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -1106,61 +1074,6 @@ static int check_stack_write(struct bpf_verifier_env *env,
return 0;
}
-/* registers of every function are unique and mark_reg_read() propagates
- * the liveness in the following cases:
- * - from callee into caller for R1 - R5 that were used as arguments
- * - from caller into callee for R0 that used as result of the call
- * - from caller to the same caller skipping states of the callee for R6 - R9,
- * since R6 - R9 are callee saved by implicit function prologue and
- * caller's R6 != callee's R6, so when we propagate liveness up to
- * parent states we need to skip callee states for R6 - R9.
- *
- * stack slot marking is different, since stacks of caller and callee are
- * accessible in both (since caller can pass a pointer to caller's stack to
- * callee which can pass it to another function), hence mark_stack_slot_read()
- * has to propagate the stack liveness to all parent states at given frame number.
- * Consider code:
- * f1() {
- * ptr = fp - 8;
- * *ptr = ctx;
- * call f2 {
- * .. = *ptr;
- * }
- * .. = *ptr;
- * }
- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
- * to mark liveness at the f1's frame and not f2's frame.
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
- * to propagate liveness to f2 states at f1's frame level and further into
- * f1 states at f1's frame level until write into that stack slot
- */
-static void mark_stack_slot_read(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- int slot, int frameno)
-{
- bool writes = parent == state->parent; /* Observe write marks */
-
- while (parent) {
- if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
- /* since LIVE_WRITTEN mark is only done for full 8-byte
- * write the read marks are conservative and parent
- * state may not even have the stack allocated. In such case
- * end the propagation, since the loop reached beginning
- * of the function
- */
- break;
- /* if read wasn't screened by an earlier write ... */
- if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
- break;
- /* ... then we depend on parent's value */
- parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
- state = parent;
- parent = state->parent;
- writes = true;
- }
-}
-
static int check_stack_read(struct bpf_verifier_env *env,
struct bpf_func_state *reg_state /* func where register points to */,
int off, int size, int value_regno)
@@ -1198,8 +1111,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
*/
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- mark_stack_slot_read(env, vstate, vstate->parent, spi,
- reg_state->frameno);
+ mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+ reg_state->stack[spi].spilled_ptr.parent);
return 0;
} else {
int zeros = 0;
@@ -1215,8 +1128,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_stack_slot_read(env, vstate, vstate->parent, spi,
- reg_state->frameno);
+ mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+ reg_state->stack[spi].spilled_ptr.parent);
if (value_regno >= 0) {
if (zeros == size) {
/* any size read into register is zero extended,
@@ -1908,8 +1821,8 @@ mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
- mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
- spi, state->frameno);
+ mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent);
}
return update_stack_depth(env, state, off);
}
@@ -2366,11 +2279,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
- /* copy r1 - r5 args that callee can access */
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call regsiters r0 - r5 were scratched */
+ /* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, caller->regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
@@ -3163,7 +3078,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar. Disallow all math except
* pointer subtraction
*/
- if (opcode == BPF_SUB){
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
}
@@ -4370,7 +4285,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
/* explored state didn't use this */
return true;
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+ equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
if (rold->type == PTR_TO_STACK)
/* two stack pointers are equal only if they're pointing to
@@ -4603,7 +4518,7 @@ static bool states_equal(struct bpf_verifier_env *env,
* equivalent state (jump target or such) we didn't arrive by the straight-line
* code, so read marks in the state must propagate to the parent regardless
* of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() and mark_stack_slot_read() is for.
+ * in mark_reg_read() is for.
*/
static int propagate_liveness(struct bpf_verifier_env *env,
const struct bpf_verifier_state *vstate,
@@ -4624,7 +4539,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
continue;
if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
- err = mark_reg_read(env, vstate, vparent, i);
+ err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
+ &vparent->frame[vstate->curframe]->regs[i]);
if (err)
return err;
}
@@ -4639,7 +4555,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
- mark_stack_slot_read(env, vstate, vparent, i, frame);
+ mark_reg_read(env, &state->stack[i].spilled_ptr,
+ &parent->stack[i].spilled_ptr);
}
}
return err;
@@ -4649,7 +4566,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err;
sl = env->explored_states[insn_idx];
@@ -4691,16 +4608,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return -ENOMEM;
/* add new state to the head of linked list */
- err = copy_verifier_state(&new_sl->state, cur);
+ new = &new_sl->state;
+ err = copy_verifier_state(new, cur);
if (err) {
- free_verifier_state(&new_sl->state, false);
+ free_verifier_state(new, false);
kfree(new_sl);
return err;
}
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- cur->parent = &new_sl->state;
+ for (i = 0; i < BPF_REG_FP; i++)
+ cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -4713,9 +4632,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
/* all stack frames are accessible from callee, clear them all */
for (j = 0; j <= cur->curframe; j++) {
struct bpf_func_state *frame = cur->frame[j];
+ struct bpf_func_state *newframe = new->frame[j];
- for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+ for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+ frame->stack[i].spilled_ptr.parent =
+ &newframe->stack[i].spilled_ptr;
+ }
}
return 0;
}
@@ -4734,7 +4657,6 @@ static int do_check(struct bpf_verifier_env *env)
if (!state)
return -ENOMEM;
state->curframe = 0;
- state->parent = NULL;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
if (!state->frame[0]) {
kfree(state);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index ed44d7d34c2d..0097acec1c71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -102,8 +102,6 @@ static inline void cpuhp_lock_release(bool bringup) { }
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
- * @skip_onerr: Do not invoke the functions on error rollback
- * Will go away once the notifiers are gone
* @cant_stop: Bringup/teardown can't be stopped at this step
*/
struct cpuhp_step {
@@ -119,7 +117,6 @@ struct cpuhp_step {
struct hlist_node *node);
} teardown;
struct hlist_head list;
- bool skip_onerr;
bool cant_stop;
bool multi_instance;
};
@@ -550,12 +547,8 @@ static int bringup_cpu(unsigned int cpu)
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
- for (st->state--; st->state > st->target; st->state--) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
- }
+ for (st->state--; st->state > st->target; st->state--)
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
@@ -614,15 +607,15 @@ static void cpuhp_thread_fun(unsigned int cpu)
bool bringup = st->bringup;
enum cpuhp_state state;
+ if (WARN_ON_ONCE(!st->should_run))
+ return;
+
/*
* ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
* that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (WARN_ON_ONCE(!st->should_run))
- return;
-
cpuhp_lock_acquire(bringup);
if (st->single) {
@@ -644,12 +637,6 @@ static void cpuhp_thread_fun(unsigned int cpu)
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
- if (st->rollback) {
- struct cpuhp_step *step = cpuhp_get_step(state);
- if (step->skip_onerr)
- goto next;
- }
-
if (cpuhp_is_atomic_state(state)) {
local_irq_disable();
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
@@ -673,7 +660,6 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
}
-next:
cpuhp_lock_release(bringup);
if (!st->should_run)
@@ -916,12 +902,8 @@ void cpuhp_report_idle_dead(void)
static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
- }
+ for (st->state++; st->state < st->target; st->state++)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
}
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
@@ -934,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_down(cpu, st);
+ if (st->state < prev_state)
+ undo_cpu_down(cpu, st);
break;
}
}
@@ -987,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
- if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+ if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
cpuhp_reset_state(st, prev_state);
__cpuhp_kick_ap(st);
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1c35b7b945d0..de87b0282e74 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -168,7 +168,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
int dma_direct_supported(struct device *dev, u64 mask)
{
#ifdef CONFIG_ZONE_DMA
- if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
return 0;
#else
/*
@@ -177,7 +177,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
* memory, or by providing a ZONE_DMA32. If neither is the case, the
* architecture needs to use an IOMMU instead of the direct mapping.
*/
- if (mask < DMA_BIT_MASK(32))
+ if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
#endif
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2a62b96600ad..c80549bf82c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
_perf_event_disable(bp);
err = modify_user_hw_breakpoint_check(bp, attr, true);
- if (err) {
- if (!bp->attr.disabled)
- _perf_event_enable(bp);
- return err;
- }
-
- if (!attr->disabled)
+ if (!bp->attr.disabled)
_perf_event_enable(bp);
- return 0;
+
+ return err;
}
static int perf_event_modify_attr(struct perf_event *event,
@@ -5948,6 +5943,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
+ mm_segment_t fs;
/*
* We dump:
@@ -5965,7 +5961,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
+ fs = get_fs();
+ set_fs(USER_DS);
rem = __output_copy_user(handle, (void *) sp, dump_size);
+ set_fs(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b3814fce5ecb..d6b56180827c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
+ int err;
+
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
* will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
else
perf_event_disable(bp);
- if (!attr->disabled) {
- int err = modify_user_hw_breakpoint_check(bp, attr, false);
+ err = modify_user_hw_breakpoint_check(bp, attr, false);
- if (err)
- return err;
+ if (!bp->attr.disabled)
perf_event_enable(bp);
- bp->attr.disabled = 0;
- }
- return 0;
+
+ return err;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
diff --git a/kernel/fork.c b/kernel/fork.c
index d896e9ca38b0..f0b58479534f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -550,8 +550,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01ebdf1f9f40..2e62503bea0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
case MODULE_STATE_COMING:
ret = jump_label_add_module(mod);
if (ret) {
- WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
+ WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
}
break;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e406c5fdb41e..dd13f865ad40 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,7 +55,6 @@
#include "lockdep_internals.h"
-#include <trace/events/preemptirq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 1a81a1257b3f..3f8a35104285 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock,
/*
* wake_up_process() paired with set_current_state()
* inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_lock_check_stamp() or has a
+ * it's wounded in __ww_mutex_check_kill() or has a
* wakeup pending to re-read the wounded state.
*/
if (owner != current)
@@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, current);
lock_contended(&lock->dep_map, ip);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 5b915b370d5a..0be047dbd897 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -324,7 +324,7 @@ static int __test_cycle(unsigned int nthreads)
if (!cycle->result)
continue;
- pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n",
+ pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n",
n, nthreads, cycle->result);
ret = -EINVAL;
break;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 924e37fb1620..9bf5404397e0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -38,7 +38,6 @@
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
-#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
@@ -352,7 +351,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
*/
enum log_flags {
- LOG_NOCONS = 1, /* suppress print, do not print to console */
LOG_NEWLINE = 2, /* text ended with a newline */
LOG_PREFIX = 4, /* text started with a prefix */
LOG_CONT = 8, /* text is a fragment of a continuation line */
@@ -1882,9 +1880,6 @@ int vprintk_store(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- if (suppress_message_printing(level))
- lflags |= LOG_NOCONS;
-
return log_output(facility, level, lflags,
dict, dictlen, text, text_len);
}
@@ -2033,6 +2028,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2369,10 +2365,11 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
+ if (suppress_message_printing(msg->level)) {
/*
- * Skip record if !ignore_loglevel, and
- * record has level above the console loglevel.
+ * Skip record we have buffered and already printed
+ * directly to the console when we received it, and
+ * record that has level above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index a0a74c533e4b..0913b4d385de 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -306,12 +306,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
return printk_safe_log_store(s, fmt, args);
}
-void printk_nmi_enter(void)
+void notrace printk_nmi_enter(void)
{
this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
}
-void printk_nmi_exit(void)
+void notrace printk_nmi_exit(void)
{
this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 60caf1fb94e0..6383aa6a60ca 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- static_key_disable(&sched_feat_keys[i]);
+ static_key_disable_cpuslocked(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- static_key_enable(&sched_feat_keys[i]);
+ static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
+ cpus_read_lock();
inode_lock(inode);
ret = sched_feat_set(cmp);
inode_unlock(inode);
+ cpus_read_unlock();
if (ret < 0)
return ret;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b39fb596f6c1..f808ddf2a868 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3362,6 +3362,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
+ * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -7263,6 +7264,7 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
+ const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7299,8 +7301,10 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7365,13 +7369,16 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
+ const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7482,10 +7489,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long scale_rt_capacity(int cpu)
+static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long max = arch_scale_cpu_capacity(sd, cpu);
unsigned long used, free;
unsigned long irq;
@@ -7507,7 +7514,7 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(cpu);
+ unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
@@ -8269,7 +8276,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
- return sds.busiest;
+ return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
@@ -9638,7 +9645,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 56a0fed30c0a..505a41c42b96 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (sched_domains_numa_levels <= 1) {
+ if (sched_domains_numa_levels <= 2) {
sched_numa_topology_type = NUMA_DIRECT;
return;
}
@@ -1380,9 +1380,6 @@ void sched_init_numa(void)
break;
}
- if (!level)
- return;
-
/*
* 'level' contains the number of unique distances
*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f74fb00d8064..0e6e97a01942 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags)
spin_unlock_irqrestore(&watchdog_lock, *flags);
}
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
/*
* Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+ /*
+ * We cannot directly run clocksource_watchdog_kthread() here, because
+ * clocksource_select() calls timekeeping_notify() which uses
+ * stop_machine(). One cannot use stop_machine() from a workqueue() due
+ * lock inversions wrt CPU hotplug.
+ *
+ * Also, we only ever run this work once or twice during the lifetime
+ * of the kernel, so there is no point in creating a more permanent
+ * kthread for this.
+ *
+ * If kthread_run fails the next watchdog scan over the
+ * watchdog_list will find the unstable clock again.
+ */
+ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
/*
- * If the clocksource is registered clocksource_watchdog_work() will
+ * If the clocksource is registered clocksource_watchdog_kthread() will
* re-rate and re-select.
*/
if (list_empty(&cs->list)) {
@@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs)
if (cs->mark_unstable)
cs->mark_unstable(cs);
- /* kick clocksource_watchdog_work() */
+ /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs)
* @cs: clocksource to be marked unstable
*
* This function is called by the x86 TSC code to mark clocksources as unstable;
- * it defers demotion and re-selection to a work.
+ * it defers demotion and re-selection to a kthread.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
}
}
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
-static int __clocksource_watchdog_work(void)
+static int __clocksource_watchdog_kthread(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
@@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void)
return select;
}
-static void clocksource_watchdog_work(struct work_struct *work)
+static int clocksource_watchdog_kthread(void *data)
{
mutex_lock(&clocksource_mutex);
- if (__clocksource_watchdog_work())
+ if (__clocksource_watchdog_kthread())
clocksource_select();
mutex_unlock(&clocksource_mutex);
+ return 0;
}
static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-static inline int __clocksource_watchdog_work(void) { return 0; }
+static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
@@ -810,7 +830,7 @@ static int __init clocksource_done_booting(void)
/*
* Run the watchdog first to eliminate unstable clock sources
*/
- __clocksource_watchdog_work();
+ __clocksource_watchdog_kthread();
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5470dce212c0..977918d5d350 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -261,7 +261,7 @@ static void __touch_watchdog(void)
* entering idle state. This should only be used for scheduler events.
* Use touch_softlockup_watchdog() for everything else.
*/
-void touch_softlockup_watchdog_sched(void)
+notrace void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -270,7 +270,7 @@ void touch_softlockup_watchdog_sched(void)
raw_cpu_write(watchdog_touch_ts, 0);
}
-void touch_softlockup_watchdog(void)
+notrace void touch_softlockup_watchdog(void)
{
touch_softlockup_watchdog_sched();
wq_watchdog_touch(raw_smp_processor_id());
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 1f7020d65d0a..71381168dede 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -29,7 +29,7 @@ static struct cpumask dead_events_mask;
static unsigned long hardlockup_allcpu_dumped;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
-void arch_touch_nmi_watchdog(void)
+notrace void arch_touch_nmi_watchdog(void)
{
/*
* Using __raw here because some code paths have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 60e80198c3df..0280deac392e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5574,7 +5574,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}
-void wq_watchdog_touch(int cpu)
+notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;