diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c | 24 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 2 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 54 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 75 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 1 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 218 | ||||
-rw-r--r-- | kernel/cpu.c | 37 | ||||
-rw-r--r-- | kernel/dma/direct.c | 4 | ||||
-rw-r--r-- | kernel/events/core.c | 15 | ||||
-rw-r--r-- | kernel/events/hw_breakpoint.c | 13 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/jump_label.c | 2 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 1 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 3 | ||||
-rw-r--r-- | kernel/locking/test-ww_mutex.c | 2 | ||||
-rw-r--r-- | kernel/printk/printk.c | 13 | ||||
-rw-r--r-- | kernel/printk/printk_safe.c | 4 | ||||
-rw-r--r-- | kernel/sched/debug.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 26 | ||||
-rw-r--r-- | kernel/sched/topology.c | 5 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 40 | ||||
-rw-r--r-- | kernel/watchdog.c | 4 | ||||
-rw-r--r-- | kernel/watchdog_hld.c | 2 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
24 files changed, 276 insertions, 280 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f..f9d24121be99 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + seq_printf(m, "%u: {\n", *(u32 *)key); + pptr = array->pptrs[index & array->index_mask]; + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) @@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 04b8eda94e7d..2c1790288138 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> +#include <linux/random.h> #include <uapi/linux/btf.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" @@ -41,6 +42,7 @@ struct bpf_htab { atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ + u32 hashrnd; }; /* each htab element is struct htab_elem + key + value */ @@ -371,6 +373,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_htab; + htab->hashrnd = get_random_int(); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); @@ -402,9 +405,9 @@ free_htab: return ERR_PTR(err); } -static inline u32 htab_map_hash(const void *key, u32 key_len) +static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { - return jhash(key, key_len, 0); + return jhash(key, key_len, hashrnd); } static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) @@ -470,7 +473,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); @@ -597,7 +600,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) if (!key) goto find_first_elem; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); @@ -824,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -880,7 +883,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -945,7 +948,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -998,7 +1001,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1071,7 +1074,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1103,7 +1106,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1282,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct htab_elem *l; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + l = __htab_map_lookup_elem(map, key); + if (!l) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": {\n"); + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1290,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_elem = htab_percpu_map_lookup_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1300,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 98e621a29e8e..488ef9663c01 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); @@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +820,7 @@ more_data: case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +843,17 @@ more_data: lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -912,6 +911,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +923,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { @@ -1122,7 +1120,7 @@ wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1427,12 +1425,15 @@ out: static void smap_write_space(struct sock *sk) { struct smap_psock *psock; + void (*write_space)(struct sock *sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) schedule_work(&psock->tx_work); + write_space = psock->save_write_space; rcu_read_unlock(); + write_space(sk); } static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) @@ -1461,10 +1462,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1578,13 +1585,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -1891,6 +1898,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; @@ -2140,7 +2151,9 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return ERR_PTR(-EPERM); /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->value_size != 4 || + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); @@ -2267,8 +2280,10 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) + if (!l_new) { + atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); l_new->sk = sk; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8339d81cba1d..3c9636f03bb2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> -#include <linux/btf.h> #include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..8cd4f5306c3c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -263,6 +263,13 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', +}; + static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { @@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); } verbose(env, "\n"); } @@ -380,9 +398,9 @@ static int copy_stack_state(struct bpf_func_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state + * which this function copies over. It points to corresponding reg in previous + * bpf_verifier_state which is never reallocated */ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) @@ -466,7 +484,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->curframe = src->curframe; - dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -732,6 +749,7 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; + regs[i].parent = NULL; } /* frame pointer */ @@ -876,74 +894,21 @@ next: return 0; } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. - * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */ static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) + const struct bpf_reg_state *state, + struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ - if (regno == BPF_REG_FP) - /* We don't need to worry about FP liveness because it's read-only */ - return 0; - while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; @@ -969,7 +934,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + /* We don't need to worry about FP liveness because it's read-only */ + if (regno != BPF_REG_FP) + return mark_reg_read(env, ®s[regno], + regs[regno].parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1080,8 +1048,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack */ - state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1106,61 +1074,6 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) -{ - bool writes = parent == state->parent; /* Observe write marks */ - - while (parent) { - if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) - /* since LIVE_WRITTEN mark is only done for full 8-byte - * write the read marks are conservative and parent - * state may not even have the stack allocated. In such case - * end the propagation, since the loop reached beginning - * of the function - */ - break; - /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) - break; - /* ... then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; - state = parent; - parent = state->parent; - writes = true; - } -} - static int check_stack_read(struct bpf_verifier_env *env, struct bpf_func_state *reg_state /* func where register points to */, int off, int size, int value_regno) @@ -1198,8 +1111,8 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); return 0; } else { int zeros = 0; @@ -1215,8 +1128,8 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1908,8 +1821,8 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, - spi, state->frameno); + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent); } return update_stack_depth(env, state, off); } @@ -2366,11 +2279,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* copy r1 - r5 args that callee can access */ + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ + /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, caller->regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -3163,7 +3078,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } @@ -4370,7 +4285,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; if (rold->type == PTR_TO_STACK) /* two stack pointers are equal only if they're pointing to @@ -4603,7 +4518,7 @@ static bool states_equal(struct bpf_verifier_env *env, * equivalent state (jump target or such) we didn't arrive by the straight-line * code, so read marks in the state must propagate to the parent regardless * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for. */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, @@ -4624,7 +4539,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); + err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], + &vparent->frame[vstate->curframe]->regs[i]); if (err) return err; } @@ -4639,7 +4555,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + mark_reg_read(env, &state->stack[i].spilled_ptr, + &parent->stack[i].spilled_ptr); } } return err; @@ -4649,7 +4566,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err; sl = env->explored_states[insn_idx]; @@ -4691,16 +4608,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - err = copy_verifier_state(&new_sl->state, cur); + new = &new_sl->state; + err = copy_verifier_state(new, cur); if (err) { - free_verifier_state(&new_sl->state, false); + free_verifier_state(new, false); kfree(new_sl); return err; } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - cur->parent = &new_sl->state; + for (i = 0; i < BPF_REG_FP; i++) + cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -4713,9 +4632,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { struct bpf_func_state *frame = cur->frame[j]; + struct bpf_func_state *newframe = new->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.parent = + &newframe->stack[i].spilled_ptr; + } } return 0; } @@ -4734,7 +4657,6 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; - state->parent = NULL; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); diff --git a/kernel/cpu.c b/kernel/cpu.c index ed44d7d34c2d..0097acec1c71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -102,8 +102,6 @@ static inline void cpuhp_lock_release(bool bringup) { } * @name: Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step - * @skip_onerr: Do not invoke the functions on error rollback - * Will go away once the notifiers are gone * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { @@ -119,7 +117,6 @@ struct cpuhp_step { struct hlist_node *node); } teardown; struct hlist_head list; - bool skip_onerr; bool cant_stop; bool multi_instance; }; @@ -550,12 +547,8 @@ static int bringup_cpu(unsigned int cpu) static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { - for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - } + for (st->state--; st->state > st->target; st->state--) + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, @@ -614,15 +607,15 @@ static void cpuhp_thread_fun(unsigned int cpu) bool bringup = st->bringup; enum cpuhp_state state; + if (WARN_ON_ONCE(!st->should_run)) + return; + /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (WARN_ON_ONCE(!st->should_run)) - return; - cpuhp_lock_acquire(bringup); if (st->single) { @@ -644,12 +637,6 @@ static void cpuhp_thread_fun(unsigned int cpu) WARN_ON_ONCE(!cpuhp_is_ap_state(state)); - if (st->rollback) { - struct cpuhp_step *step = cpuhp_get_step(state); - if (step->skip_onerr) - goto next; - } - if (cpuhp_is_atomic_state(state)) { local_irq_disable(); st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); @@ -673,7 +660,6 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; } -next: cpuhp_lock_release(bringup); if (!st->should_run) @@ -916,12 +902,8 @@ void cpuhp_report_idle_dead(void) static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) { - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } + for (st->state++; st->state < st->target; st->state++) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, @@ -934,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st); + if (st->state < prev_state) + undo_cpu_down(cpu, st); break; } } @@ -987,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); - if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) { cpuhp_reset_state(st, prev_state); __cpuhp_kick_ap(st); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1c35b7b945d0..de87b0282e74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -168,7 +168,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, int dma_direct_supported(struct device *dev, u64 mask) { #ifdef CONFIG_ZONE_DMA - if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))) return 0; #else /* @@ -177,7 +177,7 @@ int dma_direct_supported(struct device *dev, u64 mask) * memory, or by providing a ZONE_DMA32. If neither is the case, the * architecture needs to use an IOMMU instead of the direct mapping. */ - if (mask < DMA_BIT_MASK(32)) + if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) return 0; #endif /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 2a62b96600ad..c80549bf82c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); - if (err) { - if (!bp->attr.disabled) - _perf_event_enable(bp); - return err; - } - - if (!attr->disabled) + if (!bp->attr.disabled) _perf_event_enable(bp); - return 0; + + return err; } static int perf_event_modify_attr(struct perf_event *event, @@ -5948,6 +5943,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; + mm_segment_t fs; /* * We dump: @@ -5965,7 +5961,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); + fs = get_fs(); + set_fs(USER_DS); rem = __output_copy_user(handle, (void *) sp, dump_size); + set_fs(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b3814fce5ecb..d6b56180827c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { + int err; + /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - if (!attr->disabled) { - int err = modify_user_hw_breakpoint_check(bp, attr, false); + err = modify_user_hw_breakpoint_check(bp, attr, false); - if (err) - return err; + if (!bp->attr.disabled) perf_event_enable(bp); - bp->attr.disabled = 0; - } - return 0; + + return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/fork.c b/kernel/fork.c index d896e9ca38b0..f0b58479534f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -550,8 +550,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01ebdf1f9f40..2e62503bea0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { - WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); + WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e406c5fdb41e..dd13f865ad40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,7 +55,6 @@ #include "lockdep_internals.h" -#include <trace/events/preemptirq.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 1a81a1257b3f..3f8a35104285 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock, /* * wake_up_process() paired with set_current_state() * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_lock_check_stamp() or has a + * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ if (owner != current) @@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, current); lock_contended(&lock->dep_map, ip); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5b915b370d5a..0be047dbd897 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -324,7 +324,7 @@ static int __test_cycle(unsigned int nthreads) if (!cycle->result) continue; - pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", n, nthreads, cycle->result); ret = -EINVAL; break; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 924e37fb1620..9bf5404397e0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -38,7 +38,6 @@ #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> -#include <linux/notifier.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> @@ -352,7 +351,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1882,9 +1880,6 @@ int vprintk_store(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (suppress_message_printing(level)) - lflags |= LOG_NOCONS; - return log_output(facility, level, lflags, dict, dictlen, text, text_len); } @@ -2033,6 +2028,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2369,10 +2365,11 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + if (suppress_message_printing(msg->level)) { /* - * Skip record if !ignore_loglevel, and - * record has level above the console loglevel. + * Skip record we have buffered and already printed + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index a0a74c533e4b..0913b4d385de 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -306,12 +306,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void printk_nmi_enter(void) +void notrace printk_nmi_enter(void) { this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } -void printk_nmi_exit(void) +void notrace printk_nmi_exit(void) { this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 60caf1fb94e0..6383aa6a60ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - static_key_disable(&sched_feat_keys[i]); + static_key_disable_cpuslocked(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - static_key_enable(&sched_feat_keys[i]); + static_key_enable_cpuslocked(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); + cpus_read_lock(); inode_lock(inode); ret = sched_feat_set(cmp); inode_unlock(inode); + cpus_read_unlock(); if (ret < 0) return ret; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..f808ddf2a868 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3362,6 +3362,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach + * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. @@ -7263,6 +7264,7 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; + const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7299,8 +7301,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7365,13 +7369,16 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; + const struct sched_class *curr_class; struct rq_flags rf; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7482,10 +7489,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -7507,7 +7514,7 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(cpu); + unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); @@ -8269,7 +8276,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); - return sds.busiest; + return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; @@ -9638,7 +9645,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 56a0fed30c0a..505a41c42b96 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (sched_domains_numa_levels <= 1) { + if (sched_domains_numa_levels <= 2) { sched_numa_topology_type = NUMA_DIRECT; return; } @@ -1380,9 +1380,6 @@ void sched_init_numa(void) break; } - if (!level) - return; - /* * 'level' contains the number of unique distances * diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(&watchdog_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* - * If the clocksource is registered clocksource_watchdog_work() will + * If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(&cs->list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs: clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(&clocksource_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int __clocksource_watchdog_work(void) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } @@ -810,7 +830,7 @@ static int __init clocksource_done_booting(void) /* * Run the watchdog first to eliminate unstable clock sources */ - __clocksource_watchdog_work(); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5470dce212c0..977918d5d350 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -261,7 +261,7 @@ static void __touch_watchdog(void) * entering idle state. This should only be used for scheduler events. * Use touch_softlockup_watchdog() for everything else. */ -void touch_softlockup_watchdog_sched(void) +notrace void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -270,7 +270,7 @@ void touch_softlockup_watchdog_sched(void) raw_cpu_write(watchdog_touch_ts, 0); } -void touch_softlockup_watchdog(void) +notrace void touch_softlockup_watchdog(void) { touch_softlockup_watchdog_sched(); wq_watchdog_touch(raw_smp_processor_id()); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 1f7020d65d0a..71381168dede 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -29,7 +29,7 @@ static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; static atomic_t watchdog_cpus = ATOMIC_INIT(0); -void arch_touch_nmi_watchdog(void) +notrace void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 60e80198c3df..0280deac392e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5574,7 +5574,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) mod_timer(&wq_watchdog_timer, jiffies + thresh); } -void wq_watchdog_touch(int cpu) +notrace void wq_watchdog_touch(int cpu) { if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; |