diff options
Diffstat (limited to 'kernel')
107 files changed, 2674 insertions, 2463 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..027107f4be53 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, + .map_release_uref = bpf_fd_array_map_clear, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d315b393abdd..6ef6746a7871 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) { + const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s64 imm = insn->imm; + + if (curr < pos && curr + imm + 1 > pos) + imm += delta; + else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + imm -= delta; + if (imm < imm_min || imm > imm_max) + return -ERANGE; + if (!probe_pass) + insn->imm = imm; + return 0; +} + +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) +{ + const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 off = insn->off; + + if (curr < pos && curr + off + 1 > pos) + off += delta; + else if (curr > pos + delta && curr + off + 1 <= pos + delta) + off -= delta; + if (off < off_min || off > off_max) + return -ERANGE; + if (!probe_pass) + insn->off = off; + return 0; +} + +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, + const bool probe_pass) +{ + u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); struct bpf_insn *insn = prog->insnsi; - u32 i, insn_cnt = prog->len; - bool pseudo_call; - u8 code; - int off; + int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { + u8 code; + + /* In the probing pass we still operate on the original, + * unpatched image in order to check overflows before we + * do any other adjustments. Therefore skip the patchlet. + */ + if (probe_pass && i == pos) { + i += delta + 1; + insn++; + } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP) - continue; - if (BPF_OP(code) == BPF_EXIT) + if (BPF_CLASS(code) != BPF_JMP || + BPF_OP(code) == BPF_EXIT) continue; + /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - pseudo_call = true; - else + if (insn->src_reg != BPF_PSEUDO_CALL) continue; + ret = bpf_adj_delta_to_imm(insn, pos, delta, i, + probe_pass); } else { - pseudo_call = false; + ret = bpf_adj_delta_to_off(insn, pos, delta, i, + probe_pass); } - off = pseudo_call ? insn->imm : insn->off; - - /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + off + 1 > pos) - off += delta; - else if (i > pos + delta && i + off + 1 <= pos + delta) - off -= delta; - - if (pseudo_call) - insn->imm = off; - else - insn->off = off; + if (ret) + break; } + + return ret; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; /* Since our patchlet doesn't expand the image, we're done. */ @@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, insn_adj_cnt = prog->len + insn_delta; + /* Reject anything that would potentially let the insn->off + * target overflow when we have excessive program expansions. + * We need to probe here before we do any reallocation where + * we afterwards may not fail anymore. + */ + if (insn_adj_cnt > cnt_max && + bpf_adj_branches(prog, off, insn_delta, true)) + return NULL; + /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. @@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); - bpf_adj_branches(prog_adj, off, insn_delta); + /* We are guaranteed to not fail at this point, otherwise + * the ship has sailed to reverse to the original state. An + * overflow cannot happen at this point. + */ + BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); return prog_adj; } @@ -1572,13 +1622,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) return cnt; } +static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + u32 *prog_ids, + u32 request_cnt) +{ + int i = 0; + + for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; + prog_ids[i] = (*prog)->aux->id; + if (++i == request_cnt) { + prog++; + break; + } + } + + return !!(*prog); +} + int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; unsigned long err = 0; - u32 i = 0, *ids; bool nospc; + u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); @@ -1595,16 +1664,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) - continue; - ids[i] = (*prog)->aux->id; - if (++i == cnt) { - prog++; - break; - } - } - nospc = !!(*prog); + nospc = bpf_prog_array_copy_core(prog, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1683,22 +1743,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt) + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt) { + struct bpf_prog **prog; u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); - if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; + *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; - return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); + /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ + prog = rcu_dereference_check(array, 1)->progs; + return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + : 0; } static void bpf_prog_free_deferred(struct work_struct *work) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8dd9210d7db7..95a84b2f10ce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -43,6 +43,7 @@ #include <net/tcp.h> #include <linux/ptr_ring.h> #include <net/inet_common.h> +#include <linux/sched/signal.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -325,6 +326,9 @@ retry: if (ret > 0) { if (apply) apply_bytes -= ret; + + sg->offset += ret; + sg->length -= ret; size -= ret; offset += ret; if (uncharge) @@ -332,8 +336,6 @@ retry: goto retry; } - sg->length = size; - sg->offset = offset; return ret; } @@ -391,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) } while (i != md->sg_end); } -static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +static void free_bytes_sg(struct sock *sk, int bytes, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = md->sg_start, free; @@ -401,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (bytes < free) { sg[i].length -= bytes; sg[i].offset += bytes; - sk_mem_uncharge(sk, bytes); + if (charge) + sk_mem_uncharge(sk, bytes); break; } - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); put_page(sg_page(&sg[i])); bytes -= sg[i].length; sg[i].length = 0; @@ -416,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + md->sg_start = i; } static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) @@ -523,8 +529,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, i = md->sg_start; do { - r->sg_data[i] = md->sg_data[i]; - size = (apply && apply_bytes < md->sg_data[i].length) ? apply_bytes : md->sg_data[i].length; @@ -535,6 +539,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, } sk_mem_charge(sk, size); + r->sg_data[i] = md->sg_data[i]; r->sg_data[i].length = size; md->sg_data[i].length -= size; md->sg_data[i].offset += size; @@ -575,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) { + bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; struct scatterlist *sg; - int i, err, free = 0; - bool ingress = !!(md->flags & BPF_F_INGRESS); + int err = 0; sg = md->sg_data; @@ -606,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, out_rcu: rcu_read_unlock(); out: - i = md->sg_start; - while (sg[i].length) { - free += sg[i].length; - put_page(sg_page(&sg[i])); - sg[i].length = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } - return free; + free_bytes_sg(NULL, send, md, false); + return err; } static inline void bpf_md_init(struct smap_psock *psock) @@ -700,19 +697,26 @@ more_data: err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); lock_sock(sk); + if (unlikely(err < 0)) { + free_start_sg(sk, m); + psock->sg_size = 0; + if (!cork) + *copied -= send; + } else { + psock->sg_size -= send; + } + if (cork) { free_start_sg(sk, m); + psock->sg_size = 0; kfree(m); m = NULL; + err = 0; } - if (unlikely(err)) - *copied -= err; - else - psock->sg_size -= send; break; case __SK_DROP: default: - free_bytes_sg(sk, send, m); + free_bytes_sg(sk, send, m, true); apply_bytes_dec(psock, send); *copied -= send; psock->sg_size -= send; @@ -732,6 +736,26 @@ out_err: return err; } +static int bpf_wait_data(struct sock *sk, + struct smap_psock *psk, int flags, + long timeo, int *err) +{ + int rc; + + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, &timeo, + !list_empty(&psk->ingress) || + !skb_queue_empty(&sk->sk_receive_queue), + &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + return rc; +} + static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -755,6 +779,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); +bytes_ready: while (copied != len) { struct scatterlist *sg; struct sk_msg_buff *md; @@ -809,6 +834,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + if (!copied) { + long timeo; + int data; + int err = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + data = bpf_wait_data(sk, psock, flags, timeo, &err); + + if (data) { + if (!skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + smap_release_sock(psock, sk); + copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + return copied; + } + goto bytes_ready; + } + + if (err) + copied = err; + } + release_sock(sk); smap_release_sock(psock, sk); return copied; @@ -1442,9 +1489,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); @@ -1659,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + verdict = bpf_prog_inc_not_zero(verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(stab->bpf_parse); + parse = bpf_prog_inc_not_zero(parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1671,12 +1715,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(tx_msg); if (IS_ERR(tx_msg)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } return PTR_ERR(tx_msg); } } @@ -1761,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, out_free: smap_release_sock(psock, sock); out_progs: - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } if (tx_msg) bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); @@ -1834,7 +1878,7 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } -static void sock_map_release(struct bpf_map *map, struct file *map_file) +static void sock_map_release(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; @@ -1858,7 +1902,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_release = sock_map_release, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..016ef9025827 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> +#include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -102,12 +103,14 @@ const struct bpf_map_ops bpf_map_offload_ops = { static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; + u32 type = attr->map_type; struct bpf_map *map; int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + if (type >= ARRAY_SIZE(bpf_map_types)) return ERR_PTR(-EINVAL); - ops = bpf_map_types[attr->map_type]; + type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[type]; if (!ops) return ERR_PTR(-EINVAL); @@ -122,7 +125,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (IS_ERR(map)) return map; map->ops = ops; - map->map_type = attr->map_type; + map->map_type = type; return map; } @@ -257,8 +260,8 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { if (atomic_dec_and_test(&map->usercnt)) { - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - bpf_fd_array_map_clear(map); + if (map->ops->map_release_uref) + map->ops->map_release_uref(map); } } @@ -871,11 +874,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + const struct bpf_prog_ops *ops; + + if (type >= ARRAY_SIZE(bpf_prog_types)) + return -EINVAL; + type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); + ops = bpf_prog_types[type]; + if (!ops) return -EINVAL; if (!bpf_prog_is_dev_bound(prog->aux)) - prog->aux->ops = bpf_prog_types[type]; + prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5dd1dcb902bf..1904e814f282 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +#define BPF_MAP_PTR_UNPRIV 1UL +#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ + POISON_POINTER_DELTA)) +#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) + +static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +} + +static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_state & BPF_MAP_PTR_UNPRIV; +} + +static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, + const struct bpf_map *map, bool unpriv) +{ + BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); + unpriv |= bpf_map_ptr_unpriv(aux); + aux->map_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -978,7 +1000,7 @@ static bool register_is_null(struct bpf_reg_state *reg) */ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno) + int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -1017,8 +1039,33 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - for (i = 0; i < BPF_REG_SIZE; i++) + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_MISC && + !env->allow_ptr_leaks) { + int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; + int soff = (-spi - 1) * BPF_REG_SIZE; + + /* detected reuse of integer stack slot with a pointer + * which means either llvm is reusing stack slot or + * an attacker is trying to exploit CVE-2018-3639 + * (speculative store bypass) + * Have to sanitize that slot with preemptive + * store of zero. + */ + if (*poff && *poff != soff) { + /* disallow programs where single insn stores + * into two different stack slots, since verifier + * cannot sanitize them + */ + verbose(env, + "insn %d cannot access two stack slots fp%d and fp%d", + insn_idx, *poff, soff); + return -EINVAL; + } + *poff = soff; + } state->stack[spi].slot_type[i] = STACK_SPILL; + } } else { u8 type = STACK_MISC; @@ -1694,7 +1741,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, - value_regno); + value_regno, insn_idx); else err = check_stack_read(env, state, off, size, value_regno); @@ -2333,6 +2380,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static int +record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + + if (func_id != BPF_FUNC_tail_call && + func_id != BPF_FUNC_map_lookup_elem) + return 0; + if (meta->map_ptr == NULL) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + if (!BPF_MAP_PTR(aux->map_state)) + bpf_map_ptr_store(aux, meta->map_ptr, + meta->map_ptr->unpriv_array); + else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, + meta->map_ptr->unpriv_array); + return 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2387,13 +2457,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - if (func_id == BPF_FUNC_tail_call) { - if (meta.map_ptr == NULL) { - verbose(env, "verifier bug\n"); - return -EINVAL; - } - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; - } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -2404,6 +2467,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_map(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -2428,8 +2495,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - struct bpf_insn_aux_data *insn_aux; - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); @@ -2445,11 +2510,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - insn_aux = &env->insn_aux_data[insn_idx]; - if (!insn_aux->map_ptr) - insn_aux->map_ptr = meta.map_ptr; - else if (insn_aux->map_ptr != meta.map_ptr) - insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -5169,6 +5229,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else continue; + if (type == BPF_WRITE && + env->insn_aux_data[i + delta].sanitize_stack_off) { + struct bpf_insn patch[] = { + /* Sanitize suspicious stack slot with zero. + * There are no memory dependencies for this store, + * since it's only using frame pointer and immediate + * constant of zero + */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, + env->insn_aux_data[i + delta].sanitize_stack_off, + 0), + /* the original STX instruction will immediately + * overwrite the same stack slot with appropriate value + */ + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; @@ -5417,6 +5505,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; @@ -5491,19 +5580,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + aux = &env->insn_aux_data[i + delta]; + if (!bpf_map_ptr_unpriv(aux)) + continue; + /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON) { + if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } - if (!map_ptr->unpriv_array) - continue; + + map_ptr = BPF_MAP_PTR(aux->map_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -5527,9 +5619,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON || - !map_ptr->ops->map_gen_lookup) + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) + goto patch_call_imm; + + map_ptr = BPF_MAP_PTR(aux->map_state); + if (!map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 2be89a003185..bfcdae896122 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o stat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b928b27050c6..77ff1cd6a252 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* - * stat.c + * rstat.c */ -void cgroup_stat_flush(struct cgroup *cgrp); -int cgroup_stat_init(struct cgroup *cgrp); -void cgroup_stat_exit(struct cgroup *cgrp); -void cgroup_stat_show_cputime(struct seq_file *seq); -void cgroup_stat_boot(void); +int cgroup_rstat_init(struct cgroup *cgrp); +void cgroup_rstat_exit(struct cgroup *cgrp); +void cgroup_rstat_boot(void); +void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c @@ -218,9 +217,9 @@ extern const struct proc_ns_operations cgroupns_operations; * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; -extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a2c05d2476ac..e06c97f3ed1a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = { }; /* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) +int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; @@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) return 0; } -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a662bfcbea0e..acb66713f9b6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/sched/cputime.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -61,6 +62,8 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) +/* let's not notify more than 100 times per second */ +#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * cgroup_mutex is the master lock. Any modification to cgroup or its @@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; +struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); + + del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - list_for_each_entry(cfts, &css->ss->cfts, node) + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_base_files; + else + cfts = cgroup1_base_files; + cgroup_addrm_files(css, cgrp, cfts, false); + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); + } } /** @@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css) else cfts = cgroup1_base_files; - return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - } - - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; + ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + if (ret < 0) + return ret; + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } } @@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void) * tasklist if we walk through it with RCU. */ read_lock(&tasklist_lock); + spin_lock_irq(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + do_each_thread(g, p) { WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void) } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); out_unlock: spin_unlock_irq(&css_set_lock); + read_unlock(&tasklist_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + INIT_LIST_HEAD(&cgrp->rstat_css_list); + prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v) struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; - cgroup_stat_show_cputime(seq); + cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); #endif @@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } +static void cgroup_file_notify_timer(struct timer_list *timer) +{ + cgroup_file_notify(container_of(timer, struct cgroup_file, + notify_timer)); +} + static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { @@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; + timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); @@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile) unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); - if (cfile->kn) - kernfs_notify(cfile->kn); + if (cfile->kn) { + unsigned long last = cfile->notified_at; + unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + } else { + kernfs_notify(cfile->kn); + cfile->notified_at = jiffies; + } + } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } @@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); if (cgroup_on_dfl(cgrp)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ + if (!list_empty(&css->rstat_css_node)) { + cgroup_rstat_flush(cgrp); + list_del_rcu(&css->rstat_css_node); + } + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); @@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work) trace_cgroup_release(cgrp); if (cgroup_on_dfl(cgrp)) - cgroup_stat_flush(cgrp); + cgroup_rstat_flush(cgrp); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) @@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } + if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); + BUG_ON(cgroup_css(cgrp, ss)); } @@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: + list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); @@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_free_cgrp; if (cgroup_on_dfl(parent)) { - ret = cgroup_stat_init(cgrp); + ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; } @@ -4850,7 +4895,7 @@ out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: if (cgroup_on_dfl(parent)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* - * Remove @cgrp directory along with the base files. @cgrp has an - * extra ref on its kn. - */ + /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ + css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (parent && cgroup_is_threaded(cgrp)) @@ -5245,7 +5288,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_stat_boot(); + cgroup_rstat_boot(); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -5335,7 +5378,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); - WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); + WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); return 0; } diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index defad3c5e7dc..d3bbb757ee49 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device); static int parse_resource(char *c, int *intval) { substring_t argstr; - const char **table = &rdmacg_resource_names[0]; char *name, *value = c; size_t len; - int ret, i = 0; + int ret, i; name = strsep(&value, "="); if (!name || !value) return -EINVAL; - len = strlen(value); + i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); + if (i < 0) + return i; - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; + len = strlen(value); - argstr.from = value; - argstr.to = value + len; + argstr.from = value; + argstr.to = value + len; - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + return -EINVAL; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; } return -EINVAL; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c new file mode 100644 index 000000000000..d503d1a9007c --- /dev/null +++ b/kernel/cgroup/rstat.c @@ -0,0 +1,416 @@ +#include "cgroup-internal.h" + +#include <linux/sched/cputime.h> + +static DEFINE_SPINLOCK(cgroup_rstat_lock); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); + +static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_cpu, cpu); +} + +/** + * cgroup_rstat_updated - keep track of updated rstat_cpu + * @cgrp: target cgroup + * @cpu: cpu on which rstat_cpu was updated + * + * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * cgroup_rstat_cpu definition for details. + */ +void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* nothing to do for root */ + if (!cgroup_parent(cgrp)) + return; + + /* + * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * see NULL updated_next or they see our updated stat. + */ + smp_mb(); + + /* + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (rstatc->updated_next) + break; + + rstatc->updated_next = prstatc->updated_children; + prstatc->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} +EXPORT_SYMBOL_GPL(cgroup_rstat_updated); + +/** + * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_rstat_cpu_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + rstatc = cgroup_rstat_cpu(pos, cpu); + if (rstatc->updated_children == pos) + break; + pos = rstatc->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && rstatc->updated_next) { + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup_rstat_cpu *nrstatc; + struct cgroup **nextp; + + nextp = &prstatc->updated_children; + while (true) { + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + + *nextp = rstatc->updated_next; + rstatc->updated_next = NULL; + + /* + * Paired with the one in cgroup_rstat_cpu_updated(). + * Either they see NULL updated_next or we see their + * updated stat. + */ + smp_mb(); + } + + return pos; +} + +/* see cgroup_rstat_flush() */ +static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) +{ + int cpu; + + lockdep_assert_held(&cgroup_rstat_lock); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; + + raw_spin_lock(cpu_lock); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + + cgroup_base_stat_flush(pos, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + raw_spin_unlock(cpu_lock); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || + spin_needbreak(&cgroup_rstat_lock))) { + spin_unlock_irq(&cgroup_rstat_lock); + if (!cond_resched()) + cpu_relax(); + spin_lock_irq(&cgroup_rstat_lock); + } + } +} + +/** + * cgroup_rstat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + * + * This function may block. + */ +void cgroup_rstat_flush(struct cgroup *cgrp) +{ + might_sleep(); + + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); + spin_unlock_irq(&cgroup_rstat_lock); +} + +/** + * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() + * @cgrp: target cgroup + * + * This function can be called from any context. + */ +void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_rstat_lock, flags); + cgroup_rstat_flush_locked(cgrp, false); + spin_unlock_irqrestore(&cgroup_rstat_lock, flags); +} + +/** + * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold + * @cgrp: target cgroup + * + * Flush stats in @cgrp's subtree and prevent further flushes. Must be + * paired with cgroup_rstat_flush_release(). + * + * This function may block. + */ +void cgroup_rstat_flush_hold(struct cgroup *cgrp) + __acquires(&cgroup_rstat_lock) +{ + might_sleep(); + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); +} + +/** + * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + */ +void cgroup_rstat_flush_release(void) + __releases(&cgroup_rstat_lock) +{ + spin_unlock_irq(&cgroup_rstat_lock); +} + +int cgroup_rstat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has rstat_cpu preallocated */ + if (!cgrp->rstat_cpu) { + cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); + if (!cgrp->rstat_cpu) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + rstatc->updated_children = cgrp; + u64_stats_init(&rstatc->bsync); + } + + return 0; +} + +void cgroup_rstat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_rstat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + WARN_ON_ONCE(rstatc->updated_next)) + return; + } + + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; +} + +void __init cgroup_rstat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + + BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); +} + +/* + * Functions for cgroup basic resource statistics implemented on top of + * rstat. + */ +static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime += src_bstat->cputime.utime; + dst_bstat->cputime.stime += src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; +} + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; + struct task_cputime cputime; + struct cgroup_base_stat delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&rstatc->bsync); + cputime = rstatc->bstat.cputime; + } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + + /* calculate the delta to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); + memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_base_stat_accumulate(&cgrp->bstat, &delta); + if (parent) + cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); +} + +static struct cgroup_rstat_cpu * +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = get_cpu_ptr(cgrp->rstat_cpu); + u64_stats_update_begin(&rstatc->bsync); + return rstatc; +} + +static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, + struct cgroup_rstat_cpu *rstatc) +{ + u64_stats_update_end(&rstatc->bsync); + cgroup_rstat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(rstatc); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + rstatc->bstat.cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + rstatc->bstat.cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void cgroup_base_stat_cputime_show(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); + cgroup_rstat_flush_release(); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c deleted file mode 100644 index 1e111dd455c4..000000000000 --- a/kernel/cgroup/stat.c +++ /dev/null @@ -1,338 +0,0 @@ -#include "cgroup-internal.h" - -#include <linux/sched/cputime.h> - -static DEFINE_MUTEX(cgroup_stat_mutex); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); - -static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) -{ - return per_cpu_ptr(cgrp->cpu_stat, cpu); -} - -/** - * cgroup_cpu_stat_updated - keep track of updated cpu_stat - * @cgrp: target cgroup - * @cpu: cpu on which cpu_stat was updated - * - * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching - * cpu_stat->updated_children list. See the comment on top of - * cgroup_cpu_stat definition for details. - */ -static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *parent; - unsigned long flags; - - /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. - * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by - * testing the next pointer for NULL. - */ - if (cgroup_cpu_stat(cgrp, cpu)->updated_next) - return; - - raw_spin_lock_irqsave(cpu_lock, flags); - - /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - - /* - * Both additions and removals are bottom-up. If a cgroup - * is already in the tree, all ancestors are. - */ - if (cstat->updated_next) - break; - - cstat->updated_next = pcstat->updated_children; - pcstat->updated_children = cgrp; - } - - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - -/** - * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree - * @pos: current position - * @root: root of the tree to traversal - * @cpu: target cpu - * - * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_cpu_stat_lock held. - * - * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. - */ -static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) -{ - struct cgroup_cpu_stat *cstat; - struct cgroup *parent; - - if (pos == root) - return NULL; - - /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. - */ - if (!pos) - pos = root; - else - pos = cgroup_parent(pos); - - /* walk down to the first leaf */ - while (true) { - cstat = cgroup_cpu_stat(pos, cpu); - if (cstat->updated_children == pos) - break; - pos = cstat->updated_children; - } - - /* - * Unlink @pos from the tree. As the updated_children list is - * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. - */ - parent = cgroup_parent(pos); - if (parent && cstat->updated_next) { - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - struct cgroup_cpu_stat *ncstat; - struct cgroup **nextp; - - nextp = &pcstat->updated_children; - while (true) { - ncstat = cgroup_cpu_stat(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &ncstat->updated_next; - } - - *nextp = cstat->updated_next; - cstat->updated_next = NULL; - } - - return pos; -} - -static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, - struct cgroup_stat *src_stat) -{ - dst_stat->cputime.utime += src_stat->cputime.utime; - dst_stat->cputime.stime += src_stat->cputime.stime; - dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; -} - -static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct task_cputime *last_cputime = &cstat->last_cputime; - struct task_cputime cputime; - struct cgroup_stat delta; - unsigned seq; - - lockdep_assert_held(&cgroup_stat_mutex); - - /* fetch the current per-cpu values */ - do { - seq = __u64_stats_fetch_begin(&cstat->sync); - cputime = cstat->cputime; - } while (__u64_stats_fetch_retry(&cstat->sync, seq)); - - /* accumulate the deltas to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_stat_accumulate(&delta, &cgrp->pending_stat); - memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_stat_accumulate(&cgrp->stat, &delta); - if (parent) - cgroup_stat_accumulate(&parent->pending_stat, &delta); -} - -/* see cgroup_stat_flush() */ -static void cgroup_stat_flush_locked(struct cgroup *cgrp) -{ - int cpu; - - lockdep_assert_held(&cgroup_stat_mutex); - - for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *pos = NULL; - - raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) - cgroup_cpu_stat_flush_one(pos, cpu); - raw_spin_unlock_irq(cpu_lock); - } -} - -/** - * cgroup_stat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - */ -void cgroup_stat_flush(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_stat_mutex); - cgroup_stat_flush_locked(cgrp); - mutex_unlock(&cgroup_stat_mutex); -} - -static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) -{ - struct cgroup_cpu_stat *cstat; - - cstat = get_cpu_ptr(cgrp->cpu_stat); - u64_stats_update_begin(&cstat->sync); - return cstat; -} - -static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_cpu_stat *cstat) -{ - u64_stats_update_end(&cstat->sync); - cgroup_cpu_stat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(cstat); -} - -void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - cstat->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void __cgroup_account_cputime_field(struct cgroup *cgrp, - enum cpu_usage_stat index, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - - switch (index) { - case CPUTIME_USER: - case CPUTIME_NICE: - cstat->cputime.utime += delta_exec; - break; - case CPUTIME_SYSTEM: - case CPUTIME_IRQ: - case CPUTIME_SOFTIRQ: - cstat->cputime.stime += delta_exec; - break; - default: - break; - } - - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void cgroup_stat_show_cputime(struct seq_file *seq) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - mutex_lock(&cgroup_stat_mutex); - - cgroup_stat_flush_locked(cgrp); - - usage = cgrp->stat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, - &utime, &stime); - - mutex_unlock(&cgroup_stat_mutex); - - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - - seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); -} - -int cgroup_stat_init(struct cgroup *cgrp) -{ - int cpu; - - /* the root cgrp has cpu_stat preallocated */ - if (!cgrp->cpu_stat) { - cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); - if (!cgrp->cpu_stat) - return -ENOMEM; - } - - /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - cstat->updated_children = cgrp; - u64_stats_init(&cstat->sync); - } - - prev_cputime_init(&cgrp->stat.prev_cputime); - - return 0; -} - -void cgroup_stat_exit(struct cgroup *cgrp) -{ - int cpu; - - cgroup_stat_flush(cgrp); - - /* sanity check */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - if (WARN_ON_ONCE(cstat->updated_children != cgrp) || - WARN_ON_ONCE(cstat->updated_next)) - return; - } - - free_percpu(cgrp->cpu_stat); - cgrp->cpu_stat = NULL; -} - -void __init cgroup_stat_boot(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); - - BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); -} diff --git a/kernel/compat.c b/kernel/compat.c index 6d21894806b4..702aa846ddac 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { struct compat_timex tx32; + memset(txc, 0, sizeof(struct timex)); if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; @@ -120,50 +121,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) -{ - struct compat_timespec ts; - int ret; - - ret = copy_from_user(&ts, cts, sizeof(ts)); - if (ret) - return -EFAULT; - - ts64->tv_sec = ts.tv_sec; - ts64->tv_nsec = ts.tv_nsec; - - return 0; -} - -static int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) -{ - struct compat_timespec ts = { - .tv_sec = ts64->tv_sec, - .tv_nsec = ts64->tv_nsec - }; - return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; -} - -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec64); - -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec64); - int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) @@ -367,6 +324,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } +/* Todo: Delete these extern declarations when get/put_compat_itimerspec64() + * are moved to kernel/time/time.c . + */ +extern int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts); +extern int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts); + int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e2764d767f18..ca8ac2824f0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk) { tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) - spin_lock_init(&tsk->delays->lock); + raw_spin_lock_init(&tsk->delays->lock); } /* * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, + u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; @@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; } @@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) __u64 ret; unsigned long flags; - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } diff --git a/kernel/dma.c b/kernel/dma.c index 3506fc34a712..40f152936316 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v) } #endif /* MAX_DMA_CHANNELS */ -static int proc_dma_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_dma_show, NULL); -} - -static const struct file_operations proc_dma_operations = { - .open = proc_dma_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_dma_init(void) { - proc_create("dma", 0, NULL, &proc_dma_operations); + proc_create_single("dma", 0, NULL, proc_dma_show); return 0; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 772a43fea825..c187aa3df3c8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -119,23 +119,20 @@ int get_callchain_buffers(int event_max_stack) goto exit; } - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - /* - * If requesting per event more than the global cap, - * return a different error to help userspace figure - * this out. - * - * And also do it here so that we have &callchain_mutex held. - */ - if (event_max_stack > sysctl_perf_event_max_stack) - err = -EOVERFLOW; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) { + err = -EOVERFLOW; goto exit; } - err = alloc_callchain_buffers(); + if (count == 1) + err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d5fe26551f8..08f5e1b42b43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5120,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): + case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): + case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; @@ -6668,7 +6670,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (filter->inode) { + if (filter->path.dentry) { event->addr_filters_offs[count] = 0; restart++; } @@ -7333,7 +7335,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file_inode(file)) + if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) @@ -7587,6 +7589,10 @@ static void perf_event_switch(struct task_struct *task, }, }; + if (!sched_in && task->state == TASK_RUNNING) + switch_event.event_id.header.misc |= + PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); @@ -8682,8 +8688,7 @@ static void free_filters_list(struct list_head *filters) struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { - if (filter->inode) - iput(filter->inode); + path_put(&filter->path); list_del(&filter->entry); kfree(filter); } @@ -8780,7 +8785,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) * Adjust base offset if the filter is associated to a binary * that needs to be mapped: */ - if (filter->inode) + if (filter->path.dentry) event->addr_filters_offs[count] = perf_addr_filter_apply(filter, mm); @@ -8854,7 +8859,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; - struct path path; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; @@ -8967,19 +8971,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, goto fail_free_name; /* look up the path and grab its inode */ - ret = kern_path(filename, LOOKUP_FOLLOW, &path); + ret = kern_path(filename, LOOKUP_FOLLOW, + &filter->path); if (ret) goto fail_free_name; - filter->inode = igrab(d_inode(path.dentry)); - path_put(&path); kfree(filename); filename = NULL; ret = -EINVAL; - if (!filter->inode || - !S_ISREG(filter->inode->i_mode)) - /* free_filters_list() will iput() */ + if (!filter->path.dentry || + !S_ISREG(d_inode(filter->path.dentry) + ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; @@ -10205,9 +10208,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) - ret = -EINVAL; + return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) - ret = -EINVAL; + return -EINVAL; } if (!attr->sample_max_stack) @@ -10517,19 +10520,20 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { + if (group_leader) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. + * If the event is a sw event, but the group_leader + * is on hw context. * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && + pmu = group_leader->ctx->pmu; + } else if (!is_software_event(event) && + is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6c6b3c48db71..1d8ca9ea9979 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> +#include <linux/nospec.h> #include "internal.h" @@ -867,8 +868,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) return NULL; /* AUX space */ - if (pgoff >= rb->aux_pgoff) - return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + if (pgoff >= rb->aux_pgoff) { + int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); + return virt_to_page(rb->aux_pages[aux_pgoff]); + } } return __perf_mmap_to_page(rb, pgoff); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce6848e46e94..1725b902983f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (!uprobe) return NULL; - uprobe->inode = igrab(inode); + uprobe->inode = inode; uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (cur_uprobe) { kfree(uprobe); uprobe = cur_uprobe; - iput(inode); } return uprobe; @@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe) rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - iput(uprobe->inode); put_uprobe(uprobe); } @@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. + * unregisters. Caller of uprobe_register() is required to keep @inode + * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a5697119290e..33f07c5f2515 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v) return 0; } -static int execdomains_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, execdomains_proc_show, NULL); -} - -static const struct file_operations execdomains_proc_fops = { - .open = execdomains_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_execdomains_init(void) { - proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + proc_create_single("execdomains", 0, NULL, execdomains_proc_show); return 0; } module_init(proc_execdomains_init); diff --git a/kernel/fork.c b/kernel/fork.c index cd18448b025a..80b48a8fb47b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; -#ifdef CONFIG_DEBUG_KMEMLEAK /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -#endif + tsk->stack_vm_area = s; return s->addr; } diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index fc4f361a86bb..dd20d0d528d4 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. + * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl> */ #include <linux/slab.h> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2a8571f72b17..4ca2fd46645d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -76,6 +76,19 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data, data->chip->irq_write_msi_msg(data, msg); } +static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * If the MSI provider has messed with the second message and + * not advertized that it is level-capable, signal the breakage. + */ + WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) && + (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) && + (msg[1].address_lo || msg[1].address_hi || msg[1].data)); +} + /** * msi_domain_set_affinity - Generic affinity setter function for MSI domains * @irq_data: The irq data associated to the interrupt @@ -89,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - struct msi_msg msg; + struct msi_msg msg[2] = { [1] = { }, }; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - irq_chip_write_msi_msg(irq_data, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, msg)); + msi_check_level(irq_data->domain, msg); + irq_chip_write_msi_msg(irq_data, msg); } return ret; @@ -104,20 +118,21 @@ int msi_domain_set_affinity(struct irq_data *irq_data, static int msi_domain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool early) { - struct msi_msg msg; + struct msi_msg msg[2] = { [1] = { }, }; - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - irq_chip_write_msi_msg(irq_data, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, msg)); + msi_check_level(irq_data->domain, msg); + irq_chip_write_msi_msg(irq_data, msg); return 0; } static void msi_domain_deactivate(struct irq_domain *domain, struct irq_data *irq_data) { - struct msi_msg msg; + struct msi_msg msg[2]; - memset(&msg, 0, sizeof(msg)); - irq_chip_write_msi_msg(irq_data, &msg); + memset(msg, 0, sizeof(msg)); + irq_chip_write_msi_msg(irq_data, msg); } static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7cb091d81d91..37eda10f5c36 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -185,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } -static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); -} - static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -198,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; -static const struct file_operations irq_affinity_hint_proc_fops = { - .open = irq_affinity_hint_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static const struct file_operations irq_affinity_list_proc_fops = { .open = irq_affinity_list_proc_open, .read = seq_read, @@ -223,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE_LIST, m); } - -static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); -} - -static int irq_effective_aff_list_proc_open(struct inode *inode, - struct file *file) -{ - return single_open(file, irq_effective_aff_list_proc_show, - PDE_DATA(inode)); -} - -static const struct file_operations irq_effective_aff_proc_fops = { - .open = irq_effective_aff_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations irq_effective_aff_list_proc_fops = { - .open = irq_effective_aff_list_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int default_affinity_show(struct seq_file *m, void *v) @@ -313,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } - -static int irq_node_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_node_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_node_proc_fops = { - .open = irq_node_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -337,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) return 0; } -static int irq_spurious_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_spurious_proc_fops = { - .open = irq_spurious_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -421,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, irqp); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0444, desc->dir, - &irq_affinity_hint_proc_fops, irqp); + proc_create_single_data("affinity_hint", 0444, desc->dir, + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, irqp); - proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, + irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - proc_create_data("effective_affinity", 0444, desc->dir, - &irq_effective_aff_proc_fops, irqp); - proc_create_data("effective_affinity_list", 0444, desc->dir, - &irq_effective_aff_list_proc_fops, irqp); + proc_create_single_data("effective_affinity", 0444, desc->dir, + irq_effective_aff_proc_show, irqp); + proc_create_single_data("effective_affinity_list", 0444, desc->dir, + irq_effective_aff_list_proc_show, irqp); # endif #endif - proc_create_data("spurious", 0444, desc->dir, - &irq_spurious_proc_fops, (void *)(long)irq); + proc_create_single_data("spurious", 0444, desc->dir, + irq_spurious_proc_show, (void *)(long)irq); out_unlock: mutex_unlock(®ister_lock); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 102160ff5c66..ea619021d901 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, (void *)ent->end_addr, (void *)ent->start_addr); return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index cd50e99202b0..481951bf091d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -55,7 +55,6 @@ enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, - KTHREAD_IS_PARKED, }; static inline void set_kthread_struct(void *kthread) @@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_PARKED); - while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { - if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) - complete(&self->parked); + for (;;) { + set_current_state(TASK_PARKED); + if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) + break; schedule(); - __set_current_state(TASK_PARKED); } - clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); } @@ -194,6 +191,11 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +void kthread_park_complete(struct task_struct *k) +{ + complete_all(&to_kthread(k)->parked); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -450,22 +452,16 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - /* - * Newly created kthread was parked when the CPU was offline. - * The binding was lost and we need to set it again. - */ - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu, TASK_PARKED); - wake_up_state(k, TASK_PARKED); - } + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + + reinit_completion(&kthread->parked); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -488,12 +484,10 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } return 0; diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index fdac27588d60..83958c814439 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags, bool warn_on_exist) +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data, + bool warn_on_exist) { struct klp_shadow *new_shadow; void *shadow_data; @@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, if (shadow_data) goto exists; - /* Allocate a new shadow variable for use inside the lock below */ + /* + * Allocate a new shadow variable. Fill it with zeroes by default. + * More complex setting can be done by @ctor function. But it is + * called only when the buffer is really used (under klp_shadow_lock). + */ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); if (!new_shadow) return NULL; - new_shadow->obj = obj; - new_shadow->id = id; - - /* Initialize the shadow variable if data provided */ - if (data) - memcpy(new_shadow->data, data, size); - /* Look for <obj, id> again under the lock */ spin_lock_irqsave(&klp_shadow_lock, flags); shadow_data = klp_shadow_get(obj, id); @@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, goto exists; } + new_shadow->obj = obj; + new_shadow->id = id; + + if (ctor) { + int err; + + err = ctor(obj, new_shadow->data, ctor_data); + if (err) { + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n", + obj, id, err); + return NULL; + } + } + /* No <obj, id> found, so attach the newly allocated one */ hash_add_rcu(klp_shadow_hash, &new_shadow->node, (unsigned long)new_shadow->obj); @@ -170,26 +185,32 @@ exists: * klp_shadow_alloc() - allocate and add a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) + * + * Allocates @size bytes for new shadow variable data using @gfp_flags. + * The data are zeroed by default. They are further initialized by @ctor + * function if it is not NULL. The new shadow variable is then added + * to the global hashtable. * - * Allocates @size bytes for new shadow variable data using @gfp_flags - * and copies @size bytes from @data into the new shadow variable's own - * data space. If @data is NULL, @size bytes are still allocated, but - * no copy is performed. The new shadow variable is then added to the - * global hashtable. + * If an existing <obj, id> shadow variable can be found, this routine will + * issue a WARN, exit early and return NULL. * - * If an existing <obj, id> shadow variable can be found, this routine - * will issue a WARN, exit early and return NULL. + * This function guarantees that the constructor function is called only when + * the variable did not exist before. The cost is that @ctor is called + * in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on duplicate or * failure. */ -void *klp_shadow_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, true); } EXPORT_SYMBOL_GPL(klp_shadow_alloc); @@ -197,37 +218,51 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc); * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) * * Returns a pointer to existing shadow data if an <obj, id> shadow * variable is already present. Otherwise, it creates a new shadow * variable like klp_shadow_alloc(). * - * This function guarantees that only one shadow variable exists with - * the given @id for the given @obj. It also guarantees that the shadow - * variable will be initialized by the given @data only when it did not - * exist before. + * This function guarantees that only one shadow variable exists with the given + * @id for the given @obj. It also guarantees that the constructor function + * will be called only when the variable did not exist before. The cost is + * that @ctor is called in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on failure. */ -void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, false); } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); +static void klp_shadow_free_struct(struct klp_shadow *shadow, + klp_shadow_dtor_t dtor) +{ + hash_del_rcu(&shadow->node); + if (dtor) + dtor(shadow->obj, shadow->data); + kfree_rcu(shadow, rcu_head); +} + /** * klp_shadow_free() - detach and free a <obj, id> shadow variable * @obj: pointer to parent object * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for this <obj, id> shadow variable * instance, callers should stop referencing it accordingly. */ -void klp_shadow_free(void *obj, unsigned long id) +void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -239,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id) (unsigned long)obj) { if (klp_shadow_match(shadow, obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); + klp_shadow_free_struct(shadow, dtor); break; } } @@ -252,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free); /** * klp_shadow_free_all() - detach and free all <*, id> shadow variables * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for all <*, id> shadow variable * instances, callers should stop referencing them accordingly. */ -void klp_shadow_free_all(unsigned long id) +void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -266,10 +302,8 @@ void klp_shadow_free_all(unsigned long id) /* Delete all <*, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { - if (klp_shadow_match(shadow, shadow->obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); - } + if (klp_shadow_match(shadow, shadow->obj, id)) + klp_shadow_free_struct(shadow, dtor); } spin_unlock_irqrestore(&klp_shadow_lock, flags); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 023386338269..edcac5de7ebc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock) printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } -static void lockdep_print_held_locks(struct task_struct *curr) +static void lockdep_print_held_locks(struct task_struct *p) { - int i, depth = curr->lockdep_depth; + int i, depth = READ_ONCE(p->lockdep_depth); - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + if (!depth) + printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); + else + printk("%d lock%s held by %s/%d:\n", depth, + depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + /* + * It's not reliable to print a task's held locks if it's not sleeping + * and it's not the current task. + */ + if (p->state == TASK_RUNNING && p != current) return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - for (i = 0; i < depth; i++) { printk(" #%d: ", i); - print_lock(curr->held_locks + i); + print_lock(p->held_locks + i); } } @@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { struct task_struct *g, *p; - int count = 10; - int unlock = 1; if (unlikely(!debug_locks)) { pr_warn("INFO: lockdep is turned off.\n"); @@ -4460,50 +4462,18 @@ void debug_show_all_locks(void) } pr_warn("\nShowing all locks held in the system:\n"); - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - pr_warn("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - pr_cont(" #%d", 10-count); - mdelay(200); - goto retry; - } - pr_cont(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - pr_cont(" locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) + rcu_read_lock(); + for_each_process_thread(g, p) { + if (!p->lockdep_depth) continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; + lockdep_print_held_locks(p); touch_nmi_watchdog(); - } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + } + rcu_read_unlock(); pr_warn("\n"); pr_warn("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); #endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ad69bbc9bd28..3dd980dfba2d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = { .show = l_show, }; -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { @@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = { .stop = lc_stop, .show = lc_show, }; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) @@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) return 0; } -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_LOCK_STAT struct lock_stat_data { @@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops); #ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); + proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops); #endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - + proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &proc_lock_stat_operations); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f046b7ce9dd6..5e10153b4d3c 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -23,13 +23,15 @@ struct mcs_spinlock { #ifndef arch_mcs_spin_lock_contended /* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. + * Using smp_cond_load_acquire() provides the acquire semantics + * required so that subsequent operations happen after the + * lock is acquired. Additionally, some architectures such as + * ARM64 would like to do spin-waiting instead of purely + * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ do { \ - while (!(smp_load_acquire(l))) \ - cpu_relax(); \ + smp_cond_load_acquire(l, VAL); \ } while (0) #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2048359f33d2..f44f658ae629 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock) static __always_inline bool __mutex_trylock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; + unsigned long zero = 0UL; - if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; return false; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index d880296245c5..bfaeb05123ff 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -12,11 +12,11 @@ * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. - * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -33,6 +33,11 @@ #include <asm/qspinlock.h> /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the * MCS lock. The paper below provides a good description for this kind @@ -77,6 +82,18 @@ #endif /* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. * @@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) -/* - * By using the whole 2nd least significant byte for the pending bit, we - * can allow better optimization of the lock acquisition for the pending - * bit holder. +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure * - * This internal structure is also used by the set_locked function which - * is not restricted to _Q_PENDING_BITS == 8. + * *,1,* -> *,0,* */ -struct __qspinlock { - union { - atomic_t val; -#ifdef __LITTLE_ENDIAN - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} -#if _Q_PENDING_BITS == 8 /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure @@ -159,9 +153,7 @@ struct __qspinlock { */ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); } /* @@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - struct __qspinlock *l = (void *)lock; - /* - * Use release semantics to make sure that the MCS node is properly - * initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. */ - return (u32)xchg_release(&l->tail, + return (u32)xchg_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ /** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure * @@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; /* - * Use release semantics to make sure that the MCS node is - * properly initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. */ - old = atomic_cmpxchg_release(&lock->val, val, new); + old = atomic_cmpxchg_relaxed(&lock->val, val, new); if (old == val) break; @@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) */ static __always_inline void set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); } @@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; - u32 new, old, tail; + u32 old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (pv_enabled()) - goto queue; + goto pv_queue; if (virt_spin_lock(lock)) return; /* - * wait for in-progress pending->locked hand-overs + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { - while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) - cpu_relax(); + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); } /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* * trylock || pending * * 0,0,0 -> 0,0,1 ; trylock * 0,0,1 -> 0,1,1 ; pending */ - for (;;) { + val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); + if (!(val & ~_Q_LOCKED_MASK)) { /* - * If we observe any contention; queue. + * We're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - new = _Q_LOCKED_VAL; - if (val == new) - new |= _Q_PENDING_VAL; + if (val & _Q_LOCKED_MASK) { + atomic_cond_read_acquire(&lock->val, + !(VAL & _Q_LOCKED_MASK)); + } /* - * Acquire semantic is required here as the function may - * return immediately if the lock was free. + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 */ - old = atomic_cmpxchg_acquire(&lock->val, val, new); - if (old == val) - break; - - val = old; - } - - /* - * we won the trylock - */ - if (new == _Q_LOCKED_VAL) + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); return; + } /* - * we're pending, wait for the owner to go away. - * - * *,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all clear_pending_set_locked() - * implementations imply full barriers. - */ - smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); - - /* - * take ownership and clear the pending bit. - * - * *,1,0 -> *,0,1 + * If pending was clear but there are waiters in the queue, then + * we need to undo our setting of pending before we queue ourselves. */ - clear_pending_set_locked(lock); - return; + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); /* * End of pending bit optimistic spinning and beginning of MCS * queuing. */ queue: + qstat_inc(qstat_lock_slowpath, true); +pv_queue: node = this_cpu_ptr(&mcs_nodes[0]); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -400,12 +397,18 @@ queue: goto release; /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* - * - * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -417,14 +420,8 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); - /* - * We must ensure that the stores to @node are observed before - * the write to prev->next. The address dependency from - * xchg_tail is not sufficient to ensure this because the read - * component of xchg_tail is unordered with respect to the - * initialisation of @node. - */ - smp_store_release(&prev->next, node); + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); @@ -453,8 +450,8 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_load_acquire() call. As the next PV queue head hasn't been - * designated yet, there is no way for the locked value to become + * atomic_cond_read_acquire() call. As the next PV queue head hasn't + * been designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. * @@ -464,44 +461,38 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended - * *,0,0 -> *,0,1 : lock, contended + * *,*,0 -> *,*,1 : lock, contended * - * If the queue head is the only one in the queue (lock value == tail), - * clear the tail code and grab the lock. Otherwise, we only need - * to grab the lock. + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. */ - for (;;) { - /* In the PV case we might already have _Q_LOCKED_VAL set */ - if ((val & _Q_TAIL_MASK) != tail) { - set_locked(lock); - break; - } - /* - * The smp_cond_load_acquire() call above has provided the - * necessary acquire semantics required for locking. At most - * two iterations of this loop may be ran. - */ - old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); - if (old == val) - goto release; /* No contention */ - val = old; - } + /* + * In the PV case we might already have _Q_LOCKED_VAL set. + * + * The atomic_cond_read_acquire() call above has provided the + * necessary acquire semantics required for locking. + */ + if (((val & _Q_TAIL_MASK) == tail) && + atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + + /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + set_locked(lock); /* * contended path; wait for next if not observed yet, release. */ - if (!next) { - while (!(next = READ_ONCE(node->next))) - cpu_relax(); - } + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6ee477765e6c..5a0cf5f9008c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -56,11 +56,6 @@ struct pv_node { }; /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Hybrid PV queued/unfair lock * * By replacing the regular queued_spin_trylock() with the function below, @@ -87,8 +82,6 @@ struct pv_node { #define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - /* * Stay in unfair lock mode as long as queued mode waiters are * present in the MCS wait queue but the pending bit isn't set. @@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) int val = atomic_read(&lock->val); if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) #if _Q_PENDING_BITS == 8 static __always_inline void set_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 1); -} - -static __always_inline void clear_pending(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 0); + WRITE_ONCE(lock->pending, 1); } /* @@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock) */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - return !READ_ONCE(l->locked) && - (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + return !READ_ONCE(lock->locked) && + (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ @@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - static __always_inline int trylock_clear_pending(struct qspinlock *lock) { int val = atomic_read(&lock->val); @@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; /* * If the vCPU is indeed halted, advance its state to match that of @@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * the hash table later on at unlock time, no atomic instruction is * needed. */ - WRITE_ONCE(l->locked, _Q_SLOW_VAL); + WRITE_ONCE(lock->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); } @@ -428,7 +404,6 @@ static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; int waitcnt = 0; int loop; @@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_pv_lock_slowpath, true); + qstat_inc(qstat_lock_slowpath, true); for (;; waitcnt++) { /* @@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { + if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) { /* * The lock was free and now we own the lock. * Change the lock value back to _Q_LOCKED_VAL * and unhash the table. */ - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); goto gotlock; } @@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); - pv_wait(&l->locked, _Q_SLOW_VAL); + pv_wait(&lock->locked, _Q_SLOW_VAL); /* * Because of lock stealing, the queue head vCPU may not be @@ -518,7 +493,6 @@ gotlock: __visible void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { - struct __qspinlock *l = (void *)lock; struct pv_node *node; if (unlikely(locked != _Q_SLOW_VAL)) { @@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_release(&lock->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, @@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; u8 locked; /* @@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 4a30ef63c607..6bd78c0740fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,13 +22,14 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node + * lock_pending - # of locking operations via pending code + * lock_slowpath - # of locking operations via MCS lock queue * * Writing to the "reset_counters" file will reset all the above counter * values. @@ -46,13 +47,14 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, - qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, + qstat_lock_pending, + qstat_lock_slowpath, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", + [qstat_lock_pending] = "lock_pending", + [qstat_lock_slowpath] = "lock_slowpath", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e795908f3607..3064c50e181e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -347,30 +347,31 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) } } +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; bool ret = true; + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + if (need_resched()) return false; rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) { - /* - * Don't spin if the rwsem is readers owned. - */ - ret = !rwsem_owner_is_reader(owner); - goto done; + if (owner) { + ret = is_rwsem_owner_spinnable(owner) && + owner_on_cpu(owner); } - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -done: rcu_read_unlock(); return ret; } @@ -382,11 +383,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) - goto out; + if (!is_rwsem_owner_spinnable(owner)) + return false; rcu_read_lock(); - while (sem->owner == owner) { + while (owner && (READ_ONCE(sem->owner) == owner)) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -399,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) * abort spinning when need_resched or owner is not running or * owner's cpu is preempted. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (need_resched() || !owner_on_cpu(owner)) { rcu_read_unlock(); return false; } @@ -408,12 +408,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } rcu_read_unlock(); -out: + /* * If there is a new owner or the owner is not set, we continue * spinning. */ - return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 30465a2f2b6c..bc1e507be9ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(up_read_non_owner); #endif - - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a17cba8d94bb..b9d0e72aa80f 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,20 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * The owner field of the rw_semaphore structure will be set to - * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear * the owner field when it unlocks. A reader, on the other hand, will * not touch the owner field when it unlocks. * - * In essence, the owner field now has the following 3 states: + * In essence, the owner field now has the following 4 states: * 1) 0 * - lock is free or the owner hasn't set the field yet * 2) RWSEM_READER_OWNED * - lock is currently or previously owned by readers (lock is free * or not set by owner yet) - * 3) Other non-zero value - * - a writer owns the lock + * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well + * - lock is owned by an anonymous writer, so spinning on the lock + * owner should be disabled. + * 4) Other non-zero value + * - a writer owns the lock and other writers can spin on the lock owner. */ -#define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0) +#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) @@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } -static inline bool rwsem_owner_is_writer(struct task_struct *owner) +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return owner && owner != RWSEM_READER_OWNED; + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); } -static inline bool rwsem_owner_is_reader(struct task_struct *owner) +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) { - return owner == RWSEM_READER_OWNED; + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; } #else static inline void rwsem_set_owner(struct rw_semaphore *sem) diff --git a/kernel/module.c b/kernel/module.c index a6e43a5806a1..c9bea7f2b43e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1472,7 +1472,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%pK\n", (void *)sattr->address); + return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? + (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -3516,6 +3517,11 @@ static noinline int do_init_module(struct module *mod) * walking this with preempt disabled. In all the failure paths, we * call synchronize_sched(), but we don't want to slow down the success * path, so use actual RCU here. + * Note that module_alloc() on most architectures creates W+X page + * mappings which won't be cleaned up until do_free_init() runs. Any + * code such as mark_rodata_ro() which depends on those mappings to + * be cleaned up needs to sync with the queued work - ie + * rcu_barrier_sched() */ call_rcu_sched(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5454cc639a8d..9c85c7822383 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -287,6 +287,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { pr_err("Some system devices failed to power down, aborting hibernation\n"); @@ -317,6 +319,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -563,6 +568,7 @@ int hibernation_platform_enter(void) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -575,6 +581,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: diff --git a/kernel/power/qos.c b/kernel/power/qos.c index fa39092b7aea..86d72ffb811b 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } -static inline int pm_qos_get_value(struct pm_qos_constraints *c); static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) { struct pm_qos_object *qos = (struct pm_qos_object *)s->private; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4c10be0f4843..87331565e505 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/swait.h> #include <linux/ftrace.h> #include <trace/events/power.h> #include <linux/compiler.h> @@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; static const struct platform_s2idle_ops *s2idle_ops; -static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; -static DEFINE_SPINLOCK(s2idle_lock); +static DEFINE_RAW_SPINLOCK(s2idle_lock); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { @@ -78,12 +79,12 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; s2idle_state = S2IDLE_STATE_ENTER; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); get_online_cpus(); cpuidle_resume(); @@ -91,17 +92,17 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - wait_event(s2idle_wait_head, - s2idle_state == S2IDLE_STATE_WAKE); + swait_event(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); put_online_cpus(); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); out: s2idle_state = S2IDLE_STATE_NONE; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } @@ -156,12 +157,12 @@ void s2idle_wake(void) { unsigned long flags; - spin_lock_irqsave(&s2idle_lock, flags); + raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - wake_up(&s2idle_wait_head); + swake_up(&s2idle_wait_head); } - spin_unlock_irqrestore(&s2idle_lock, flags); + raw_spin_unlock_irqrestore(&s2idle_lock, flags); } EXPORT_SYMBOL_GPL(s2idle_wake); @@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11b4282c2d20..1efcb5b0c3ed 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); + bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); @@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_RECLAIM | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_RECLAIM | __GFP_HIGH); + __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_RECLAIM | __GFP_HIGH : - __GFP_RECLAIM | __GFP_NOWARN | + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { diff --git a/kernel/power/user.c b/kernel/power/user.c index 75c959de4b29..abd225550271 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, res = PAGE_SIZE - pg_offp; } + if (!data_of(data->handle)) { + res = -EINVAL; + goto unlock; + } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index dfba59be190b..4210152e56f0 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, return ERR_PTR(-ENOMEM); } wl->ws.name = wl->name; + wl->ws.last_time = ktime_get(); wakeup_source_add(&wl->ws); rb_link_node(&wl->node, parent, node); rb_insert_color(&wl->node, &wakelocks_tree); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2f4af216bd6e..247808333ba4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1908,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2289,9 +2290,7 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; - static u64 seen_seq; unsigned long flags; - bool wake_klogd = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2335,11 +2334,6 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (seen_seq != log_next_seq) { - wake_klogd = true; - seen_seq = log_next_seq; - } - if (console_seq < log_first_seq) { len = sprintf(text, "** %u printk messages dropped **\n", (unsigned)(log_first_seq - console_seq)); @@ -2397,7 +2391,7 @@ skip: if (console_lock_spinning_disable_and_check()) { printk_safe_exit_irqrestore(flags); - goto out; + return; } printk_safe_exit_irqrestore(flags); @@ -2429,10 +2423,6 @@ skip: if (retry && console_trylock()) goto again; - -out: - if (wake_klogd) - wake_up_klogd(); } EXPORT_SYMBOL(console_unlock); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3e3c2004bb23..d7d091309054 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, { int add; size_t len; + va_list ap; again: len = atomic_read(&s->len); @@ -100,7 +101,9 @@ again: if (!len) smp_rmb(); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + va_copy(ap, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); + va_end(ap); if (!add) return 0; @@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void) * Make sure that we could access the main ring buffer. * Do not risk a double release when more CPUs are up. */ - if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (raw_spin_is_locked(&logbuf_lock)) { if (num_online_cpus() > 1) return; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..40cea6735c2d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +/* Returns first leaf rcu_node of the specified RCU flavor. */ +#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) + +/* Is this rcu_node a leaf? */ +#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * rcu_node tree with but one rcu_node structure, this loop is a no-op. */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + for ((rnp) = rcu_first_leaf_node(rsp); \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* @@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) } /* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending * callbacks from the source get to start over. It is best to diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 777e7a6a0292..e232846516b3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void) */ static void rcu_perf_wait_shutdown(void) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) return; while (!torture_must_stop()) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..e628fcfd1bde 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -593,7 +593,12 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - cleanup_srcu_struct(&srcu_ctld); + static DEFINE_TORTURE_RANDOM(rand); + + if (torture_random(&rand) & 0x800) + cleanup_srcu_struct(&srcu_ctld); + else + cleanup_srcu_struct_quiesced(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } @@ -1609,6 +1614,9 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int flags = 0; + unsigned long gpnum = 0; + unsigned long completed = 0; int i; rcutorture_record_test_transition(); @@ -1639,6 +1647,11 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); + pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", + cur_ops->name, gpnum, completed, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 76ac5f50b2c7..622792abe41a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void cleanup_srcu_struct(struct srcu_struct *sp) +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); - flush_work(&sp->srcu_work); + if (quiesced) + WARN_ON(work_pending(&sp->srcu_work)); + else + flush_work(&sp->srcu_work); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); WARN_ON(sp->srcu_cb_head); WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fb560fca9ef4..b4123d7a2cec 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp) return SRCU_INTERVAL; } -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { int cpu; if (WARN_ON(!srcu_get_delay(sp))) - return; /* Leakage unless caller handles error. */ + return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); + return; /* Just leak it! */ + if (quiesced) { + if (WARN_ON(delayed_work_pending(&sp->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&sp->work); + } for_each_possible_cpu(cpu) - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (quiesced) { + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->sda); sp->sda = NULL; } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; - - lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * Has this CPU encountered a cond_resched_rcu_qs() since the - * beginning of the grace period? For this to be the case, - * the CPU has to have noticed the current grace period. This - * might not be the case for nohz_full CPUs looping in the kernel. + * Has this CPU encountered a cond_resched() since the beginning + * of the grace period? For this to be the case, the CPU has to + * have noticed the current grace period. This might not be the + * case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); @@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 1; /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. + */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. */ return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root; /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. Note that rnp->lock must + * not be released. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1758,16 +1701,16 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; - int needmore; + unsigned long c = rnp->completed; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + need_future_gp_element(rnp, c) = false; + needmore = need_any_future_gp(rnp); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } @@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; @@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } } else { /* Deal with stray signal. */ - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - -/* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period * kthread if another grace period is required. Whether we wake @@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, @@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { @@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ @@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ @@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ @@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) init_swait_queue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->expedited_wq); - rnp = rsp->level[rcu_num_lvls - 1]; + rnp = rcu_first_leaf_node(rsp); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4103,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..78e051dffc5b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -58,6 +58,14 @@ struct rcu_dynticks { #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; +/* Communicate arguments to a workqueue handler. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + /* RCU's kthread states for tracing. */ #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 @@ -150,15 +158,32 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; wait_queue_head_t exp_wq[4]; + struct rcu_exp_work rew; + bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -224,10 +249,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - atomic_long_t exp_workdone0; /* # done by workqueue. */ - atomic_long_t exp_workdone1; /* # done by others #1. */ - atomic_long_t exp_workdone2; /* # done by others #2. */ - atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ /* 6) Callback offloading. */ @@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); -bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f72eefab8543..d40708e8c5d6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/lockdep.h> + /* * Record the start of an expedited grace period. */ @@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the rcu_state's exp_mutex. + * Caller must hold the specificed rcu_node structure's ->lock */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + +/* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU * grace period. This event is reported either to the rcu_node structure on @@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. + * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. + * specified leaf rcu_node structure. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); return true; } return false; @@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + if (sync_exp_work_done(rsp, s)) return true; /* Work not done, either wait here or go up. */ @@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); + sync_exp_work_done(rsp, s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ @@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } mutex_lock(&rsp->exp_mutex); fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + if (sync_exp_work_done(rsp, s)) { mutex_unlock(&rsp->exp_mutex); return true; } @@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu) } /* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. + * Select the CPUs within the specified rcu_node that the upcoming + * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; + smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_node *rnp; - - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); + struct rcu_state *rsp = rewp->rew_rsp; - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); - int snap; + func = rewp->rew_func; + raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (raw_smp_processor_id() == cpu || - !(rnp->qsmaskinitnext & mask)) { + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; + + if (raw_smp_processor_id() == cpu || + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; - } else { - snap = rcu_dynticks_snap(rdtp); - if (rcu_dynticks_in_eqs(snap)) - mask_ofl_test |= mask; - else - rdp->exp_dynticks_snap = snap; - } + else + rdp->exp_dynticks_snap = snap; } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (!(mask_ofl_ipi & mask)) - continue; + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + if (!(mask_ofl_ipi & mask)) + continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { - mask_ofl_test |= mask; - continue; - } - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with CPU hotplug operation. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rnp->qsmaskinitnext & mask) && - (rnp->expmask & mask)) { - /* Online, so delay for a bit and try again. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); - schedule_timeout_uninterruptible(1); - goto retry_ipi; - } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { + mask_ofl_test |= mask; + continue; + } + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with CPU hotplug operation. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if ((rnp->qsmaskinitnext & mask) && + (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + schedule_timeout_uninterruptible(1); + goto retry_ipi; + } + /* CPU really is offline, so we can ignore it. */ + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_node *rnp; + + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); + sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + + /* Schedule work for each leaf rcu_node structure. */ + rcu_for_each_leaf_node(rsp, rnp) { + rnp->exp_need_flush = false; + if (!READ_ONCE(rnp->expmask)) + continue; /* Avoid early boot non-existent wq. */ + rnp->rew.rew_func = func; + rnp->rew.rew_rsp = rsp; + if (!READ_ONCE(rcu_par_gp_wq) || + rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + /* No workqueues yet. */ + sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); + continue; } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + rnp->exp_need_flush = true; } + + /* Wait for workqueue jobs (if any) to complete. */ + rcu_for_each_leaf_node(rsp, rnp) + if (rnp->exp_need_flush) + flush_work(&rnp->rew.rew_work); } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, @@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } -/* Let the workqueue handler know what it is supposed to do. */ -struct rcu_exp_work { - smp_call_func_t rew_func; - struct rcu_state *rew_rsp; - unsigned long rew_s; - struct work_struct rew_work; -}; - /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. @@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + sync_exp_work_done(rsp, s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..7fd12039e512 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); /* * Decide where to queue the newly blocked task. In theory, @@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) } /* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = READ_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. */ @@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void) t->rcu_read_unlock_special.b.need_qs = true; } -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) @@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self, for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); } /* Unconditionally decrement: no need to wake ourselves up. */ @@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. - */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; @@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); @@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) void __init rcu_init_nohz(void) { int cpu; - bool need_rcu_nocb_mask = true; + bool need_rcu_nocb_mask = false; struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) @@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void) #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } @@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; @@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) } /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. + * Bind the RCU grace-period kthreads to the housekeeping CPU. */ static void rcu_bind_gp_kthread(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 68fa19a5e7bd..4c230a60ece4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode); #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ -#ifdef CONFIG_PREEMPT_RCU - -/* - * Preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* critical section after entry code. */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* - * Preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). * * This is a very specialized primitive, intended only for a few uses in diff --git a/kernel/resource.c b/kernel/resource.c index 2af6c03858b9..b589dda910b3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = m->private; + struct resource *p = PDE_DATA(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = m->private; + struct resource *root = PDE_DATA(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; @@ -135,44 +135,11 @@ static const struct seq_operations resource_op = { .show = r_show, }; -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init ioresources_init(void) { - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); + proc_create_seq_data("ioports", 0, NULL, &resource_op, + &ioport_resource); + proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource); return 0; } __initcall(ioresources_init); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 6be6c575b6cd..2d4ff5353ded 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -2,6 +2,7 @@ /* * Auto-group scheduling implementation: */ +#include <linux/nospec.h> #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; @@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; unsigned long shares; - int err; + int err, idx; if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; @@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); - shares = scale_load(sched_prio_to_weight[nice + 20]); + + idx = array_index_nospec(nice + 20, 40); + shares = scale_load(sched_prio_to_weight[idx]); down_write(&ag->lock); err = sched_group_set_shares(ag->tg, shares); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e10aaeebfcc..e9866f86f304 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,9 @@ */ #include "sched.h" +#include <linux/kthread.h> +#include <linux/nospec.h> + #include <asm/switch_to.h> #include <asm/tlb.h> @@ -878,6 +881,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). + */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -935,16 +965,8 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return rq; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return rq; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!is_cpu_allowed(p, dest_cpu)) return rq; update_rq_clock(rq); @@ -1473,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, &p->cpus_allowed) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; + goto out; } @@ -1539,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_online(cpu))) + if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; @@ -2174,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_scan_seq = 0; - } - - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - p->numa_group = NULL; -#endif /* CONFIG_NUMA_BALANCING */ + init_numa_balancing(clone_flags, p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -2718,20 +2718,28 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { + switch (prev_state) { + case TASK_DEAD: + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + /* Task is done with its stack. */ + put_task_stack(prev); - put_task_struct(prev); + put_task_struct(prev); + break; + + case TASK_PARKED: + kthread_park_complete(prev); + break; + } } tick_nohz_task_switch(); @@ -3498,23 +3506,8 @@ static void __sched notrace __schedule(bool preempt) void __noreturn do_task_dead(void) { - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - raw_spin_lock_irq(¤t->pi_lock); - raw_spin_unlock_irq(¤t->pi_lock); - /* Causes final put_task_struct in finish_task_switch(): */ - __set_current_state(TASK_DEAD); + set_special_state(TASK_DEAD); /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -4037,6 +4030,23 @@ int idle_cpu(int cpu) } /** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * @@ -5012,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads. * @@ -6928,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; + int idx; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; - weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); } #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d2c6083304b4..3cde46483f0a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -51,7 +51,7 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; unsigned int iowait_boost_max; - u64 last_update; + u64 last_update; /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; @@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * schedule the kthread. */ if (sg_policy->policy->fast_switch_enabled && - !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (sg_policy->work_in_progress) - return false; - - if (unlikely(sg_policy->need_freq_update)) { - sg_policy->need_freq_update = false; - /* - * This happens when limits change, so forget the previous - * next_freq value and force an update. - */ - sg_policy->next_freq = UINT_MAX; + if (unlikely(sg_policy->need_freq_update)) return true; - } delta_ns = time - sg_policy->last_freq_update_time; return delta_ns >= sg_policy->freq_update_delay_ns; } -static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - if (sg_policy->next_freq == next_freq) - return; + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; + return true; +} - policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); - } else { +static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (!next_freq) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); +} + +static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; + + sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -183,61 +191,137 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util; - if (rq->rt.rt_nr_running) { - util = sg_cpu->max; - } else { - util = sg_cpu->util_dl; - if (rq->cfs.h_nr_running) - util += sg_cpu->util_cfs; - } + if (rq->rt.rt_nr_running) + return sg_cpu->max; /* + * Utilization required by DEADLINE must always be granted while, for + * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to + * gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(util, sg_cpu->max); + return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) +/** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @set_iowait_boost: true if an IO boost has been requested + * + * The IO wait boost of a task is disabled after a tick since the last update + * of a CPU. If a new IO wait boost is requested after more then a tick, then + * we enable the boost starting from the minimum frequency, which improves + * energy efficiency by ignoring sporadic wakeups from IO. + */ +static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, + bool set_iowait_boost) { - if (flags & SCHED_CPUFREQ_IOWAIT) { - if (sg_cpu->iowait_boost_pending) - return; + s64 delta_ns = time - sg_cpu->last_update; - sg_cpu->iowait_boost_pending = true; + /* Reset boost only if a tick has elapsed since last request */ + if (delta_ns <= TICK_NSEC) + return false; - if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; - } else { - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; - } - } else if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; + sg_cpu->iowait_boost = set_iowait_boost + ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost_pending = set_iowait_boost; - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } + return true; +} + +/** + * sugov_iowait_boost() - Updates the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait + * + * Each time a task wakes up after an IO operation, the CPU utilization can be + * boosted to a certain utilization which doubles at each "frequent and + * successive" wakeup from IO, ranging from the utilization of the minimum + * OPP to the utilization of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, + * otherwise we restart from the utilization of the minimum OPP. + */ +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; + + /* Reset boost if the CPU appears to have been idle enough */ + if (sg_cpu->iowait_boost && + sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) + return; + + /* Boost only tasks waking up after IO */ + if (!set_iowait_boost) + return; + + /* Ensure boost doubles only one time at each request */ + if (sg_cpu->iowait_boost_pending) + return; + sg_cpu->iowait_boost_pending = true; + + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + return; } + + /* First wakeup after IO: start with minimum boost */ + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } -static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, - unsigned long *max) +/** + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller + * @util: the utilization to (eventually) boost + * @max: the maximum value the utilization can be boosted to + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. + * The IO boost value is increased each time a task wakes up from IO, in + * sugov_iowait_apply(), and it's instead decreased by this function, + * each time an increase has not been requested (!iowait_boost_pending). + * + * A CPU which also appears to have been idle for at least one tick has also + * its IO boost utilization reset. + * + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long *util, unsigned long *max) { unsigned int boost_util, boost_max; + /* No boost currently required */ if (!sg_cpu->iowait_boost) return; + /* Reset boost if the CPU appears to have been idle enough */ + if (sugov_iowait_reset(sg_cpu, time, false)) + return; + + /* + * An IO waiting task has just woken up: + * allow to further double the boost value + */ if (sg_cpu->iowait_boost_pending) { sg_cpu->iowait_boost_pending = false; } else { + /* + * Otherwise: reduce the boost value and disable it when we + * reach the minimum. + */ sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { sg_cpu->iowait_boost = 0; @@ -245,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, } } + /* + * Apply the current boost value: a CPU is boosted only if its current + * utilization is smaller then the current IO boost level. + */ boost_util = sg_cpu->iowait_boost; boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; @@ -286,7 +373,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); @@ -299,7 +386,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); + sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -312,7 +399,18 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sg_policy->cached_raw_freq = 0; } - sugov_update_commit(sg_policy, time, next_f); + /* + * This code runs under rq->lock for the target CPU, so it won't run + * concurrently on two different CPUs for the same target and it is not + * necessary to acquire the lock in the fast switch case. + */ + if (sg_policy->policy->fast_switch_enabled) { + sugov_fast_switch(sg_policy, time, next_f); + } else { + raw_spin_lock(&sg_policy->update_lock); + sugov_deferred_update(sg_policy, time, next_f); + raw_spin_unlock(&sg_policy->update_lock); + } } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) @@ -325,28 +423,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - s64 delta_ns; sugov_get_util(j_sg_cpu); - - /* - * If the CFS CPU utilization was last updated before the - * previous frequency update and the time elapsed between the - * last update of the CPU utilization and the last frequency - * update is long enough, reset iowait_boost and util_cfs, as - * they are now probably stale. However, still consider the - * CPU contribution if it has some DEADLINE utilization - * (util_dl). - */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) { - j_sg_cpu->iowait_boost = 0; - j_sg_cpu->iowait_boost_pending = false; - } - j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); - sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); + sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -365,14 +447,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) raw_spin_lock(&sg_policy->update_lock); - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); - sugov_update_commit(sg_policy, time, next_f); + + if (sg_policy->policy->fast_switch_enabled) + sugov_fast_switch(sg_policy, time, next_f); + else + sugov_deferred_update(sg_policy, time, next_f); } raw_spin_unlock(&sg_policy->update_lock); @@ -381,13 +467,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + unsigned int freq; + unsigned long flags; + + /* + * Hold sg_policy->update_lock shortly to handle the case where: + * incase sg_policy->next_freq is read here, and then updated by + * sugov_deferred_update() just before work_in_progress is set to false + * here, we may miss queueing the new update. + * + * Note: If a work was queued after the update_lock is released, + * sugov_work() will just be called again by kthread_work code; and the + * request will be proceed before the sugov thread sleeps. + */ + raw_spin_lock_irqsave(&sg_policy->update_lock, flags); + freq = sg_policy->next_freq; + sg_policy->work_in_progress = false; + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags); mutex_lock(&sg_policy->work_lock); - __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, - CPUFREQ_RELATION_L); + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); mutex_unlock(&sg_policy->work_lock); - - sg_policy->work_in_progress = false; } static void sugov_irq_work(struct irq_work *irq_work) @@ -396,19 +496,6 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - /* - * For RT tasks, the schedutil governor shoots the frequency to maximum. - * Special care must be taken to ensure that this kthread doesn't result - * in the same behavior. - * - * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() function and before that - * the schedutil governor rejects all other frequency scaling requests. - * - * There is a very rare case though, where the RT thread yields right - * after the work_in_progress flag is cleared. The effects of that are - * neglected for now. - */ kthread_queue_work(&sg_policy->worker, &sg_policy->work); } @@ -523,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - - /* Kthread is bound to all CPUs by default */ - if (!policy->dvfs_possible_from_any_cpu) - kthread_bind_mask(thread, policy->related_cpus); - + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -670,7 +753,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; + sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e7b3008b85bb..fbfc3f1d368a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. * So, overflow is not an issue here. */ -u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; @@ -1259,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &rf); + sched_clock_tick(); + update_rq_clock(rq); + if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1278,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) if (dl_se->dl_non_contending == 0) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); - sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: @@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu) #endif #ifdef CONFIG_SCHED_DEBUG -extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); - void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 15b10e210a6b..e593b4118578 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -823,35 +823,9 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int sched_debug_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - - return 0; -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - int ret = 0; - - ret = seq_open(filp, &sched_debug_sops); - - return ret; -} - -static const struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sched_debug_release, -}; - static int __init init_sched_debug_procfs(void) { - struct proc_dir_entry *pe; - - pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); - if (!pe) + if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) return -ENOMEM; return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54dc31e7ab9b..e497c05aab7f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_group = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = -1; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); @@ -1854,7 +1895,6 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; - unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1862,18 +1902,7 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - numa_migrate_retry = jiffies + interval; - - /* - * Check that the new retry threshold is after the current one. If - * the retry is in the future, it implies that wake_affine has - * temporarily asked NUMA balancing to backoff from placement. - */ - if (numa_migrate_retry > p->numa_migrate_retry) - return; - - /* Safe to try placing the task on the preferred node */ - p->numa_migrate_retry = numa_migrate_retry; + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -5357,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag * passed. @@ -5397,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); - util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5870,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) * a cpufreq perspective, it's better to have higher utilisation * on one CPU. */ - if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) - return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; @@ -5922,48 +5958,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } -#ifdef CONFIG_NUMA_BALANCING -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ - unsigned long interval; - - if (!static_branch_likely(&sched_numa_balancing)) - return; - - /* If balancing has no preference then continue gathering data */ - if (p->numa_preferred_nid == -1) - return; - - /* - * If the wakeup is not affecting locality then it is neutral from - * the perspective of NUMA balacing so continue gathering data. - */ - if (cpu_to_node(prev_cpu) == cpu_to_node(target)) - return; - - /* - * Temporarily prevent NUMA balancing trying to place waker/wakee after - * wakee has been moved by wake_affine. This will potentially allow - * related tasks to converge and update their data placement. The - * 4 * numa_scan_period is to allow the two-pass filter to migrate - * hot data to the wakers node. - */ - interval = max(sysctl_numa_balancing_scan_delay, - p->numa_scan_period << 2); - p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); - - interval = max(sysctl_numa_balancing_scan_delay, - current->numa_scan_period << 2); - current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); -} -#else -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ -} -#endif - static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5979,7 +5973,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; - update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; @@ -6157,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { - if (idle_cpu(i)) { + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6199,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) return prev_cpu; + /* + * We need task's util for capacity_spare_wake, sync it up to prev_cpu's + * last_update_time. + */ + if (!(sd_flag & SD_BALANCE_FORK)) + sync_entity_load_avg(&p->se); + while (sd) { struct sched_group *group; struct sched_domain *tmp; @@ -6279,7 +6279,7 @@ void __update_idle_core(struct rq *rq) if (cpu == core) continue; - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) goto unlock; } @@ -6311,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int for_each_cpu(cpu, cpu_smt_mask(core)) { cpumask_clear_cpu(cpu, cpus); - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) idle = false; } @@ -6340,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) return cpu; } @@ -6403,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) break; } @@ -6423,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (idle_cpu(target)) + if (available_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) return prev; /* Check a recently used CPU as a potential idle candidate: */ @@ -6437,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - idle_cpu(recent_used_cpu) && + available_idle_cpu(recent_used_cpu) && cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6613,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { - struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; @@ -6636,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - affine_sd = tmp; + if (cpu != prev_cpu) + new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); + + sd = NULL; /* Prefer wake_affine over balance flags */ break; } @@ -6646,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } - if (affine_sd) { - sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu == prev_cpu) - goto pick_cpu; - - new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); - } - - if (sd && !(sd_flag & SD_BALANCE_FORK)) { - /* - * We're going to need the task's util for capacity_spare_wake - * in find_idlest_group. Sync it up to prev_cpu's - * last_update_time. - */ - sync_entity_load_avg(&p->se); - } + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + /* Fast path */ - if (!sd) { -pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - if (want_affine) - current->recent_used_cpu = cpu; - } - } else { - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + if (want_affine) + current->recent_used_cpu = cpu; } rcu_read_unlock(); @@ -9847,6 +9833,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -9855,7 +9842,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; -out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aef6b4e885a..ef3c4e6f5345 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write, } #ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); - void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15750c222ca2..6601baf2361c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -983,7 +983,7 @@ static inline void rq_clock_skip_update(struct rq *rq) } /* - * See rt task throttoling, which is the only time a skip + * See rt task throttling, which is the only time a skip * request is cancelled. */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) @@ -1069,6 +1069,12 @@ enum numa_faults_stats { extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -2025,8 +2031,9 @@ extern bool sched_debug_enabled; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index ab112cbfd7c8..750fb3c67eed 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -120,22 +120,9 @@ static const struct seq_operations schedstat_sops = { .show = show_schedstat, }; -static int schedstat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &schedstat_sops); -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_schedstat_init(void) { - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - + proc_create_seq("schedstat", 0, NULL, &schedstat_sops); return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64cc564f5255..61a1125c1ae4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5386749cdd21..fd023ac24e10 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -19,6 +19,8 @@ #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> +#include <linux/nospec.h> +#include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> @@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) return true; } +void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } + static inline void seccomp_assign_mode(struct task_struct *task, - unsigned long seccomp_mode) + unsigned long seccomp_mode, + unsigned long flags) { assert_spin_locked(&task->sighand->siglock); @@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task, * filter) is set. */ smp_mb__before_atomic(); + /* Assume default seccomp processes want spec flaw mitigation. */ + if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) + arch_seccomp_spec_mitigate(task); set_tsk_thread_flag(task, TIF_SECCOMP); } @@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void) * without dropping the locks. * */ -static inline void seccomp_sync_threads(void) +static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; @@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void) * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) - seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, + flags); } } @@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags, /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) - seccomp_sync_threads(); + seccomp_sync_threads(flags); return 0; } @@ -815,7 +824,7 @@ static long seccomp_set_mode_strict(void) #ifdef TIF_NOTSC disable_TSC(); #endif - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, 0); ret = 0; out: @@ -873,7 +882,7 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Do not free the successfully attached filter. */ prepared = NULL; - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) diff --git a/kernel/signal.c b/kernel/signal.c index d4ccea599692..0f865d67415d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct siginfo info; @@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct * return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); -#endif -#ifdef SEGV_BNDERR int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct siginfo info; @@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_upper = upper; return force_sig_info(info.si_signo, &info, current); } -#endif #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) @@ -1961,14 +1957,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) return; } + set_special_state(TASK_TRACED); + /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. + * + * TRACER TRACEE + * + * ptrace_attach() + * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) + * do_wait() + * set_current_state() smp_wmb(); + * ptrace_do_wait() + * wait_task_stopped() + * task_stopped_code() + * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ - set_current_state(TASK_TRACED); + smp_wmb(); current->last_siginfo = info; current->exit_code = exit_code; @@ -2176,7 +2185,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; - __set_current_state(TASK_STOPPED); + set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); /* @@ -2824,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; - if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) { layout = filter[sig].layout; + /* Handle the exceptions */ + if ((sig == SIGBUS) && + (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) + layout = SIL_FAULT_MCEERR; + else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) + layout = SIL_FAULT_BNDERR; +#ifdef SEGV_PKUERR + else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) + layout = SIL_FAULT_PKUERR; +#endif + } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { @@ -2835,104 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; - /* Tests to support buggy kernel ABIs */ -#ifdef TRAP_FIXME - if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) - layout = SIL_FAULT; -#endif -#ifdef FPE_FIXME - if ((sig == SIGFPE) && (si_code == FPE_FIXME)) - layout = SIL_FAULT; -#endif } return layout; } int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) + if (copy_to_user(to, from , sizeof(struct siginfo))) return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)) - ? -EFAULT : 0; - /* - * If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * Please remember to update the signalfd_copyinfo() function - * inside fs/signalfd.c too, in case siginfo_t changes. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - /* Unreached SI_TIMER is negative */ - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user(from->si_addr, &to->si_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(from->si_trapno, &to->si_trapno); -#endif -#ifdef __ia64__ - err |= __put_user(from->si_imm, &to->si_imm); - err |= __put_user(from->si_flags, &to->si_flags); - err |= __put_user(from->si_isr, &to->si_isr); -#endif - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ -#ifdef BUS_MCEERR_AR - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef BUS_MCEERR_AO - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef SEGV_BNDERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { - err |= __put_user(from->si_lower, &to->si_lower); - err |= __put_user(from->si_upper, &to->si_upper); - } -#endif -#ifdef SEGV_PKUERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) - err |= __put_user(from->si_pkey, &to->si_pkey); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case SIL_SYS: - err |= __put_user(from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; + return 0; } #ifdef CONFIG_COMPAT @@ -2971,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to, #ifdef __ARCH_SI_TRAPNO new.si_trapno = from->si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) - new.si_addr_lsb = from->si_addr_lsb; -#endif -#ifdef BUS_MCEERR_AO - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) - new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_BNDERR)) { - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); - } + new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_PKUERR)) - new.si_pkey = from->si_pkey; + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + break; + case SIL_FAULT_PKUERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif - + new.si_pkey = from->si_pkey; break; case SIL_CHLD: new.si_pid = from->si_pid; @@ -3057,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to, #ifdef __ARCH_SI_TRAPNO to->si_trapno = from.si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) - to->si_addr_lsb = from.si_addr_lsb; -#endif -#ifdef BUS_MCEER_AO - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) - to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { - to->si_lower = compat_ptr(from.si_lower); - to->si_upper = compat_ptr(from.si_upper); - } + to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) - to->si_pkey = from.si_pkey; + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + break; + case SIL_FAULT_PKUERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif + to->si_pkey = from.si_pkey; break; case SIL_CHLD: to->si_pid = from.si_pid; diff --git a/kernel/softirq.c b/kernel/softirq.c index 177de3640c78..de2f57fddc04 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -49,8 +49,8 @@ */ #ifndef __ARCH_IRQ_STAT -irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; -EXPORT_SYMBOL(irq_stat); +DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); #endif static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; @@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt) } /* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), + * Special-case - softirqs can safely be enabled by __do_softirq(), * without processing still-pending softirqs: */ void _local_bh_enable(void) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7591261652d..f89014a2c238 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/nmi.h> +#include <linux/sched/wake_q.h> /* * Structure to determine completion condition and record errors. May @@ -36,7 +37,7 @@ struct cpu_stop_done { struct cpu_stopper { struct task_struct *thread; - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ @@ -65,26 +66,30 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) + struct cpu_stop_work *work, + struct wake_q_head *wakeq) { list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); + wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work); + __cpu_stop_queue_work(stopper, work, &wakeq); else if (work->done) cpu_stop_signal_done(work->done); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); + + wake_up_q(&wakeq); return enabled; } @@ -229,10 +234,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + DEFINE_WAKE_Q(wakeq); int err; retry: - spin_lock_irq(&stopper1->lock); - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&stopper1->lock); + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) @@ -252,17 +258,20 @@ retry: goto unlock; err = 0; - __cpu_stop_queue_work(stopper1, work1); - __cpu_stop_queue_work(stopper2, work2); + __cpu_stop_queue_work(stopper1, work1, &wakeq); + __cpu_stop_queue_work(stopper2, work2, &wakeq); unlock: - spin_unlock(&stopper2->lock); - spin_unlock_irq(&stopper1->lock); + raw_spin_unlock(&stopper2->lock); + raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { while (stop_cpus_in_progress) cpu_relax(); goto retry; } + + wake_up_q(&wakeq); + return err; } /** @@ -448,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -461,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -541,7 +550,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } diff --git a/kernel/sys.c b/kernel/sys.c index ad692183dfe9..d1b2b8d934bb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -61,6 +61,8 @@ #include <linux/uidgid.h> #include <linux/cred.h> +#include <linux/nospec.h> + #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> @@ -69,6 +71,9 @@ #include <asm/io.h> #include <asm/unistd.h> +/* Hardening for Spectre-v1 */ +#include <linux/nospec.h> + #include "uid16.h" #ifndef SET_UNALIGN_CTL @@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -2242,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data) return 1; } +int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) +{ + return -EINVAL; +} + +int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, + unsigned long ctrl) +{ + return -EINVAL; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2450,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_GET_SPECULATION_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_get(me, arg2); + break; + case PR_SET_SPECULATION_CTRL: + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_set(me, arg2, arg3); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 9791364925dc..183169c2a75b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -43,7 +43,9 @@ COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents); /* fs/xattr.c */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8c0dab4fd65..07148b497451 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = { {} }; -static const struct bin_table bin_net_irda_table[] = { - { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, - { CTL_STR, NET_IRDA_DEVNAME, "devname" }, - { CTL_INT, NET_IRDA_DEBUG, "debug" }, - { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_CORE, "core", bin_net_core_table }, /* NET_ETHER not used */ @@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, /* NET_DCCP "dccp" no longer used */ - { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + /* NET_IRDA "irda" no longer used */ { CTL_INT, 2089, "nf_conntrack_max" }, {} }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0e974cface0b..f89a78e2792b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -119,8 +119,15 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); +static void inline clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} + +static void inline clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} /* * Interval: 0.5sec Threshold: 0.0625s @@ -128,23 +135,24 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating); #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_watchdog_work(struct work_struct *work) -{ - /* - * If kthread_run fails the next watchdog scan over the - * watchdog_list will find the unstable clock again. - */ - kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); -} - static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + /* + * If the clocksource is registered clocksource_watchdog_work() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + if (cs->mark_unstable) cs->mark_unstable(cs); + /* kick clocksource_watchdog_work() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -153,10 +161,8 @@ static void __clocksource_unstable(struct clocksource *cs) * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable * - * This function is called instead of clocksource_change_rating from - * cpu hotplug code to avoid a deadlock between the clocksource mutex - * and the cpu hotplug mutex. It defers the update of the clocksource - * to the watchdog thread. + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a work. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -164,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_lock_irqsave(&watchdog_lock, flags); if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { - if (list_empty(&cs->wd_list)) + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) list_add(&cs->wd_list, &watchdog_list); __clocksource_unstable(cs); } @@ -319,9 +325,8 @@ static void clocksource_resume_watchdog(void) static void clocksource_enqueue_watchdog(struct clocksource *cs) { - unsigned long flags; + INIT_LIST_HEAD(&cs->wd_list); - spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); @@ -331,7 +336,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } - spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_select_watchdog(bool fallback) @@ -373,9 +377,6 @@ static void clocksource_select_watchdog(bool fallback) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); if (cs != watchdog) { if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a watched clocksource. */ @@ -384,21 +385,21 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) clocksource_stop_watchdog(); } } - spin_unlock_irqrestore(&watchdog_lock, flags); } -static int __clocksource_watchdog_kthread(void) +static void __clocksource_change_rating(struct clocksource *cs, int rating); + +static int __clocksource_watchdog_work(void) { struct clocksource *cs, *tmp; unsigned long flags; - LIST_HEAD(unstable); int select = 0; spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - list_add(&cs->wd_list, &unstable); + __clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -410,21 +411,15 @@ static int __clocksource_watchdog_kthread(void) clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); - /* Needs to be done outside of watchdog lock */ - list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { - list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); - } return select; } -static int clocksource_watchdog_kthread(void *data) +static void clocksource_watchdog_work(struct work_struct *work) { mutex_lock(&clocksource_mutex); - if (__clocksource_watchdog_kthread()) + if (__clocksource_watchdog_work()) clocksource_select(); mutex_unlock(&clocksource_mutex); - return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -443,10 +438,13 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int __clocksource_watchdog_kthread(void) { return 0; } +static inline int __clocksource_watchdog_work(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } +static inline void clocksource_watchdog_lock(unsigned long *flags) { } +static inline void clocksource_watchdog_unlock(unsigned long *flags) { } + #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** @@ -674,7 +672,7 @@ static int __init clocksource_done_booting(void) /* * Run the watchdog first to eliminate unstable clock sources */ - __clocksource_watchdog_kthread(); + __clocksource_watchdog_work(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -779,14 +777,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); */ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { + unsigned long flags; /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); + + clocksource_watchdog_lock(&flags); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -808,8 +811,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) */ void clocksource_change_rating(struct clocksource *cs, int rating) { + unsigned long flags; + mutex_lock(&clocksource_mutex); + clocksource_watchdog_lock(&flags); __clocksource_change_rating(cs, rating); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -821,6 +829,8 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { + unsigned long flags; + if (clocksource_is_watchdog(cs)) { /* Select and try to install a replacement watchdog. */ clocksource_select_watchdog(true); @@ -834,8 +844,12 @@ static int clocksource_unbind(struct clocksource *cs) if (curr_clocksource == cs) return -EBUSY; } + + clocksource_watchdog_lock(&flags); clocksource_dequeue_watchdog(cs); list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + return 0; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index eda1210ce50f..055a4a728c00 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -91,6 +91,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -106,6 +111,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -119,7 +129,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -571,12 +581,14 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_tai); + offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; @@ -1747,8 +1759,10 @@ out: return ret; } -SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; @@ -1763,7 +1777,9 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } -#ifdef CONFIG_COMPAT +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2541bd89f20e..5a6251ac6f7a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 *newval, u64 *oldval) { u64 now; + int ret; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + ret = cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { + if (oldval && ret != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index e0dbae98db9d..26aa9569e24a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -59,7 +59,7 @@ SYS_NI(alarm); */ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { struct timespec64 new_tp; @@ -83,8 +83,6 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; - case CLOCK_MONOTONIC_ACTIVE: - ktime_get_active_ts64(tp); default: return -EINVAL; } @@ -92,7 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -106,7 +104,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return 0; } -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -126,8 +124,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 t; @@ -160,7 +158,9 @@ COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); +#endif +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b6899b5060bd..e08ce3f27447 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -252,16 +252,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai64(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_monotonic_active(clockid_t which_clock, - struct timespec64 *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_active_ts64(tp); + timekeeping_clocktai64(tp); return 0; } @@ -1041,7 +1040,7 @@ void exit_itimers(struct signal_struct *sig) } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; @@ -1056,7 +1055,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; @@ -1097,7 +1096,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; @@ -1114,7 +1113,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, return error; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) @@ -1149,6 +1148,10 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } +#endif + +#ifdef CONFIG_COMPAT + COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, struct compat_timex __user *, utp) { @@ -1173,6 +1176,10 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, return err; } +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME + COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { @@ -1204,8 +1211,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1228,7 +1235,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME + COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) @@ -1253,6 +1261,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t); } + #endif static const struct k_clock clock_realtime = { @@ -1317,9 +1326,19 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_monotonic_active = { +static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_active, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { @@ -1330,11 +1349,10 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_monotonic, + [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, - [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b398c2ea69b2..aa2094d5dd27 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + /* + * Required for !SMP because for_each_cpu() reports + * unconditionally CPU0 as set on UP kernels. + */ + if (!IS_ENABLED(CONFIG_SMP) && + cpumask_empty(tick_broadcast_oneshot_mask)) + break; + td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 099572ca4a8f..b7005dd21ec1 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,7 +277,8 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - !cpumask_equal(curdev->cpumask, newdev->cpumask); + (!cpumask_equal(curdev->cpumask, newdev->cpumask) && + !tick_check_percpu(curdev, newdev, smp_processor_id())); } /* @@ -419,19 +420,6 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } -static void tick_forward_next_period(void) -{ - ktime_t delta, now = ktime_get(); - u64 n; - - delta = ktime_sub(now, tick_next_period); - n = ktime_divns(delta, tick_period); - tick_next_period += n * tick_period; - if (tick_next_period < now) - tick_next_period += tick_period; - tick_sched_forward_next_period(); -} - /** * tick_resume_local - Resume the local tick device * @@ -444,8 +432,6 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); - tick_forward_next_period(); - clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) @@ -505,6 +491,7 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + system_state = SYSTEM_SUSPEND; timekeeping_suspend(); } else { tick_suspend_local(); @@ -528,6 +515,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 21efab7485ca..e277284c2831 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,12 +141,6 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -extern void tick_sched_forward_next_period(void); -#else -static inline void tick_sched_forward_next_period(void) { } -#endif - /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index c1f518e7aa80..6fe615d57ebb 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { - printk(KERN_INFO "Clockevents: " - "could not switch to one-shot mode:"); + pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { - printk(" no tick device\n"); + pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) - printk(" %s is not functional.\n", dev->name); + pr_cont(" %s is not functional.\n", dev->name); else - printk(" %s does not support one-shot mode.\n", - dev->name); + pr_cont(" %s does not support one-shot mode.\n", + dev->name); } return -EINVAL; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 646645e981f9..da9455a6b42b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -52,15 +52,6 @@ struct tick_sched *tick_get_tick_sched(int cpu) static ktime_t last_jiffies_update; /* - * Called after resume. Make sure that jiffies are not fast forwarded due to - * clock monotonic being forwarded by the suspended time. - */ -void tick_sched_forward_next_period(void) -{ - last_jiffies_update = tick_next_period; -} - -/* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) @@ -804,12 +795,12 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) return; } - hrtimer_set_expires(&ts->sched_timer, tick); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + } else { + hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); + } } static void tick_nohz_retain_tick(struct tick_sched *ts) diff --git a/kernel/time/time.c b/kernel/time/time.c index 3044d48ebe56..6fa99213fc72 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -407,7 +407,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -468,7 +467,6 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); -#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -853,9 +851,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, } int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts) + const struct __kernel_timespec __user *uts) { - struct timespec kts; + struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); @@ -863,6 +861,11 @@ int get_timespec64(struct timespec64 *ts, return -EFAULT; ts->tv_sec = kts.tv_sec; + + /* Zero out the padding for 32 bit systems or in compat mode */ + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + kts.tv_nsec &= 0xFFFFFFFFUL; + ts->tv_nsec = kts.tv_nsec; return 0; @@ -870,16 +873,61 @@ int get_timespec64(struct timespec64 *ts, EXPORT_SYMBOL_GPL(get_timespec64); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts) + struct __kernel_timespec __user *uts) { - struct timespec kts = { + struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); +int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca90219a1e73..4786df904c22 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,12 +138,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Update both bases so mono and raw stay coupled. */ - tk->tkr_mono.base += delta; - tk->tkr_raw.base += delta; - - /* Accumulate time spent in suspend */ - tk->time_suspended += delta; + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /* @@ -473,6 +468,36 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + + /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -680,18 +705,19 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * __getnstimeofday64 - Returns the time of day in a timespec64. + * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * - * Updates the time of day in the timespec. - * Returns 0 on success, or -ve when suspended (timespec will be undefined). + * Returns the time of day in a timespec64 (WARN if suspended). */ -int __getnstimeofday64(struct timespec64 *ts) +void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -702,28 +728,8 @@ int __getnstimeofday64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); - - /* - * Do not bail out early, in case there were callers still using - * the value, even in the face of the WARN_ON. - */ - if (unlikely(timekeeping_suspended)) - return -EAGAIN; - return 0; } -EXPORT_SYMBOL(__getnstimeofday64); - -/** - * getnstimeofday64 - Returns the time of day in a timespec64. - * @ts: pointer to the timespec64 to be set - * - * Returns the time of day in a timespec64 (WARN if suspended). - */ -void getnstimeofday64(struct timespec64 *ts) -{ - WARN_ON(__getnstimeofday64(ts)); -} -EXPORT_SYMBOL(getnstimeofday64); +EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { @@ -764,6 +770,7 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; @@ -788,6 +795,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) } EXPORT_SYMBOL_GPL(ktime_get_with_offset); +ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return base; + +} +EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); + /** * ktime_mono_to_any() - convert mononotic time to any other time * @tmono: time to convert. @@ -861,39 +887,6 @@ void ktime_get_ts64(struct timespec64 *ts) EXPORT_SYMBOL_GPL(ktime_get_ts64); /** - * ktime_get_active_ts64 - Get the active non-suspended monotonic clock - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime clock and - * the wall_to_monotonic offset, subtracts the accumulated suspend time and - * stores the result in normalized timespec64 format in the variable - * pointed to by @ts. - */ -void ktime_get_active_ts64(struct timespec64 *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tomono, tsusp; - u64 nsec, nssusp; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr_mono); - tomono = tk->wall_to_monotonic; - nssusp = tk->time_suspended; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec64_add_ns(ts, nsec + tomono.tv_nsec); - tsusp = ns_to_timespec64(nssusp); - *ts = timespec64_sub(*ts, tsusp); -} - -/** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non @@ -1417,12 +1410,12 @@ int timekeeping_notify(struct clocksource *clock) } /** - * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic64(struct timespec64 *ts) +void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -1438,7 +1431,7 @@ void getrawmonotonic64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } -EXPORT_SYMBOL(getrawmonotonic64); +EXPORT_SYMBOL(ktime_get_raw_ts64); /** @@ -1593,6 +1586,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -2125,7 +2119,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } @@ -2139,30 +2133,20 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); -struct timespec __current_kernel_time(void) +void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - - return timespec64_to_timespec(tk_xtime(tk)); -} - -struct timespec64 current_kernel_time64(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 now; unsigned long seq; do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - - return now; } -EXPORT_SYMBOL(current_kernel_time64); +EXPORT_SYMBOL(ktime_get_coarse_real_ts64); -struct timespec64 get_monotonic_coarse64(void) +void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; @@ -2175,12 +2159,10 @@ struct timespec64 get_monotonic_coarse64(void) mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - - return now; } -EXPORT_SYMBOL(get_monotonic_coarse64); +EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock @@ -2195,6 +2177,7 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2204,7 +2187,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_tai) + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2221,6 +2204,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 79b67f5e0343..7a9b4eb7a1d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,6 +6,7 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4a4fd567fb26..cc2d23e6ff61 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1251,18 +1251,18 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Note: For !irqsafe timers, you must not hold locks that are held in * interrupt context while calling this function. Even if the lock has - * nothing to do with the timer in question. Here's why: + * nothing to do with the timer in question. Here's why:: * * CPU0 CPU1 * ---- ---- - * <SOFTIRQ> - * call_timer_fn(); - * base->running_timer = mytimer; - * spin_lock_irq(somelock); + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); - * del_timer_sync(mytimer); - * while (base->running_timer == mytimer); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); * * Now del_timer_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 0ed768b56c60..d647dabdac97 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -28,8 +28,6 @@ struct timer_list_iter { u64 now; }; -typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): @@ -372,24 +370,12 @@ static const struct seq_operations timer_list_sops = { .show = timer_list_show, }; -static int timer_list_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &timer_list_sops, - sizeof(struct timer_list_iter)); -} - -static const struct file_operations timer_list_fops = { - .open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0400, NULL, &timer_list_fops); + pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, + sizeof(struct timer_list_iter), NULL); if (!pe) return -ENOMEM; return 0; diff --git a/kernel/torture.c b/kernel/torture.c index 37b94012a3f8..3de1efbecd6a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -574,7 +574,7 @@ void stutter_wait(const char *title) { int spt; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d88e96d4e12c..56ba0f2a01db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + u32 *ids, prog_cnt, ids_len; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -985,16 +986,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; - if (query.ids_len > BPF_TRACE_MAX_PROGS) + + ids_len = query.ids_len; + if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; + ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; + /* + * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which + * is required when user only wants to check for uquery->prog_cnt. + * There is no need to check for it since the case is handled + * gracefully in bpf_prog_array_copy_info. + */ mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - uquery->ids, - query.ids_len, - &uquery->prog_cnt); + ids, + ids_len, + &prog_cnt); mutex_unlock(&bpf_event_mutex); + if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || + copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) + ret = -EFAULT; + + kfree(ids); return ret; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 16bbf062018f..8d83bcf9ef69 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5514,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0644, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0444, d_tracer, + trace_create_file("set_graph_notrace", 0644, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dfbcf9ee1447..bcd93031d042 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -893,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance(struct trace_array *tr) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -949,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); -static int alloc_snapshot(struct trace_array *tr) +int tracing_alloc_snapshot_instance(struct trace_array *tr) { int ret; @@ -995,7 +995,7 @@ int tracing_alloc_snapshot(void) struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; @@ -1165,7 +1165,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_mono_fast_ns, "boot", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -5408,7 +5408,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } @@ -6451,7 +6451,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, } #endif if (!tr->allocated_snapshot) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; } @@ -7179,7 +7179,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6fb46a06c9dc..507954b4e058 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1817,6 +1817,17 @@ static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif +#ifdef CONFIG_TRACER_SNAPSHOT +void tracing_snapshot_instance(struct trace_array *tr); +int tracing_alloc_snapshot_instance(struct trace_array *tr); +#else +static inline void tracing_snapshot_instance(struct trace_array *tr) { } +static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + return 0; +} +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 22fee766081b..80e0b2aca703 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg) * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * - * Note the _rcu_qs() version of cond_resched() will + * Note the tasks_rcu_qs() version of cond_resched() will * notify synchronize_rcu_tasks() that this thread has * passed a quiescent state for rcu_tasks. Otherwise * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_tasks_rcu_qs(); } return 0; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e954ae3d82c0..e3a658bac10f 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -356,7 +356,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9b4716bb8bb0..7d306b74230f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -762,6 +762,9 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { + if (len < r->len) + return 0; + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; @@ -1499,14 +1502,14 @@ static int process_preds(struct trace_event_call *call, return ret; } - if (!nr_preds) { - prog = NULL; - } else { - prog = predicate_parse(filter_string, nr_parens, nr_preds, + if (!nr_preds) + return -EINVAL; + + prog = predicate_parse(filter_string, nr_parens, nr_preds, parse_pred, call, pe); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (IS_ERR(prog)) + return PTR_ERR(prog); + rcu_assign_pointer(filter->prog, prog); return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d7b3ffbecc2..b9061ed59bbd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { + hist_err("Invalid field modifier: ", modifier); field = ERR_PTR(-EINVAL); goto out; } @@ -2481,6 +2482,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { + hist_err("Couldn't find field: ", field_name); field = ERR_PTR(-EINVAL); goto out; } @@ -4913,6 +4915,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); + + if (hist_field->flags) { + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && + !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + const char *flags = get_hist_field_flags(hist_field); + + if (flags) + seq_printf(m, ".%s", flags); + } + } } static int event_hist_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d251cabcf69a..8b5bdcf64871 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -483,9 +483,10 @@ clear_event_triggers(struct trace_array *tr) struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { - struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + struct event_trigger_data *data, *n; + list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); + list_del_rcu(&data->list); if (data->ops->free) data->ops->free(data->ops, data); } @@ -642,6 +643,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->count = -1; trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = file; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1053,7 +1055,12 @@ static void snapshot_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - tracing_snapshot(); + struct trace_event_file *file = data->private_data; + + if (file) + tracing_snapshot_instance(file->tr); + else + tracing_snapshot(); } static void @@ -1076,7 +1083,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, { int ret = register_trigger(glob, ops, data, file); - if (ret > 0 && tracing_alloc_snapshot() != 0) { + if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) { unregister_trigger(glob, ops, data, file); ret = 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1cd3fb4d70f8..02aed76e0978 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warn("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3c7bfc4bf5e9..4237eba4ef20 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -472,7 +472,7 @@ static __init int stack_trace_init(void) NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0444, d_tracer, + trace_create_file("stack_trace_filter", 0644, d_tracer, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34fd0e0ec51d..ac892878dbe6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -55,6 +55,7 @@ struct trace_uprobe { struct list_head list; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; + struct path path; struct inode *inode; char *filename; unsigned long offset; @@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); - iput(tu->inode); + path_put(&tu->path); kfree(tu->tp.call.class->system); kfree(tu->tp.call.name); kfree(tu->filename); @@ -363,7 +364,6 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - struct inode *inode; char *arg, *event, *group, *filename; char buf[MAX_EVENT_NAME_LEN]; struct path path; @@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv) bool is_delete, is_return; int i, ret; - inode = NULL; ret = 0; is_delete = false; is_return = false; @@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv) } /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(argv[1], ':'); - if (!arg) { - ret = -EINVAL; - goto fail_address_parse; - } + if (!arg) + return -EINVAL; *arg++ = '\0'; filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) - goto fail_address_parse; - - inode = igrab(d_real_inode(path.dentry)); - path_put(&path); + return ret; - if (!inode || !S_ISREG(inode->i_mode)) { + if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; } @@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); if (!tu->filename) { @@ -558,7 +552,7 @@ error: return ret; fail_address_parse: - iput(inode); + path_put(&path); pr_info("Failed to parse address or file.\n"); @@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, goto err_flags; tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) goto err_buffer; @@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); @@ -1337,7 +1333,6 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) { struct trace_uprobe *tu; - struct inode *inode; struct path path; int ret; @@ -1345,11 +1340,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (ret) return ERR_PTR(ret); - inode = igrab(d_inode(path.dentry)); - path_put(&path); - - if (!inode || !S_ISREG(inode->i_mode)) { - iput(inode); + if (!d_is_reg(path.dentry)) { + path_put(&path); return ERR_PTR(-EINVAL); } @@ -1364,11 +1356,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + path_put(&path); return ERR_CAST(tu); } tu->offset = offs; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 671b13457387..1e37da2e0c25 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca7959be8aaa..7ea75529eabb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,7 +66,7 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * attach_mutex to avoid changing binding state while + * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ @@ -123,7 +123,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * A: pool->attach_mutex protected. + * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -166,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ - struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -297,6 +296,7 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ +static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ @@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @worker: iteration cursor * @pool: worker_pool to iterate workers of * - * This must be called with @pool->attach_mutex. + * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ - if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ + if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** @@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int node) static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any @@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker, set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains - * stable across this function. See the comments above the - * flag definition for details. + * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the flag + * definition for details. */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; list_add_tail(&worker->node, &pool->workers); + worker->pool = pool; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool - * @pool: the pool @worker is attached to * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ -static void worker_detach_from_pool(struct worker *worker, - struct worker_pool *pool) +static void worker_detach_from_pool(struct worker *worker) { + struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); + list_del(&worker->node); + worker->pool = NULL; + if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); @@ -1799,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool) if (!worker) goto fail; - worker->pool = pool; worker->id = id; if (pool->cpu >= 0) @@ -2086,6 +2088,12 @@ __acquires(&pool->lock) worker->current_pwq = pwq; work_color = get_work_color(work); + /* + * Record wq name for cmdline and debug reporting, may get + * overridden through set_worker_desc(). + */ + strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); + list_del_init(&work->entry); /* @@ -2181,7 +2189,6 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } @@ -2206,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker) } } +static void set_pf_worker(bool val) +{ + mutex_lock(&wq_pool_attach_mutex); + if (val) + current->flags |= PF_WQ_WORKER; + else + current->flags &= ~PF_WQ_WORKER; + mutex_unlock(&wq_pool_attach_mutex); +} + /** * worker_thread - the worker thread function * @__worker: self @@ -2224,7 +2241,7 @@ static int worker_thread(void *__worker) struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ - worker->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); woke_up: spin_lock_irq(&pool->lock); @@ -2232,11 +2249,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); - worker->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); ida_simple_remove(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker, pool); + worker_detach_from_pool(worker); kfree(worker); return 0; } @@ -2335,7 +2352,7 @@ static int rescuer_thread(void *__rescuer) * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ - rescuer->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); repeat: set_current_state(TASK_IDLE); @@ -2367,7 +2384,6 @@ repeat: worker_attach_to_pool(rescuer, pool); spin_lock_irq(&pool->lock); - rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and @@ -2417,10 +2433,9 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - rescuer->pool = NULL; spin_unlock_irq(&pool->lock); - worker_detach_from_pool(rescuer, pool); + worker_detach_from_pool(rescuer); spin_lock_irq(&wq_mayday_lock); } @@ -2429,7 +2444,7 @@ repeat: if (should_stop) { __set_current_state(TASK_RUNNING); - rescuer->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); return 0; } @@ -3271,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); - mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); @@ -3354,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); @@ -4347,7 +4361,6 @@ void set_worker_desc(const char *fmt, ...) va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); - worker->desc_valid = true; } } @@ -4371,7 +4384,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; - bool desc_valid = false; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) @@ -4384,22 +4396,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) worker = kthread_probe_data(task); /* - * Carefully copy the associated workqueue's workfn and name. Keep - * the original last '\0' in case the original contains garbage. + * Carefully copy the associated workqueue's workfn, name and desc. + * Keep the original last '\0' in case the original is garbage. */ probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); probe_kernel_read(name, wq->name, sizeof(name) - 1); - - /* copy worker description */ - probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); - if (desc_valid) - probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %pf", log_lvl, name, fn); - if (desc[0]) + if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } @@ -4579,6 +4587,47 @@ void show_workqueue_state(void) rcu_read_unlock_sched(); } +/* used to show worker information through /proc/PID/{comm,stat,status} */ +void wq_worker_comm(char *buf, size_t size, struct task_struct *task) +{ + int off; + + /* always show the actual comm */ + off = strscpy(buf, task->comm, size); + if (off < 0) + return; + + /* stabilize PF_WQ_WORKER and worker pool association */ + mutex_lock(&wq_pool_attach_mutex); + + if (task->flags & PF_WQ_WORKER) { + struct worker *worker = kthread_data(task); + struct worker_pool *pool = worker->pool; + + if (pool) { + spin_lock_irq(&pool->lock); + /* + * ->desc tracks information (wq name or + * set_worker_desc()) for the latest execution. If + * current, prepend '+', otherwise '-'. + */ + if (worker->desc[0] != '\0') { + if (worker->current_work) + scnprintf(buf + off, size - off, "+%s", + worker->desc); + else + scnprintf(buf + off, size - off, "-%s", + worker->desc); + } + spin_unlock_irq(&pool->lock); + } + } + + mutex_unlock(&wq_pool_attach_mutex); +} + +#ifdef CONFIG_SMP + /* * CPU hotplug. * @@ -4600,7 +4649,7 @@ static void unbind_workers(int cpu) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); spin_lock_irq(&pool->lock); /* @@ -4616,7 +4665,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4657,7 +4706,7 @@ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4727,7 +4776,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) static cpumask_t cpumask; struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4762,14 +4811,14 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /* update NUMA affinity of unbound workqueues */ @@ -4799,8 +4848,6 @@ int workqueue_offline_cpu(unsigned int cpu) return 0; } -#ifdef CONFIG_SMP - struct work_for_cpu { struct work_struct work; long (*fn)(void *); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index d390d1be3748..66fbb5a9e633 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -31,13 +31,12 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ - bool desc_valid; /* ->desc is valid */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ struct task_struct *task; /* I: worker task */ - struct worker_pool *pool; /* I: the associated pool */ + struct worker_pool *pool; /* A: the associated pool */ /* L: for rescuers */ struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ |