diff options
Diffstat (limited to 'kernel')
82 files changed, 2298 insertions, 979 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 320f1f3941b7..e8a6715f38dc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,9 @@ KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector +# Don't instrument error handlers +CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI) + obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -111,6 +114,7 @@ obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o +obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 551a394bc8f4..121d37e700a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2132,7 +2132,7 @@ int audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - security_task_getsecid(current, &sid); + security_task_getsecid_subj(current, &sid); if (!sid) return 0; @@ -2353,7 +2353,7 @@ int audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid(current, &audit_sig_sid); + security_task_getsecid_subj(current, &audit_sig_sid); } return audit_signal_info_syscall(t); diff --git a/kernel/audit.h b/kernel/audit.h index 3b9c0945225a..1522e100fd17 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -292,8 +292,8 @@ extern void audit_filter_inodes(struct task_struct *tsk, extern struct list_head *audit_killed_trees(void); #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 -#define audit_put_watch(w) {} -#define audit_get_watch(w) {} +#define audit_put_watch(w) do { } while (0) +#define audit_get_watch(w) do { } while (0) #define audit_to_watch(k, p, l, o) (-EINVAL) #define audit_add_watch(k, l) (-EINVAL) #define audit_remove_watch_rule(k) BUG() @@ -302,8 +302,8 @@ extern struct list_head *audit_killed_trees(void); #define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) #define 
audit_mark_path(m) "" -#define audit_remove_mark(m) -#define audit_remove_mark_rule(k) +#define audit_remove_mark(m) do { } while (0) +#define audit_remove_mark_rule(k) do { } while (0) #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) @@ -311,8 +311,8 @@ extern struct list_head *audit_killed_trees(void); #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL #define audit_make_tree(rule, str, op) -EINVAL -#define audit_trim_trees() (void)0 -#define audit_put_tree(tree) (void)0 +#define audit_trim_trees() do { } while (0) +#define audit_put_tree(tree) do { } while (0) #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ #define audit_kill_trees(context) BUG() diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 333b3bcfc545..db2c6b59dfc3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1359,7 +1359,8 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_task_getsecid(current, &sid); + security_task_getsecid_subj(current, + &sid); result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 47fb48f42c93..175ef6f3ea4e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -667,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->lsm_rule) { if (need_sid) { - security_task_getsecid(tsk, &sid); + security_task_getsecid_subj(tsk, &sid); need_sid = 0; } result = security_audit_rule_match(sid, f->type, @@ -805,8 +805,7 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val) * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 
*/ static void audit_filter_syscall(struct task_struct *tsk, - struct audit_context *ctx, - struct list_head *list) + struct audit_context *ctx) { struct audit_entry *e; enum audit_state state; @@ -815,7 +814,7 @@ static void audit_filter_syscall(struct task_struct *tsk, return; rcu_read_lock(); - list_for_each_entry_rcu(e, list, list) { + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { @@ -1627,8 +1626,7 @@ void __audit_free(struct task_struct *tsk) context->return_valid = AUDITSC_INVALID; context->return_code = 0; - audit_filter_syscall(tsk, context, - &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_syscall(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(); @@ -1735,8 +1733,7 @@ void __audit_syscall_exit(int success, long return_code) else context->return_code = return_code; - audit_filter_syscall(current, context, - &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(); @@ -2400,7 +2397,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &context->target_sid); + security_task_getsecid_obj(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2427,7 +2424,7 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &ctx->target_sid); + security_task_getsecid_obj(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); return 0; } @@ -2448,7 +2445,7 @@ int audit_signal_info_syscall(struct 
task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid(t, &axp->target_sid[axp->pid_count]); + security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1622a44d1617..0ff58259ccf8 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -209,7 +209,8 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3acc7e0b6916..faa54d58972c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -84,7 +84,7 @@ static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", [BPF_OR >> 4] = "or", - [BPF_XOR >> 4] = "or", + [BPF_XOR >> 4] = "xor", }; static const char *const bpf_ldst_string[] = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1576ff331ee4..d2de2abec35b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -543,11 +543,11 @@ int bpf_obj_get_user(const char __user *pathname, int flags) return PTR_ERR(raw); if (type == BPF_TYPE_PROG) - ret = bpf_prog_new_fd(raw); + ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else if (type == BPF_TYPE_LINK) - ret = bpf_link_new_fd(raw); + ret = (f_flags != O_RDWR) ? 
-EINVAL : bpf_link_new_fd(raw); else return -ENOENT; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be35bfb7fb13..6fbc2abe9c91 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -517,9 +517,17 @@ const struct bpf_func_proto bpf_get_stack_proto = { BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = task_pt_regs(task); + struct pt_regs *regs; + long res; - return __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (!try_get_task_stack(task)) + return -EFAULT; + + regs = task_pt_regs(task); + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + put_task_stack(task); + + return res; } BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1f3a4be4b175..4aa8b52adf25 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -9,6 +9,7 @@ #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> +#include <linux/module.h> /* dummy _ops. The verifier will operate on target program's ops. 
*/ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -87,6 +88,26 @@ out: return tr; } +static int bpf_trampoline_module_get(struct bpf_trampoline *tr) +{ + struct module *mod; + int err = 0; + + preempt_disable(); + mod = __module_text_address((unsigned long) tr->func.addr); + if (mod && !try_module_get(mod)) + err = -ENOENT; + preempt_enable(); + tr->mod = mod; + return err; +} + +static void bpf_trampoline_module_put(struct bpf_trampoline *tr) +{ + module_put(tr->mod); + tr->mod = NULL; +} + static int is_ftrace_location(void *ip) { long addr; @@ -108,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) ret = unregister_ftrace_direct((long)ip, (long)old_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + + if (!ret) + bpf_trampoline_module_put(tr); return ret; } @@ -134,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; tr->func.ftrace_managed = ret; + if (bpf_trampoline_module_get(tr)) + return -ENOENT; + if (tr->func.ftrace_managed) ret = register_ftrace_direct((long)ip, (long)new_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + + if (ret) + bpf_trampoline_module_put(tr); return ret; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44e4ec1640f1..0399ac092b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5856,40 +5856,51 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) return &env->insn_aux_data[env->insn_idx]; } +enum { + REASON_BOUNDS = -1, + REASON_TYPE = -2, + REASON_PATHS = -3, + REASON_LIMIT = -4, + REASON_STACK = -5, +}; + static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - u32 *ptr_limit, u8 opcode, bool off_is_neg) + const struct bpf_reg_state *off_reg, + u32 *alu_limit, u8 opcode) { + bool off_is_neg = off_reg->smin_value < 0; bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off, max; + 
u32 max = 0, ptr_limit = 0; + + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return REASON_BOUNDS; switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the - * left direction, see BPF_REG_FP. + * left direction, see BPF_REG_FP. Also, unknown scalar + * offset where we would need to deal with min/max bounds is + * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - /* Indirect variable offset stack access is prohibited in - * unprivileged mode so it's not handled here. - */ - off = ptr_reg->off + ptr_reg->var_off.value; - if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off; - else - *ptr_limit = -off - 1; - return *ptr_limit >= max ? -ERANGE : 0; + ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; - } else { - off = ptr_reg->smin_value + ptr_reg->off; - *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; - } - return *ptr_limit >= max ? -ERANGE : 0; + ptr_limit = (mask_to_left ? + ptr_reg->smin_value : + ptr_reg->umax_value) + ptr_reg->off; + break; default: - return -EINVAL; + return REASON_TYPE; } + + if (ptr_limit >= max) + return REASON_LIMIT; + *alu_limit = ptr_limit; + return 0; } static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, @@ -5907,7 +5918,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, if (aux->alu_state && (aux->alu_state != alu_state || aux->alu_limit != alu_limit)) - return -EACCES; + return REASON_PATHS; /* Corresponding fixup done in fixup_bpf_calls(). 
*/ aux->alu_state = alu_state; @@ -5926,14 +5937,22 @@ static int sanitize_val_alu(struct bpf_verifier_env *env, return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); } +static bool sanitize_needed(u8 opcode) +{ + return opcode == BPF_ADD || opcode == BPF_SUB; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, - bool off_is_neg) + struct bpf_insn_aux_data *tmp_aux, + const bool commit_window) { + struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_insn_aux_data *aux = cur_aux(env); + bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -5951,18 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; - alu_state |= ptr_is_dst_reg ? - BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - - err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); + err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); if (err < 0) return err; + if (commit_window) { + /* In commit phase we narrow the masking window based on + * the observed pointer move after the simulated operation. + */ + alu_state = tmp_aux->alu_state; + alu_limit = abs(tmp_aux->alu_limit - alu_limit); + } else { + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + } + err = update_alu_sanitation_state(aux, alu_state, alu_limit); if (err < 0) return err; do_sim: + /* If we're in commit phase, we're done here given we already + * pushed the truncated dst_reg into the speculative verification + * stack. 
+ */ + if (commit_window) + return 0; + /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of * masking when off was not within expected range. If off @@ -5979,7 +6013,46 @@ do_sim: ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; - return !ret ? -EFAULT : 0; + return !ret ? REASON_STACK : 0; +} + +static int sanitize_err(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int reason, + const struct bpf_reg_state *off_reg, + const struct bpf_reg_state *dst_reg) +{ + static const char *err = "pointer arithmetic with it prohibited for !root"; + const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub"; + u32 dst = insn->dst_reg, src = insn->src_reg; + + switch (reason) { + case REASON_BOUNDS: + verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", + off_reg == dst_reg ? dst : src, err); + break; + case REASON_TYPE: + verbose(env, "R%d has pointer with unsupported alu operation, %s\n", + off_reg == dst_reg ? 
src : dst, err); + break; + case REASON_PATHS: + verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", + dst, op, err); + break; + case REASON_LIMIT: + verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", + dst, op, err); + break; + case REASON_STACK: + verbose(env, "R%d could not be pushed for speculative verification, %s\n", + dst, err); + break; + default: + verbose(env, "verifier internal error: unknown reason (%d)\n", + reason); + break; + } + + return -EACCES; } /* check that stack access falls within stack limits and that 'reg' doesn't @@ -6016,6 +6089,37 @@ static int check_stack_access_for_ptr_arithmetic( return 0; } +static int sanitize_check_bounds(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + const struct bpf_reg_state *dst_reg) +{ + u32 dst = insn->dst_reg; + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (env->bypass_spec_v1) + return 0; + + switch (dst_reg->type) { + case PTR_TO_STACK: + if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, + dst_reg->off + dst_reg->var_off.value)) + return -EACCES; + break; + case PTR_TO_MAP_VALUE: + if (check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + break; + default: + break; + } + + return 0; +} /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. 
@@ -6035,8 +6139,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - u32 dst = insn->dst_reg, src = insn->src_reg; + struct bpf_insn_aux_data tmp_aux = {}; u8 opcode = BPF_OP(insn->code); + u32 dst = insn->dst_reg; int ret; dst_reg = &regs[dst]; @@ -6084,13 +6189,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; - case PTR_TO_MAP_VALUE: - if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { - verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", - off_reg == dst_reg ? dst : src); - return -EACCES; - } - fallthrough; default: break; } @@ -6108,13 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* pointer types do not carry 32-bit bounds at the moment. 
*/ __mark_reg32_unbounded(dst_reg); + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, + &tmp_aux, false); + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); + } + switch (opcode) { case BPF_ADD: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); - if (ret < 0) { - verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); - return ret; - } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -6165,11 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); - if (ret < 0) { - verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); - return ret; - } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -6250,21 +6345,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); - /* For unprivileged we require that resulting offset must be in bounds - * in order to be able to sanitize access later on. 
- */ - if (!env->bypass_spec_v1) { - if (dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, " - "prohibited for !root\n", dst); - return -EACCES; - } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access_for_ptr_arithmetic( - env, dst, dst_reg, dst_reg->off + - dst_reg->var_off.value)) { - return -EACCES; - } + if (sanitize_check_bounds(env, insn, dst_reg) < 0) + return -EACCES; + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, + &tmp_aux, true); + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); } return 0; @@ -6858,9 +6945,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s32 s32_min_val, s32_max_val; u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - u32 dst = insn->dst_reg; - int ret; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); + int ret; smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; @@ -6902,6 +6988,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + if (sanitize_needed(opcode)) { + ret = sanitize_val_alu(env, insn); + if (ret < 0) + return sanitize_err(env, insn, ret, NULL, NULL); + } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. 
* There are two classes of instructions: The first class we track both * alu32 and alu64 sign/unsigned bounds independently this provides the @@ -6918,21 +7010,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ switch (opcode) { case BPF_ADD: - ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to add from different pointers or scalars\n", dst); - return ret; - } scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); - return ret; - } scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); @@ -12158,6 +12240,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) u32 btf_id, member_idx; const char *mname; + if (!prog->gpl_compatible) { + verbose(env, "struct ops programs must have a GPL compatible license\n"); + return -EINVAL; + } + btf_id = prog->aux->attach_btf_id; st_ops = bpf_struct_ops_find(btf_id); if (!st_ops) { diff --git a/kernel/cfi.c b/kernel/cfi.c new file mode 100644 index 000000000000..e17a56639766 --- /dev/null +++ b/kernel/cfi.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) error and slowpath handling. 
+ * + * Copyright (C) 2021 Google LLC + */ + +#include <linux/hardirq.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/rcupdate.h> +#include <linux/vmalloc.h> +#include <asm/cacheflush.h> +#include <asm/set_memory.h> + +/* Compiler-defined handler names */ +#ifdef CONFIG_CFI_PERMISSIVE +#define cfi_failure_handler __ubsan_handle_cfi_check_fail +#else +#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort +#endif + +static inline void handle_cfi_failure(void *ptr) +{ + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) + WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr); + else + panic("CFI failure (target: %pS)\n", ptr); +} + +#ifdef CONFIG_MODULES +#ifdef CONFIG_CFI_CLANG_SHADOW +/* + * Index type. A 16-bit index can address at most (2^16)-2 pages (taking + * into account SHADOW_INVALID), i.e. ~256M with 4k pages. + */ +typedef u16 shadow_t; +#define SHADOW_INVALID ((shadow_t)~0UL) + +struct cfi_shadow { + /* Page index for the beginning of the shadow */ + unsigned long base; + /* An array of __cfi_check locations (as indices to the shadow) */ + shadow_t shadow[1]; +} __packed; + +/* + * The shadow covers ~128M from the beginning of the module region. If + * the region is larger, we fall back to __module_address for the rest. 
+ */ +#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT) + +/* The in-memory size of struct cfi_shadow, always at least one page */ +#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT) +#define SHADOW_PAGES max(1UL, __SHADOW_PAGES) +#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT) + +/* The actual size of the shadow array, minus metadata */ +#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow)) +#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t)) + +static DEFINE_MUTEX(shadow_update_lock); +static struct cfi_shadow __rcu *cfi_shadow __read_mostly; + +/* Returns the index in the shadow for the given address */ +static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr) +{ + unsigned long index; + unsigned long page = ptr >> PAGE_SHIFT; + + if (unlikely(page < s->base)) + return -1; /* Outside of module area */ + + index = page - s->base; + + if (index >= SHADOW_ARR_SLOTS) + return -1; /* Cannot be addressed with shadow */ + + return (int)index; +} + +/* Returns the page address for an index in the shadow */ +static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s, + int index) +{ + if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) + return 0; + + return (s->base + index) << PAGE_SHIFT; +} + +/* Returns the __cfi_check function address for the given shadow location */ +static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s, + int index) +{ + if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) + return 0; + + if (unlikely(s->shadow[index] == SHADOW_INVALID)) + return 0; + + /* __cfi_check is always page aligned */ + return (s->base + s->shadow[index]) << PAGE_SHIFT; +} + +static void prepare_next_shadow(const struct cfi_shadow __rcu *prev, + struct cfi_shadow *next) +{ + int i, index, check; + + /* Mark everything invalid */ + memset(next->shadow, 0xFF, SHADOW_ARR_SIZE); + + if (!prev) + return; /* No previous shadow */ + + /* If the base address didn't 
change, an update is not needed */ + if (prev->base == next->base) { + memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE); + return; + } + + /* Convert the previous shadow to the new address range */ + for (i = 0; i < SHADOW_ARR_SLOTS; ++i) { + if (prev->shadow[i] == SHADOW_INVALID) + continue; + + index = ptr_to_shadow(next, shadow_to_ptr(prev, i)); + if (index < 0) + continue; + + check = ptr_to_shadow(next, + shadow_to_check_fn(prev, prev->shadow[i])); + if (check < 0) + continue; + + next->shadow[index] = (shadow_t)check; + } +} + +static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod, + unsigned long min_addr, unsigned long max_addr) +{ + int check_index; + unsigned long check = (unsigned long)mod->cfi_check; + unsigned long ptr; + + if (unlikely(!PAGE_ALIGNED(check))) { + pr_warn("cfi: not using shadow for module %s\n", mod->name); + return; + } + + check_index = ptr_to_shadow(s, check); + if (check_index < 0) + return; /* Module not addressable with shadow */ + + /* For each page, store the check function index in the shadow */ + for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { + int index = ptr_to_shadow(s, ptr); + + if (index >= 0) { + /* Each page must only contain one module */ + WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID); + s->shadow[index] = (shadow_t)check_index; + } + } +} + +static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod, + unsigned long min_addr, unsigned long max_addr) +{ + unsigned long ptr; + + for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { + int index = ptr_to_shadow(s, ptr); + + if (index >= 0) + s->shadow[index] = SHADOW_INVALID; + } +} + +typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *, + unsigned long min_addr, unsigned long max_addr); + +static void update_shadow(struct module *mod, unsigned long base_addr, + update_shadow_fn fn) +{ + struct cfi_shadow *prev; + struct cfi_shadow *next; + unsigned long min_addr, max_addr; + + next = 
vmalloc(SHADOW_SIZE); + + mutex_lock(&shadow_update_lock); + prev = rcu_dereference_protected(cfi_shadow, + mutex_is_locked(&shadow_update_lock)); + + if (next) { + next->base = base_addr >> PAGE_SHIFT; + prepare_next_shadow(prev, next); + + min_addr = (unsigned long)mod->core_layout.base; + max_addr = min_addr + mod->core_layout.text_size; + fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK); + + set_memory_ro((unsigned long)next, SHADOW_PAGES); + } + + rcu_assign_pointer(cfi_shadow, next); + mutex_unlock(&shadow_update_lock); + synchronize_rcu(); + + if (prev) { + set_memory_rw((unsigned long)prev, SHADOW_PAGES); + vfree(prev); + } +} + +void cfi_module_add(struct module *mod, unsigned long base_addr) +{ + update_shadow(mod, base_addr, add_module_to_shadow); +} + +void cfi_module_remove(struct module *mod, unsigned long base_addr) +{ + update_shadow(mod, base_addr, remove_module_from_shadow); +} + +static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s, + unsigned long ptr) +{ + int index; + + if (unlikely(!s)) + return NULL; /* No shadow available */ + + index = ptr_to_shadow(s, ptr); + if (index < 0) + return NULL; /* Cannot be addressed with shadow */ + + return (cfi_check_fn)shadow_to_check_fn(s, index); +} + +static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) +{ + cfi_check_fn fn; + + rcu_read_lock_sched(); + fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); + rcu_read_unlock_sched(); + + return fn; +} + +#else /* !CONFIG_CFI_CLANG_SHADOW */ + +static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) +{ + return NULL; +} + +#endif /* CONFIG_CFI_CLANG_SHADOW */ + +static inline cfi_check_fn find_module_check_fn(unsigned long ptr) +{ + cfi_check_fn fn = NULL; + struct module *mod; + + rcu_read_lock_sched(); + mod = __module_address(ptr); + if (mod) + fn = mod->cfi_check; + rcu_read_unlock_sched(); + + return fn; +} + +static inline cfi_check_fn find_check_fn(unsigned long ptr) +{ + cfi_check_fn 
fn = NULL; + + if (is_kernel_text(ptr)) + return __cfi_check; + + /* + * Indirect call checks can happen when RCU is not watching. Both + * the shadow and __module_address use RCU, so we need to wake it + * up if necessary. + */ + RCU_NONIDLE({ + if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) + fn = find_shadow_check_fn(ptr); + + if (!fn) + fn = find_module_check_fn(ptr); + }); + + return fn; +} + +void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +{ + cfi_check_fn fn = find_check_fn((unsigned long)ptr); + + if (likely(fn)) + fn(id, ptr, diag); + else /* Don't allow unchecked modules */ + handle_cfi_failure(ptr); +} +EXPORT_SYMBOL(__cfi_slowpath_diag); + +#else /* !CONFIG_MODULES */ + +void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +{ + handle_cfi_failure(ptr); /* No modules */ +} +EXPORT_SYMBOL(__cfi_slowpath_diag); + +#endif /* CONFIG_MODULES */ + +void cfi_failure_handler(void *data, void *ptr, void *vtable) +{ + handle_cfi_failure(ptr); +} +EXPORT_SYMBOL(cfi_failure_handler); diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 5d7a76bfbbb7..12f8457ad1f9 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a5751784ad74..391aa570369b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -727,7 +727,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) stats->nr_stopped++; break; default: - if (delayacct_is_task_waiting_on_io(tsk)) + if (tsk->in_iowait) stats->nr_io_wait++; break; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5258b68153e0..a945504c0ae7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -585,7 +585,7 @@ 
static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* On legacy hiearchy, we must be a subset of our parent cpuset. */ + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ ret = -EACCES; if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; @@ -1726,7 +1726,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * When configured nodemask is changed, the effective nodemasks of this cpuset * and all its descendants need to be updated. * - * On legacy hiearchy, effective_mems will be the same with mems_allowed. + * On legacy hierarchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ @@ -2500,7 +2500,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) BUG(); } - /* Unrechable but makes gcc happy */ + /* Unreachable but makes gcc happy */ return 0; } diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c new file mode 100644 index 000000000000..ec02d963cad1 --- /dev/null +++ b/kernel/cgroup/misc.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Miscellaneous cgroup controller + * + * Copyright 2020 Google LLC + * Author: Vipin Sharma <vipinsh@google.com> + */ + +#include <linux/limits.h> +#include <linux/cgroup.h> +#include <linux/errno.h> +#include <linux/atomic.h> +#include <linux/slab.h> +#include <linux/misc_cgroup.h> + +#define MAX_STR "max" +#define MAX_NUM ULONG_MAX + +/* Miscellaneous res name, keep it in sync with enum misc_res_type */ +static const char *const misc_res_name[] = { +#ifdef CONFIG_KVM_AMD_SEV + /* AMD SEV ASIDs resource */ + "sev", + /* AMD SEV-ES ASIDs resource */ + "sev_es", +#endif +}; + +/* Root misc cgroup */ +static struct misc_cg root_cg; + +/* + * Miscellaneous resources capacity for the entire machine. 0 capacity means + * resource is not initialized or not present in the host. + * + * root_cg.max and capacity are independent of each other. 
root_cg.max can be
+ * more than the actual capacity. We are using the Limits resource
+ * distribution model of cgroup for the miscellaneous controller.
+ */
+static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+	return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * A type is valid when it lies in the range [0, MISC_CG_RES_TYPES).
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+	return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Get the current total usage of the resource.
+ * @type: misc res type.
+ *
+ * The total is read from the usage counter of the root misc cgroup.
+ *
+ * Context: Any context.
+ * Return: Current total usage of the resource, or 0 if @type is invalid.
+ */
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+	if (valid_type(type))
+		return atomic_long_read(&root_cg.res[type].usage);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+{
+	if (!valid_type(type))
+		return -EINVAL;
+
+	WRITE_ONCE(misc_res_capacity[type], capacity);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Only the usage counter of @cg itself is adjusted; ancestors are left to the
+ * caller. A one-time warning is emitted if the counter drops below zero.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+				  unsigned long amount)
+{
+	WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+		  "misc cgroup resource %s became less than 0",
+		  misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * The charge is applied to @cg and to each of its ancestors in turn. If any
+ * level rejects it, the charges already applied are cancelled again. An
+ * @amount of 0 succeeds without charging anything.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid, @cg is NULL or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ *	      capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+		       unsigned long amount)
+{
+	struct misc_cg *i, *j;
+	int ret;
+	struct misc_res *res;
+	unsigned long new_usage;	/* full width; int would truncate */
+
+	if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+		return -EINVAL;
+
+	if (!amount)
+		return 0;
+
+	/* Charge @cg first, then every ancestor up to the root. */
+	for (i = cg; i; i = parent_misc(i)) {
+		res = &i->res[type];
+
+		new_usage = atomic_long_add_return(amount, &res->usage);
+		if (new_usage > READ_ONCE(res->max) ||
+		    new_usage > READ_ONCE(misc_res_capacity[type])) {
+			/* Log the rejection only once per cgroup and resource. */
+			if (!res->failed) {
+				pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
+					misc_res_name[type]);
+				pr_cont_cgroup_path(i->css.cgroup);
+				pr_cont("\n");
+				res->failed = true;
+			}
+			ret = -EBUSY;
+			goto err_charge;
+		}
+	}
+	return 0;
+
+err_charge:
+	/* Undo the charge on every level below @i, then on @i itself. */
+	for (j = cg; j != i; j = parent_misc(j))
+		misc_cg_cancel_charge(type, j, amount);
+	misc_cg_cancel_charge(type, i, amount);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * The charge is cancelled on @cg and on each of its ancestors. Nothing is
+ * done if @amount is 0, @type is invalid or @cg is NULL.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+		      unsigned long amount)
+{
+	struct misc_cg *i;
+
+	if (!(amount && valid_type(type) && cg))
+		return;
+
+	for (i = cg; i; i = parent_misc(i))
+		misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+	int i;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+	unsigned long max;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		/* Resources with zero capacity are not listed. */
+		if (READ_ONCE(misc_res_capacity[i])) {
+			max = READ_ONCE(cg->res[i].max);
+			if (max == MAX_NUM)
+				seq_printf(sf, "%s max\n", misc_res_name[i]);
+			else
+				seq_printf(sf, "%s %lu\n", misc_res_name[i],
+					   max);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ *	 integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid, the resource name is unknown or the
+ *	       resource has 0 capacity.
+ * * -ERANGE - If number is bigger than the unsigned long capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	struct misc_cg *cg;
+	unsigned long max;
+	int ret = 0, i;
+	/* MISC_CG_RES_TYPES doubles as the "no resource matched" marker. */
+	enum misc_res_type type = MISC_CG_RES_TYPES;
+	char *token;
+
+	/* Split "<name> <value>": token is the name, buf is left at the value. */
+	buf = strstrip(buf);
+	token = strsep(&buf, " ");
+
+	if (!token || !buf)
+		return -EINVAL;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		if (!strcmp(misc_res_name[i], token)) {
+			type = i;
+			break;
+		}
+	}
+
+	if (type == MISC_CG_RES_TYPES)
+		return -EINVAL;
+
+	if (!strcmp(MAX_STR, buf)) {
+		max = MAX_NUM;
+	} else {
+		ret = kstrtoul(buf, 0, &max);
+		if (ret)
+			return ret;
+	}
+
+	cg = css_misc(of_css(of));
+
+	/* A limit can only be set for a resource that has a capacity. */
+	if (READ_ONCE(misc_res_capacity[type]))
+		WRITE_ONCE(cg->res[type].max, max);
+	else
+		ret = -EINVAL;
+
+	return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+	int i;
+	unsigned long usage;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		usage = atomic_long_read(&cg->res[i].usage);
+		/* Listed when the resource has a capacity or is still in use. */
+		if (READ_ONCE(misc_res_capacity[i]) || usage)
+			seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+	}
+
+	return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory. Resources whose capacity is 0
+ * are not listed.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+	int i;
+	unsigned long cap;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		cap = READ_ONCE(misc_res_capacity[i]);
+		if (cap)
+			seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+	}
+
+	return 0;
+}
+
+/*
+ * Misc cgroup interface files: "max" and "current" are flagged
+ * CFTYPE_NOT_ON_ROOT, "capacity" is flagged CFTYPE_ONLY_ON_ROOT.
+ */
+static struct cftype misc_cg_files[] = {
+	{
+		.name = "max",
+		.write = misc_cg_max_write,
+		.seq_show = misc_cg_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = misc_cg_current_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "capacity",
+		.seq_show = misc_cg_capacity_show,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+	{}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */ +static struct cgroup_subsys_state * +misc_cg_alloc(struct cgroup_subsys_state *parent_css) +{ + enum misc_res_type i; + struct misc_cg *cg; + + if (!parent_css) { + cg = &root_cg; + } else { + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + WRITE_ONCE(cg->res[i].max, MAX_NUM); + atomic_long_set(&cg->res[i].usage, 0); + } + + return &cg->css; +} + +/** + * misc_cg_free() - Free the misc cgroup. + * @css: cgroup subsys object. + * + * Context: Any context. + */ +static void misc_cg_free(struct cgroup_subsys_state *css) +{ + kfree(css_misc(css)); +} + +/* Cgroup controller callbacks */ +struct cgroup_subsys misc_cgrp_subsys = { + .css_alloc = misc_cg_alloc, + .css_free = misc_cg_free, + .legacy_cftypes = misc_cg_files, + .dfl_cftypes = misc_cg_files, +}; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index e149a0ac9e9e..8372897402f4 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -321,7 +321,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) /* * Copy the binary array pointed to by buf into mem. Fix $, #, and * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. - * The input buf is overwitten with the result to write to mem. + * The input buf is overwritten with the result to write to mem. 
*/ static int kgdb_ebin2mem(char *buf, char *mem, int count) { @@ -952,7 +952,7 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks) } /* - * This function performs all gdbserial command procesing + * This function performs all gdbserial command processing */ int gdb_serial_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index ec4940146612..2168f8dacb99 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -522,6 +522,54 @@ static int kdb_ss(int argc, const char **argv) return KDB_CMD_SS; } +static kdbtab_t bptab[] = { + { .cmd_name = "bp", + .cmd_func = kdb_bp, + .cmd_usage = "[<vaddr>]", + .cmd_help = "Set/Display breakpoints", + .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "bl", + .cmd_func = kdb_bp, + .cmd_usage = "[<vaddr>]", + .cmd_help = "Display breakpoints", + .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "bc", + .cmd_func = kdb_bc, + .cmd_usage = "<bpnum>", + .cmd_help = "Clear Breakpoint", + .cmd_flags = KDB_ENABLE_FLOW_CTRL, + }, + { .cmd_name = "be", + .cmd_func = kdb_bc, + .cmd_usage = "<bpnum>", + .cmd_help = "Enable Breakpoint", + .cmd_flags = KDB_ENABLE_FLOW_CTRL, + }, + { .cmd_name = "bd", + .cmd_func = kdb_bc, + .cmd_usage = "<bpnum>", + .cmd_help = "Disable Breakpoint", + .cmd_flags = KDB_ENABLE_FLOW_CTRL, + }, + { .cmd_name = "ss", + .cmd_func = kdb_ss, + .cmd_usage = "", + .cmd_help = "Single Step", + .cmd_minlen = 1, + .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + }, +}; + +static kdbtab_t bphcmd = { + .cmd_name = "bph", + .cmd_func = kdb_bp, + .cmd_usage = "[<vaddr>]", + .cmd_help = "[datar [length]|dataw [length]] Set hw brk", + .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, +}; + /* Initialize the breakpoint table and register breakpoint commands. 
*/ void __init kdb_initbptab(void) @@ -537,30 +585,7 @@ void __init kdb_initbptab(void) for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) bp->bp_free = 1; - kdb_register_flags("bp", kdb_bp, "[<vaddr>]", - "Set/Display breakpoints", 0, - KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); - kdb_register_flags("bl", kdb_bp, "[<vaddr>]", - "Display breakpoints", 0, - KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_table(bptab, ARRAY_SIZE(bptab)); if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_flags("bph", kdb_bp, "[<vaddr>]", - "[datar [length]|dataw [length]] Set hw brk", 0, - KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); - kdb_register_flags("bc", kdb_bc, "<bpnum>", - "Clear Breakpoint", 0, - KDB_ENABLE_FLOW_CTRL); - kdb_register_flags("be", kdb_bc, "<bpnum>", - "Enable Breakpoint", 0, - KDB_ENABLE_FLOW_CTRL); - kdb_register_flags("bd", kdb_bc, "<bpnum>", - "Disable Breakpoint", 0, - KDB_ENABLE_FLOW_CTRL); - - kdb_register_flags("ss", kdb_ss, "", - "Single Step", 1, - KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); - /* - * Architecture dependent initialization. - */ + kdb_register_table(&bphcmd, 1); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 930ac1b25ec7..1baa96a2ecb8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -33,6 +33,7 @@ #include <linux/kallsyms.h> #include <linux/kgdb.h> #include <linux/kdb.h> +#include <linux/list.h> #include <linux/notifier.h> #include <linux/interrupt.h> #include <linux/delay.h> @@ -84,15 +85,8 @@ static unsigned int kdb_continue_catastrophic = static unsigned int kdb_continue_catastrophic; #endif -/* kdb_commands describes the available commands. 
*/ -static kdbtab_t *kdb_commands; -#define KDB_BASE_CMD_MAX 50 -static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; -#define for_each_kdbcmd(cmd, num) \ - for ((cmd) = kdb_base_commands, (num) = 0; \ - num < kdb_max_commands; \ - num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) +/* kdb_cmds_head describes the available commands. */ +static LIST_HEAD(kdb_cmds_head); typedef struct _kdbmsg { int km_diag; /* kdb diagnostic */ @@ -146,42 +140,18 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); * KDB_ENVBUFSIZE if required). */ -static char *__env[] = { +static char *__env[31] = { #if defined(CONFIG_SMP) - "PROMPT=[%d]kdb> ", + "PROMPT=[%d]kdb> ", #else - "PROMPT=kdb> ", + "PROMPT=kdb> ", #endif - "MOREPROMPT=more> ", - "RADIX=16", - "MDCOUNT=8", /* lines of md output */ - KDB_PLATFORM_ENV, - "DTABCOUNT=30", - "NOSECT=1", - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, + "MOREPROMPT=more> ", + "RADIX=16", + "MDCOUNT=8", /* lines of md output */ + KDB_PLATFORM_ENV, + "DTABCOUNT=30", + "NOSECT=1", }; static const int __nenv = ARRAY_SIZE(__env); @@ -324,6 +294,63 @@ int kdbgetintenv(const char *match, int *value) } /* + * kdb_setenv() - Alter an existing environment variable or create a new one. + * @var: Name of the variable + * @val: Value of the variable + * + * Return: Zero on success, a kdb diagnostic on failure. 
+ */ +static int kdb_setenv(const char *var, const char *val) +{ + int i; + char *ep; + size_t varlen, vallen; + + varlen = strlen(var); + vallen = strlen(val); + ep = kdballocenv(varlen + vallen + 2); + if (ep == (char *)0) + return KDB_ENVBUFFULL; + + sprintf(ep, "%s=%s", var, val); + + for (i = 0; i < __nenv; i++) { + if (__env[i] + && ((strncmp(__env[i], var, varlen) == 0) + && ((__env[i][varlen] == '\0') + || (__env[i][varlen] == '=')))) { + __env[i] = ep; + return 0; + } + } + + /* + * Wasn't existing variable. Fit into slot. + */ + for (i = 0; i < __nenv-1; i++) { + if (__env[i] == (char *)0) { + __env[i] = ep; + return 0; + } + } + + return KDB_ENVFULL; +} + +/* + * kdb_printenv() - Display the current environment variables. + */ +static void kdb_printenv(void) +{ + int i; + + for (i = 0; i < __nenv; i++) { + if (__env[i]) + kdb_printf("%s\n", __env[i]); + } +} + +/* * kdbgetularg - This function will convert a numeric string into an * unsigned long value. * Parameters: @@ -380,10 +407,6 @@ int kdbgetu64arg(const char *arg, u64 *value) */ int kdb_set(int argc, const char **argv) { - int i; - char *ep; - size_t varlen, vallen; - /* * we can be invoked two ways: * set var=value argv[1]="var", argv[2]="value" @@ -428,37 +451,7 @@ int kdb_set(int argc, const char **argv) * Tokenizer squashed the '=' sign. argv[1] is variable * name, argv[2] = value. */ - varlen = strlen(argv[1]); - vallen = strlen(argv[2]); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; - - sprintf(ep, "%s=%s", argv[1], argv[2]); - - ep[varlen+vallen+1] = '\0'; - - for (i = 0; i < __nenv; i++) { - if (__env[i] - && ((strncmp(__env[i], argv[1], varlen) == 0) - && ((__env[i][varlen] == '\0') - || (__env[i][varlen] == '=')))) { - __env[i] = ep; - return 0; - } - } - - /* - * Wasn't existing variable. Fit into slot. 
- */ - for (i = 0; i < __nenv-1; i++) { - if (__env[i] == (char *)0) { - __env[i] = ep; - return 0; - } - } - - return KDB_ENVFULL; + return kdb_setenv(argv[1], argv[2]); } static int kdb_check_regs(void) @@ -921,7 +914,7 @@ int kdb_parse(const char *cmdstr) char *cp; char *cpp, quoted; kdbtab_t *tp; - int i, escaped, ignore_errors = 0, check_grep = 0; + int escaped, ignore_errors = 0, check_grep = 0; /* * First tokenize the command string. @@ -1011,25 +1004,17 @@ int kdb_parse(const char *cmdstr) ++argv[0]; } - for_each_kdbcmd(tp, i) { - if (tp->cmd_name) { - /* - * If this command is allowed to be abbreviated, - * check to see if this is it. - */ - - if (tp->cmd_minlen - && (strlen(argv[0]) <= tp->cmd_minlen)) { - if (strncmp(argv[0], - tp->cmd_name, - tp->cmd_minlen) == 0) { - break; - } - } + list_for_each_entry(tp, &kdb_cmds_head, list_node) { + /* + * If this command is allowed to be abbreviated, + * check to see if this is it. + */ + if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) && + (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0)) + break; - if (strcmp(argv[0], tp->cmd_name) == 0) - break; - } + if (strcmp(argv[0], tp->cmd_name) == 0) + break; } /* @@ -1037,19 +1022,15 @@ int kdb_parse(const char *cmdstr) * few characters of this match any of the known commands. * e.g., md1c20 should match md. 
*/ - if (i == kdb_max_commands) { - for_each_kdbcmd(tp, i) { - if (tp->cmd_name) { - if (strncmp(argv[0], - tp->cmd_name, - strlen(tp->cmd_name)) == 0) { - break; - } - } + if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) { + list_for_each_entry(tp, &kdb_cmds_head, list_node) { + if (strncmp(argv[0], tp->cmd_name, + strlen(tp->cmd_name)) == 0) + break; } } - if (i < kdb_max_commands) { + if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) { int result; if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) @@ -2073,12 +2054,7 @@ static int kdb_lsmod(int argc, const char **argv) static int kdb_env(int argc, const char **argv) { - int i; - - for (i = 0; i < __nenv; i++) { - if (__env[i]) - kdb_printf("%s\n", __env[i]); - } + kdb_printenv(); if (KDB_DEBUG(MASK)) kdb_printf("KDBDEBUG=0x%x\n", @@ -2101,7 +2077,7 @@ static int kdb_dmesg(int argc, const char **argv) int adjust = 0; int n = 0; int skip = 0; - struct kmsg_dumper dumper = { .active = 1 }; + struct kmsg_dump_iter iter; size_t len; char buf[201]; @@ -2126,8 +2102,8 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - kmsg_dump_rewind_nolock(&dumper); - while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) n++; if (lines < 0) { @@ -2159,8 +2135,8 @@ static int kdb_dmesg(int argc, const char **argv) if (skip >= n || skip < 0) return 0; - kmsg_dump_rewind_nolock(&dumper); - while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { if (skip) { skip--; continue; @@ -2428,17 +2404,14 @@ static int kdb_kgdb(int argc, const char **argv) static int kdb_help(int argc, const char **argv) { kdbtab_t *kt; - int i; kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); kdb_printf("-----------------------------" "-----------------------------\n"); - for_each_kdbcmd(kt, i) 
{ + list_for_each_entry(kt, &kdb_cmds_head, list_node) { char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; - if (!kt->cmd_name) - continue; if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) continue; if (strlen(kt->cmd_usage) > 20) @@ -2659,7 +2632,6 @@ static int kdb_grep_help(int argc, const char **argv) * Returns: * zero for success, one if a duplicate command. */ -#define kdb_command_extend 50 /* arbitrary */ int kdb_register_flags(char *cmd, kdb_func_t func, char *usage, @@ -2667,49 +2639,20 @@ int kdb_register_flags(char *cmd, short minlen, kdb_cmdflags_t flags) { - int i; kdbtab_t *kp; - /* - * Brute force method to determine duplicates - */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (strcmp(kp->cmd_name, cmd) == 0) { kdb_printf("Duplicate kdb command registered: " "%s, func %px help %s\n", cmd, func, help); return 1; } } - /* - * Insert command into first available location in table - */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name == NULL) - break; - } - - if (i >= kdb_max_commands) { - kdbtab_t *new = kmalloc_array(kdb_max_commands - - KDB_BASE_CMD_MAX + - kdb_command_extend, - sizeof(*new), - GFP_KDB); - if (!new) { - kdb_printf("Could not allocate new kdb_command " - "table\n"); - return 1; - } - if (kdb_commands) { - memcpy(new, kdb_commands, - (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); - kfree(kdb_commands); - } - memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0, - kdb_command_extend * sizeof(*new)); - kdb_commands = new; - kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; - kdb_max_commands += kdb_command_extend; + kp = kmalloc(sizeof(*kp), GFP_KDB); + if (!kp) { + kdb_printf("Could not allocate new kdb_command table\n"); + return 1; } kp->cmd_name = cmd; @@ -2718,11 +2661,27 @@ int kdb_register_flags(char *cmd, kp->cmd_help = help; kp->cmd_minlen = minlen; kp->cmd_flags = flags; + kp->is_dynamic = true; + + 
list_add_tail(&kp->list_node, &kdb_cmds_head); return 0; } EXPORT_SYMBOL_GPL(kdb_register_flags); +/* + * kdb_register_table() - This function is used to register a kdb command + * table. + * @kp: pointer to kdb command table + * @len: length of kdb command table + */ +void kdb_register_table(kdbtab_t *kp, size_t len) +{ + while (len--) { + list_add_tail(&kp->list_node, &kdb_cmds_head); + kp++; + } +} /* * kdb_register - Compatibility register function for commands that do @@ -2757,15 +2716,16 @@ EXPORT_SYMBOL_GPL(kdb_register); */ int kdb_unregister(char *cmd) { - int i; kdbtab_t *kp; /* * find the command. */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { - kp->cmd_name = NULL; + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (strcmp(kp->cmd_name, cmd) == 0) { + list_del(&kp->list_node); + if (kp->is_dynamic) + kfree(kp); return 0; } } @@ -2775,118 +2735,222 @@ int kdb_unregister(char *cmd) } EXPORT_SYMBOL_GPL(kdb_unregister); -/* Initialize the kdb command table. */ -static void __init kdb_inittab(void) -{ - int i; - kdbtab_t *kp; - - for_each_kdbcmd(kp, i) - kp->cmd_name = NULL; - - kdb_register_flags("md", kdb_md, "<vaddr>", - "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, - KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); - kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", - "Display Raw Memory", 0, - KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); - kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", - "Display Physical Memory", 0, - KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); - kdb_register_flags("mds", kdb_md, "<vaddr>", - "Display Memory Symbolically", 0, - KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); - kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", - "Modify Memory Contents", 0, - KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); - kdb_register_flags("go", kdb_go, "[<vaddr>]", - "Continue Execution", 1, - KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); - kdb_register_flags("rd", kdb_rd, "", - "Display Registers", 0, - KDB_ENABLE_REG_READ); - kdb_register_flags("rm", kdb_rm, "<reg> <contents>", - "Modify Registers", 0, - KDB_ENABLE_REG_WRITE); - kdb_register_flags("ef", kdb_ef, "<vaddr>", - "Display exception frame", 0, - KDB_ENABLE_MEM_READ); - kdb_register_flags("bt", kdb_bt, "[<vaddr>]", - "Stack traceback", 1, - KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); - kdb_register_flags("btp", kdb_bt, "<pid>", - "Display stack for process <pid>", 0, - KDB_ENABLE_INSPECT); - kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", - "Backtrace all processes matching state flag", 0, - KDB_ENABLE_INSPECT); - kdb_register_flags("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, - KDB_ENABLE_INSPECT); - kdb_register_flags("btt", kdb_bt, "<vaddr>", - "Backtrace process given its struct task address", 0, - KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); - kdb_register_flags("env", kdb_env, "", - "Show environment variables", 0, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("set", kdb_set, "", - "Set environment variables", 0, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("help", kdb_help, "", - "Display Help Message", 1, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("?", kdb_help, "", - "Display 
Help Message", 0, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("cpu", kdb_cpu, "<cpunum>", - "Switch to new cpu", 0, - KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); - kdb_register_flags("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, 0); - kdb_register_flags("ps", kdb_ps, "[<flags>|A]", - "Display active task list", 0, - KDB_ENABLE_INSPECT); - kdb_register_flags("pid", kdb_pid, "<pidnum>", - "Switch to another task", 0, - KDB_ENABLE_INSPECT); - kdb_register_flags("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, - KDB_ENABLE_REBOOT); +static kdbtab_t maintab[] = { + { .cmd_name = "md", + .cmd_func = kdb_md, + .cmd_usage = "<vaddr>", + .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1", + .cmd_minlen = 1, + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "mdr", + .cmd_func = kdb_md, + .cmd_usage = "<vaddr> <bytes>", + .cmd_help = "Display Raw Memory", + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "mdp", + .cmd_func = kdb_md, + .cmd_usage = "<paddr> <bytes>", + .cmd_help = "Display Physical Memory", + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "mds", + .cmd_func = kdb_md, + .cmd_usage = "<vaddr>", + .cmd_help = "Display Memory Symbolically", + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "mm", + .cmd_func = kdb_mm, + .cmd_usage = "<vaddr> <contents>", + .cmd_help = "Modify Memory Contents", + .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS, + }, + { .cmd_name = "go", + .cmd_func = kdb_go, + .cmd_usage = "[<vaddr>]", + .cmd_help = "Continue Execution", + .cmd_minlen = 1, + .cmd_flags = KDB_ENABLE_REG_WRITE | + KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, + }, + { .cmd_name = "rd", + .cmd_func = kdb_rd, + .cmd_usage = "", + .cmd_help = "Display Registers", + .cmd_flags = KDB_ENABLE_REG_READ, + }, + { .cmd_name = "rm", + .cmd_func = kdb_rm, + .cmd_usage = "<reg> <contents>", + .cmd_help = "Modify Registers", + .cmd_flags = 
KDB_ENABLE_REG_WRITE, + }, + { .cmd_name = "ef", + .cmd_func = kdb_ef, + .cmd_usage = "<vaddr>", + .cmd_help = "Display exception frame", + .cmd_flags = KDB_ENABLE_MEM_READ, + }, + { .cmd_name = "bt", + .cmd_func = kdb_bt, + .cmd_usage = "[<vaddr>]", + .cmd_help = "Stack traceback", + .cmd_minlen = 1, + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + }, + { .cmd_name = "btp", + .cmd_func = kdb_bt, + .cmd_usage = "<pid>", + .cmd_help = "Display stack for process <pid>", + .cmd_flags = KDB_ENABLE_INSPECT, + }, + { .cmd_name = "bta", + .cmd_func = kdb_bt, + .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]", + .cmd_help = "Backtrace all processes matching state flag", + .cmd_flags = KDB_ENABLE_INSPECT, + }, + { .cmd_name = "btc", + .cmd_func = kdb_bt, + .cmd_usage = "", + .cmd_help = "Backtrace current process on each cpu", + .cmd_flags = KDB_ENABLE_INSPECT, + }, + { .cmd_name = "btt", + .cmd_func = kdb_bt, + .cmd_usage = "<vaddr>", + .cmd_help = "Backtrace process given its struct task address", + .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + }, + { .cmd_name = "env", + .cmd_func = kdb_env, + .cmd_usage = "", + .cmd_help = "Show environment variables", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "set", + .cmd_func = kdb_set, + .cmd_usage = "", + .cmd_help = "Set environment variables", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "help", + .cmd_func = kdb_help, + .cmd_usage = "", + .cmd_help = "Display Help Message", + .cmd_minlen = 1, + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "?", + .cmd_func = kdb_help, + .cmd_usage = "", + .cmd_help = "Display Help Message", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "cpu", + .cmd_func = kdb_cpu, + .cmd_usage = "<cpunum>", + .cmd_help = "Switch to new cpu", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, + }, + { .cmd_name = "kgdb", + .cmd_func = kdb_kgdb, + .cmd_usage = "", + .cmd_help = "Enter kgdb mode", + .cmd_flags = 0, + }, + { .cmd_name 
= "ps", + .cmd_func = kdb_ps, + .cmd_usage = "[<flags>|A]", + .cmd_help = "Display active task list", + .cmd_flags = KDB_ENABLE_INSPECT, + }, + { .cmd_name = "pid", + .cmd_func = kdb_pid, + .cmd_usage = "<pidnum>", + .cmd_help = "Switch to another task", + .cmd_flags = KDB_ENABLE_INSPECT, + }, + { .cmd_name = "reboot", + .cmd_func = kdb_reboot, + .cmd_usage = "", + .cmd_help = "Reboot the machine immediately", + .cmd_flags = KDB_ENABLE_REBOOT, + }, #if defined(CONFIG_MODULES) - kdb_register_flags("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, - KDB_ENABLE_INSPECT); + { .cmd_name = "lsmod", + .cmd_func = kdb_lsmod, + .cmd_usage = "", + .cmd_help = "List loaded kernel modules", + .cmd_flags = KDB_ENABLE_INSPECT, + }, #endif #if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_flags("sr", kdb_sr, "<key>", - "Magic SysRq key", 0, - KDB_ENABLE_ALWAYS_SAFE); + { .cmd_name = "sr", + .cmd_func = kdb_sr, + .cmd_usage = "<key>", + .cmd_help = "Magic SysRq key", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, #endif #if defined(CONFIG_PRINTK) - kdb_register_flags("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, - KDB_ENABLE_ALWAYS_SAFE); + { .cmd_name = "dmesg", + .cmd_func = kdb_dmesg, + .cmd_usage = "[lines]", + .cmd_help = "Display syslog buffer", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, #endif - if (arch_kgdb_ops.enable_nmi) { - kdb_register_flags("disable_nmi", kdb_disable_nmi, "", - "Disable NMI entry to KDB", 0, - KDB_ENABLE_ALWAYS_SAFE); - } - kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", - "Send a signal to a process", 0, - KDB_ENABLE_SIGNAL); - kdb_register_flags("summary", kdb_summary, "", - "Summarize the system", 4, - KDB_ENABLE_ALWAYS_SAFE); - kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", - "Display per_cpu variables", 3, - KDB_ENABLE_MEM_READ); - 
kdb_register_flags("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, - KDB_ENABLE_ALWAYS_SAFE); + { .cmd_name = "defcmd", + .cmd_func = kdb_defcmd, + .cmd_usage = "name \"usage\" \"help\"", + .cmd_help = "Define a set of commands, down to endefcmd", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "kill", + .cmd_func = kdb_kill, + .cmd_usage = "<-signal> <pid>", + .cmd_help = "Send a signal to a process", + .cmd_flags = KDB_ENABLE_SIGNAL, + }, + { .cmd_name = "summary", + .cmd_func = kdb_summary, + .cmd_usage = "", + .cmd_help = "Summarize the system", + .cmd_minlen = 4, + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, + { .cmd_name = "per_cpu", + .cmd_func = kdb_per_cpu, + .cmd_usage = "<sym> [<bytes>] [<cpu>]", + .cmd_help = "Display per_cpu variables", + .cmd_minlen = 3, + .cmd_flags = KDB_ENABLE_MEM_READ, + }, + { .cmd_name = "grephelp", + .cmd_func = kdb_grep_help, + .cmd_usage = "", + .cmd_help = "Display help on | grep", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + }, +}; + +static kdbtab_t nmicmd = { + .cmd_name = "disable_nmi", + .cmd_func = kdb_disable_nmi, + .cmd_usage = "", + .cmd_help = "Disable NMI entry to KDB", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, +}; + +/* Initialize the kdb command table. */ +static void __init kdb_inittab(void) +{ + kdb_register_table(maintab, ARRAY_SIZE(maintab)); + if (arch_kgdb_ops.enable_nmi) + kdb_register_table(&nmicmd, 1); } /* Execute any commands defined in kdb_cmds. 
*/ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 6cb92f7bbbd0..ccbed9089808 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -174,8 +174,11 @@ typedef struct _kdbtab { short cmd_minlen; /* Minimum legal # command * chars required */ kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + struct list_head list_node; /* Command list */ + bool is_dynamic; /* Command table allocation type */ } kdbtab_t; +extern void kdb_register_table(kdbtab_t *kp, size_t len); extern int kdb_bt(int, const char **); /* KDB display back trace */ /* KDB breakpoint management functions */ @@ -207,9 +210,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask); extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); -extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig(struct task_struct *p, int sig); -extern void kdb_meminfo_proc_show(void); extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index f7c1885abeb6..91bb666d7c03 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -654,24 +654,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) return (mask & kdb_task_state_string(state)) != 0; } -/* - * kdb_print_nameval - Print a name and its value, converting the - * value to a symbol lookup if possible. 
- * Inputs: - * name field name to print - * val value of field - */ -void kdb_print_nameval(const char *name, unsigned long val) -{ - kdb_symtab_t symtab; - kdb_printf(" %-11.11s ", name); - if (kdbnearsym(val, &symtab)) - kdb_symbol_print(val, &symtab, - KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); - else - kdb_printf("0x%lx\n", val); -} - /* Last ditch allocator for debugging, so we can still debug even when * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned * for space usage, not for speed. One smallish memory pool, the free diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8442e5c9cfa2..a0b3b04fb596 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -341,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interupt and eventually claim + * assume that it is the first interrupt and eventually claim * quiescent state and end grace periods prematurely. 
* * Unconditionally invoke rcu_irq_enter() so RCU state stays @@ -422,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMT_DYNAMIC +#ifdef CONFIG_PREEMPT_DYNAMIC static_call(irqentry_exit_cond_resched)(); #else irqentry_exit_cond_resched(); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index 8743150db2ac..c466c7fbdece 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -70,7 +70,9 @@ struct gcov_fn_info { u32 ident; u32 checksum; +#if CONFIG_CLANG_VERSION < 110000 u8 use_extra_checksum; +#endif u32 cfg_checksum; u32 num_counters; @@ -145,10 +147,8 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name, list_add_tail(&info->head, ¤t_info->functions); } -EXPORT_SYMBOL(llvm_gcda_emit_function); #else -void llvm_gcda_emit_function(u32 ident, u32 func_checksum, - u8 use_extra_checksum, u32 cfg_checksum) +void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) { struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -158,12 +158,11 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, INIT_LIST_HEAD(&info->head); info->ident = ident; info->checksum = func_checksum; - info->use_extra_checksum = use_extra_checksum; info->cfg_checksum = cfg_checksum; list_add_tail(&info->head, ¤t_info->functions); } -EXPORT_SYMBOL(llvm_gcda_emit_function); #endif +EXPORT_SYMBOL(llvm_gcda_emit_function); void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) { @@ -293,11 +292,16 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) !list_is_last(&fn_ptr2->head, &info2->functions)) { if (fn_ptr1->checksum != fn_ptr2->checksum) return false; +#if CONFIG_CLANG_VERSION < 110000 if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) return false; if (fn_ptr1->use_extra_checksum && fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) return false; +#else + if 
(fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; +#endif fn_ptr1 = list_next_entry(fn_ptr1, head); fn_ptr2 = list_next_entry(fn_ptr2, head); } @@ -529,17 +533,22 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) list_for_each_entry(fi_ptr, &info->functions, head) { u32 i; - u32 len = 2; - - if (fi_ptr->use_extra_checksum) - len++; pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); - pos += store_gcov_u32(buffer, pos, len); +#if CONFIG_CLANG_VERSION < 110000 + pos += store_gcov_u32(buffer, pos, + fi_ptr->use_extra_checksum ? 3 : 2); +#else + pos += store_gcov_u32(buffer, pos, 3); +#endif pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); +#if CONFIG_CLANG_VERSION < 110000 if (fi_ptr->use_extra_checksum) pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); +#else + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); +#endif pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6d89e33fe3aa..8cc8e5713287 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * - * Interrupt occures on the falling and/or rising edge of a hardware + * Interrupt occurs on the falling and/or rising edge of a hardware * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one @@ -808,7 +808,7 @@ void handle_edge_irq(struct irq_desc *desc) /* * When another irq arrived while we were handling * one, we could have masked the irq. - * Renable it, if it was not disabled in meantime. + * Reenable it, if it was not disabled in meantime. 
*/ if (unlikely(desc->istate & IRQS_PENDING)) { if (!irqd_irq_disabled(&desc->irq_data) && @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); * @dest: The affinity mask to set * @force: Flag to enforce setting (disable online checks) * - * Conditinal, as the underlying parent chip might not implement it. + * Conditional, as the underlying parent chip might not implement it. */ int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force) @@ -1531,7 +1531,7 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** - * irq_chip_compose_msi_msg - Componse msi message for a irq chip + * irq_chip_compose_msi_msg - Compose msi message for a irq chip * @data: Pointer to interrupt specific data * @msg: Pointer to the MSI message * diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 0b0cdf206dc4..7fe6cffe7d0d 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -13,7 +13,7 @@ /* * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. + * Each architecture has to answer this themselves. */ static void ack_bad(struct irq_data *data) { diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 43e3d1be622c..52f11c791bf8 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -107,7 +107,7 @@ free_descs: * @irq: linux irq number to be destroyed * @dest: cpumask of cpus which should have the IPI removed * - * The IPIs allocated with irq_reserve_ipi() are retuerned to the system + * The IPIs allocated with irq_reserve_ipi() are returned to the system * destroying all virqs associated with them. * * Return 0 on success or error code on failure. 
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 40880c350b95..0cd02efa3a74 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -24,10 +24,6 @@ struct irq_sim_irq_ctx { struct irq_sim_work_ctx *work_ctx; }; -struct irq_sim_devres { - struct irq_domain *domain; -}; - static void irq_sim_irqmask(struct irq_data *data) { struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); @@ -216,11 +212,11 @@ void irq_domain_remove_sim(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_domain_release_sim(struct device *dev, void *res) +static void devm_irq_domain_remove_sim(void *data) { - struct irq_sim_devres *this = res; + struct irq_domain *domain = data; - irq_domain_remove_sim(this->domain); + irq_domain_remove_sim(domain); } /** @@ -238,20 +234,17 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev, struct fwnode_handle *fwnode, unsigned int num_irqs) { - struct irq_sim_devres *dr; + struct irq_domain *domain; + int ret; - dr = devres_alloc(devm_irq_domain_release_sim, - sizeof(*dr), GFP_KERNEL); - if (!dr) - return ERR_PTR(-ENOMEM); + domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(domain)) + return domain; - dr->domain = irq_domain_create_sim(fwnode, num_irqs); - if (IS_ERR(dr->domain)) { - devres_free(dr); - return dr->domain; - } + ret = devm_add_action_or_reset(dev, devm_irq_domain_remove_sim, domain); + if (ret) + return ERR_PTR(ret); - devres_add(dev, dr); - return dr->domain; + return domain; } EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index cc1a09406c6e..4a617d7312a4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -31,7 +31,7 @@ static int __init irq_affinity_setup(char *str) cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. 
We don't want to end up with - * bugreports caused by random comandline masks + * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d10ab1d689d5..f42ef868efd3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * @name: Optional user provided domain name * @pa: Optional user-provided physical address * - * Allocate a struct irqchip_fwid, and return a poiner to the embedded + * Allocate a struct irqchip_fwid, and return a pointer to the embedded * fwnode_handle (or NULL on failure). * * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are @@ -665,7 +665,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default domain if nececssary */ + /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { @@ -703,41 +703,6 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); -/** - * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs - * @domain: domain owning the interrupt range - * @irq_base: beginning of linux IRQ range - * @hwirq_base: beginning of hardware IRQ range - * @count: Number of interrupts to map - * - * This routine is used for allocating and mapping a range of hardware - * irqs to linux irqs where the linux irq numbers are at pre-defined - * locations. For use by controllers that already have static mappings - * to insert in to the domain. - * - * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time - * domain insertion. - * - * 0 is returned upon success, while any failure to establish a static - * mapping is treated as an error. 
- */ -int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, - irq_hw_number_t hwirq_base, int count) -{ - struct device_node *of_node; - int ret; - - of_node = irq_domain_get_of_node(domain); - ret = irq_alloc_descs(irq_base, irq_base, count, - of_node_to_nid(of_node)); - if (unlikely(ret < 0)) - return ret; - - irq_domain_associate_many(domain, irq_base, hwirq_base, count); - return 0; -} -EXPORT_SYMBOL_GPL(irq_create_strict_mappings); - static int irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type) @@ -906,7 +871,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, { struct irq_data *data; - /* Look for default domain if nececssary */ + /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) @@ -1436,7 +1401,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, * The whole process to setup an IRQ has been split into two steps. * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ * descriptor and required hardware resources. The second step, - * irq_domain_activate_irq(), is to program hardwares with preallocated + * irq_domain_activate_irq(), is to program the hardware with preallocated * resources. In this way, it's easier to rollback when failing to * allocate resources. */ @@ -1694,12 +1659,10 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) /** * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain + * @domain: Domain below which interrupts must be allocated * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to allocate * @arg: Allocation data (arch/domain specific) - * - * Check whether the domain has been setup recursive. If not allocate - * through the parent domain. 
*/ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, @@ -1715,11 +1678,9 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain + * @domain: Domain below which interrupts must be freed * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to free - * - * Check whether the domain has been setup recursive. If not free - * through the parent domain. */ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 21ea370fccda..4c14356543d9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -179,7 +179,7 @@ bool irq_can_set_affinity_usr(unsigned int irq) /** * irq_set_thread_affinity - Notify irq threads to adjust affinity - * @desc: irq descriptor which has affitnity changed + * @desc: irq descriptor which has affinity changed * * We just set IRQTF_AFFINITY and delegate the affinity setting * to the interrupt thread itself. We can not call @@ -326,7 +326,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a - * useable state so startup works. + * usable state so startup works. */ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) @@ -1054,7 +1054,7 @@ again: * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers - * versus "desc->threads_onehsot |= action->thread_mask;" in + * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. 
*/ @@ -1157,7 +1157,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) /* * Interrupts explicitly requested as threaded interrupts want to be - * preemtible - many of them need to sleep and wait for slow busses to + * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, @@ -1697,7 +1697,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } - if (irq_settings_can_autoenable(desc)) { + if (!(new->flags & IRQF_NO_AUTOEN) && + irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* @@ -1912,7 +1913,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* Last action releases resources */ if (!desc->action) { /* - * Reaquire bus lock as irq_release_resources() might + * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); @@ -2090,10 +2091,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * which interrupt is which (messes up the interrupt freeing * logic etc). * + * Also shared interrupts do not go well with disabling auto enable. + * The sharing interrupt might request it while it's still disabled + * and then wait for interrupts forever. + * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. 
*/ if (((irqflags & IRQF_SHARED) && !dev_id) || + ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; @@ -2249,7 +2255,8 @@ int request_nmi(unsigned int irq, irq_handler_t handler, desc = irq_to_desc(irq); - if (!desc || irq_settings_can_autoenable(desc) || + if (!desc || (irq_settings_can_autoenable(desc) && + !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) @@ -2746,7 +2753,7 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about - * @state: a pointer to a boolean where the state is to be storeed + * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an * interrupt, returning into @state the bit corresponding to diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 651a4ad6d711..578596e41cb6 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -337,15 +337,14 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) * irq_matrix_reserve - Reserve interrupts * @m: Matrix pointer * - * This is merily a book keeping call. It increments the number of globally + * This is merely a book keeping call. It increments the number of globally * reserved interrupt bits w/o actually allocating them. This allows to * setup interrupt descriptors w/o assigning low level resources to it. * The actual allocation happens when the interrupt gets activated. 
*/ void irq_matrix_reserve(struct irq_matrix *m) { - if (m->global_reserved <= m->global_available && - m->global_reserved + 1 > m->global_available) + if (m->global_reserved == m->global_available) pr_warn("Interrupt reservation exceeds available resources\n"); m->global_reserved++; @@ -356,7 +355,7 @@ void irq_matrix_reserve(struct irq_matrix *m) * irq_matrix_remove_reserved - Remove interrupt reservation * @m: Matrix pointer * - * This is merily a book keeping call. It decrements the number of globally + * This is merely a book keeping call. It decrements the number of globally * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the * interrupt was never in use and a real vector allocated, which undid the * reservation. @@ -423,7 +422,9 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; - clear_bit(bit, cm->alloc_map); + if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map))) + return; + cm->allocated--; if(managed) cm->managed_allocated--; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index def48589ea48..61ca924ef4b4 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,7 +7,7 @@ /** * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU - * @desc: Interrupt descpriptor to clean up + * @desc: Interrupt descriptor to clean up * @force_clear: If set clear the move pending bit unconditionally. * If not set, clear it only when the dying CPU is the * last one in the pending mask. diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b338d622f26e..c41965e348b5 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -5,7 +5,7 @@ * * This file is licensed under GPLv2. * - * This file contains common code to support Message Signalled Interrupt for + * This file contains common code to support Message Signaled Interrupts for * PCI compatible and non PCI compatible devices. 
*/ #include <linux/types.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 98138788cb04..7c5cd42df3b9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -144,7 +144,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; if (type) @@ -238,7 +238,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_var_t new_value; int err; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index bd1d85c610aa..0c46e9fe3a89 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -128,7 +128,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject) if (!try_retrigger(desc)) err = irq_sw_resend(desc); - /* If the retrigger was successfull, mark it with the REPLAY bit */ + /* If the retrigger was successful, mark it with the REPLAY bit */ if (!err) desc->istate |= IRQS_REPLAY; return err; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f865e5f4d382..c481d8458325 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -403,6 +403,10 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) desc->irqs_unhandled -= ok; } + if (likely(!desc->irqs_unhandled)) + return; + + /* Now getting into unhandled irq detection */ desc->irq_count++; if (likely(desc->irq_count < 100000)) return; diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 773b6105c4ae..d309d6fbf5bd 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -84,7 +84,7 @@ void irq_timings_disable(void) * 2. Log interval * * We saw the irq timings allow to compute the interval of the - * occurrences for a specific interrupt. 
We can reasonibly assume the + * occurrences for a specific interrupt. We can reasonably assume the * longer is the interval, the higher is the error for the next event * and we can consider storing those interval values into an array * where each slot in the array correspond to an interval at the power @@ -416,7 +416,7 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) * Copy the content of the circular buffer into another buffer * in order to linearize the buffer instead of dealing with * wrapping indexes and shifted array which will be prone to - * error and extremelly difficult to debug. + * error and extremely difficult to debug. */ for (i = 0; i < count; i++) { int index = (start + i) & IRQ_TIMINGS_MASK; @@ -485,7 +485,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) /* * The interrupt triggered more than one second apart, that - * ends the sequence as predictible for our purpose. In this + * ends the sequence as predictable for our purpose. In this * case, assume we have the beginning of a sequence and the * timestamp is the first value. As it is impossible to * predict anything at this point, return. @@ -514,7 +514,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) * If more than the array size interrupts happened during the * last busy/idle cycle, the index wrapped up and we have to * begin with the next element in the array which is the last one - * in the sequence, otherwise it is a the index 0. + * in the sequence, otherwise it is at the index 0. * * - have an indication of the interrupts activity on this CPU * (eg. 
irq/sec) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8043a90aa50e..c851ca0ed357 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -161,6 +161,27 @@ static unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } +#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN) +/* + * LLVM appends a hash to static function names when ThinLTO and CFI are + * both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b. + * This causes confusion and potentially breaks user space tools, so we + * strip the suffix from expanded symbol names. + */ +static inline bool cleanup_symbol_name(char *s) +{ + char *res; + + res = strrchr(s, '$'); + if (res) + *res = '\0'; + + return res != NULL; +} +#else +static inline bool cleanup_symbol_name(char *s) { return false; } +#endif + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { @@ -173,6 +194,9 @@ unsigned long kallsyms_lookup_name(const char *name) if (strcmp(namebuf, name) == 0) return kallsyms_sym_address(i); + + if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0) + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -303,7 +327,9 @@ const char *kallsyms_lookup(unsigned long addr, namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; - return namebuf; + + ret = namebuf; + goto found; } /* See if it's in a module or a BPF JITed image. 
*/ @@ -316,11 +342,16 @@ const char *kallsyms_lookup(unsigned long addr, if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); + +found: + cleanup_symbol_name(namebuf); return ret; } int lookup_symbol_name(unsigned long addr, char *symname) { + int res; + symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; @@ -331,15 +362,23 @@ int lookup_symbol_name(unsigned long addr, char *symname) /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); - return 0; + goto found; } /* See if it's in a module. */ - return lookup_module_symbol_name(addr, symname); + res = lookup_module_symbol_name(addr, symname); + if (res) + return res; + +found: + cleanup_symbol_name(symname); + return 0; } int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { + int res; + name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; @@ -351,10 +390,16 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, kallsyms_expand_symbol(get_symbol_offset(pos), name, KSYM_NAME_LEN); modname[0] = '\0'; - return 0; + goto found; } /* See if it's in a module. */ - return lookup_module_symbol_attrs(addr, size, offset, modname, name); + res = lookup_module_symbol_attrs(addr, size, offset, modname, name); + if (res) + return res; + +found: + cleanup_symbol_name(name); + return 0; } /* Look up a kernel symbol and return it in a text buffer. */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1578973c5740..a1972eba2917 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -963,7 +963,8 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); + WARN_ON_FUNCTION_MISMATCH(timer->function, + kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. 
This is for diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index f6310f848f34..3a4beb9395c4 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -9,6 +9,7 @@ #include <linux/cpu.h> #include <linux/stacktrace.h> +#include <linux/tracehook.h> #include "core.h" #include "patch.h" #include "transition.h" @@ -369,9 +370,7 @@ static void klp_send_signals(void) * Send fake signal to all non-kthread tasks which are * still not migrated. */ - spin_lock_irq(&task->sighand->siglock); - signal_wake_up(task, 0); - spin_unlock_irq(&task->sighand->siglock); + set_notify_signal(task); } } read_unlock(&tasklist_lock); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c6d0c1dc6253..ef28a0b9cf1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -705,7 +705,7 @@ static void print_lock_name(struct lock_class *class) printk(KERN_CONT " ("); __print_lock_name(class); - printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + printk(KERN_CONT "){%s}-{%d:%d}", usage, class->wait_type_outer ?: class->wait_type_inner, class->wait_type_inner); } @@ -930,7 +930,8 @@ static bool assign_lock_key(struct lockdep_map *lock) /* Debug-check: all keys must be persistent! 
*/ debug_locks_off(); pr_err("INFO: trying to register non-static key.\n"); - pr_err("the code is fine but needs lockdep annotation.\n"); + pr_err("The code is fine but needs lockdep annotation, or maybe\n"); + pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); return false; @@ -1392,7 +1393,7 @@ static int add_lock_to_list(struct lock_class *this, /* * For good efficiency of modular, we use power of 2 */ -#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS) #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index de49f9e1c11b..ecb8662e7a4e 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -99,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_STACK_TRACE_ENTRIES 262144UL #define STACK_TRACE_HASH_SIZE 8192 #else -#define MAX_LOCKDEP_ENTRIES 32768UL +#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 16 +#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. 
*/ -#define MAX_STACK_TRACE_ENTRIES 524288UL -#define STACK_TRACE_HASH_SIZE 16384 +#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS) +#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS) #endif /* diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 4786dd271b45..b94f3831e963 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { + int cnts; + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) /* When no more readers or writers, set the locked flag */ do { - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, - _QW_LOCKED) != _QW_WAITING); + cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); } diff --git a/kernel/module.c b/kernel/module.c index 30479355ab85..20fb004e7d8d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2146,6 +2146,8 @@ void __weak module_arch_freeing_init(struct module *mod) { } +static void cfi_cleanup(struct module *mod); + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -2187,6 +2189,9 @@ static void free_module(struct module *mod) synchronize_rcu(); mutex_unlock(&module_mutex); + /* Clean up CFI for the module. 
*/ + cfi_cleanup(mod); + /* This may be empty, but that's OK */ module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); @@ -3866,6 +3871,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } +static void cfi_init(struct module *mod); + /* * Allocate and load the module: note that size of section 0 is always * zero, and we rely on this for optional sections. @@ -3997,6 +4004,9 @@ static int load_module(struct load_info *info, const char __user *uargs, flush_module_icache(mod); + /* Setup CFI for the module. */ + cfi_init(mod); + /* Now copy in args */ mod->args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(mod->args)) { @@ -4070,6 +4080,7 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); kfree(mod->args); free_arch_cleanup: + cfi_cleanup(mod); module_arch_cleanup(mod); free_modinfo: free_modinfo(mod); @@ -4415,6 +4426,38 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, #endif /* CONFIG_LIVEPATCH */ #endif /* CONFIG_KALLSYMS */ +static void cfi_init(struct module *mod) +{ +#ifdef CONFIG_CFI_CLANG + initcall_t *init; + exitcall_t *exit; + + rcu_read_lock_sched(); + mod->cfi_check = (cfi_check_fn) + find_kallsyms_symbol_value(mod, "__cfi_check"); + init = (initcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); + exit = (exitcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); + rcu_read_unlock_sched(); + + /* Fix init/exit functions to point to the CFI jump table */ + if (init) + mod->init = *init; + if (exit) + mod->exit = *exit; + + cfi_module_add(mod, module_addr_min); +#endif +} + +static void cfi_cleanup(struct module *mod) +{ +#ifdef CONFIG_CFI_CLANG + cfi_module_remove(mod, module_addr_min); +#endif +} + /* Maximum number of characters written by module_flags() */ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 
9af5a50d3489..b29c8aca7486 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work) goto out; /* - * If the wakeup occured for an unknown reason, wait to prevent the + * If the wakeup occurred for an unknown reason, wait to prevent the * system from trying to suspend and waking up in a tight loop. */ if (final_count == initial_count) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d63560e1cf87..1a221dcb3c01 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -329,7 +329,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) /** * Data types related to memory bitmaps. * - * Memory bitmap is a structure consiting of many linked lists of + * Memory bitmap is a structure consisting of many linked lists of * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 72e33054a2e1..bea3cb8afa11 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -884,7 +884,7 @@ out_clean: * enough_swap - Make sure we have enough swap to save the image. * * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. + * space available from the resume partition. 
*/ static int enough_swap(unsigned int nr_pages) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 3a8fd491758c..51615c909b2f 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -12,8 +12,6 @@ #define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 -extern raw_spinlock_t logbuf_lock; - __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -21,7 +19,6 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); -__printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); @@ -56,10 +53,8 @@ void defer_console_output(void); #else -__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } - /* - * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem + * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so * printk-safe must preserve the existing local IRQ guarantees. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 575a34b88936..421c35571797 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -262,7 +262,7 @@ static void __up_console_sem(unsigned long ip) * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want - * locked without the console sempahore held). + * locked without the console semaphore held). */ static int console_locked, console_suspended; @@ -355,62 +355,50 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -/* - * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken - * within the scheduler's rq lock. 
It must be released before calling - * console_unlock() or anything else that might wake up a process. - */ -DEFINE_RAW_SPINLOCK(logbuf_lock); - -/* - * Helper macros to lock/unlock logbuf_lock and switch between - * printk-safe/unsafe modes. - */ -#define logbuf_lock_irq() \ - do { \ - printk_safe_enter_irq(); \ - raw_spin_lock(&logbuf_lock); \ - } while (0) - -#define logbuf_unlock_irq() \ - do { \ - raw_spin_unlock(&logbuf_lock); \ - printk_safe_exit_irq(); \ - } while (0) - -#define logbuf_lock_irqsave(flags) \ - do { \ - printk_safe_enter_irqsave(flags); \ - raw_spin_lock(&logbuf_lock); \ - } while (0) - -#define logbuf_unlock_irqrestore(flags) \ - do { \ - raw_spin_unlock(&logbuf_lock); \ - printk_safe_exit_irqrestore(flags); \ - } while (0) +/* syslog_lock protects syslog_* variables and write access to clear_seq. */ +static DEFINE_RAW_SPINLOCK(syslog_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); +/* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; +/* All 3 protected by @console_sem. */ /* the next printk record to write to the console */ static u64 console_seq; static u64 exclusive_console_stop_seq; static unsigned long console_dropped; -/* the next printk record to read after the last 'clear' command */ -static u64 clear_seq; +struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +}; + +/* + * The next printk record to read after the last 'clear' command. There are + * two copies (updated with seqcount_latch) so that reads can locklessly + * access a valid value. Writers are synchronized by @syslog_lock. 
+ */ +static struct latched_seq clear_seq = { + .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), + .val[0] = 0, + .val[1] = 0, +}; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 #else #define PREFIX_MAX 32 #endif -#define LOG_LINE_MAX (1024 - PREFIX_MAX) + +/* the maximum size of a formatted record (i.e. with prefix added per line) */ +#define CONSOLE_LOG_MAX 1024 + +/* the maximum size allowed to be reserved for a record */ +#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) @@ -452,6 +440,31 @@ bool printk_percpu_data_ready(void) return __printk_percpu_data_ready; } +/* Must be called under syslog_lock. */ +static void latched_seq_write(struct latched_seq *ls, u64 val) +{ + raw_write_seqcount_latch(&ls->latch); + ls->val[0] = val; + raw_write_seqcount_latch(&ls->latch); + ls->val[1] = val; +} + +/* Can be called from any context. */ +static u64 latched_seq_read_nolock(struct latched_seq *ls) +{ + unsigned int seq; + unsigned int idx; + u64 val; + + do { + seq = raw_read_seqcount_latch(&ls->latch); + idx = seq & 0x1; + val = ls->val[idx]; + } while (read_seqcount_latch_retry(&ls->latch, seq)); + + return val; +} + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -619,7 +632,7 @@ out: /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { - u64 seq; + atomic64_t seq; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -719,27 +732,27 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - logbuf_lock_irq(); - if (!prb_read_valid(prb, user->seq, r)) { + printk_safe_enter_irq(); + if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - logbuf_unlock_irq(); + printk_safe_exit_irq(); goto out; } - logbuf_unlock_irq(); + printk_safe_exit_irq(); ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, user->seq, r)); + 
prb_read_valid(prb, atomic64_read(&user->seq), r)); if (ret) goto out; - logbuf_lock_irq(); + printk_safe_enter_irq(); } - if (r->info->seq != user->seq) { + if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ - user->seq = r->info->seq; + atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; - logbuf_unlock_irq(); + printk_safe_exit_irq(); goto out; } @@ -748,8 +761,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, &r->text_buf[0], r->info->text_len, &r->info->dev_info); - user->seq = r->info->seq + 1; - logbuf_unlock_irq(); + atomic64_set(&user->seq, r->info->seq + 1); + printk_safe_exit_irq(); if (len > count) { ret = -EINVAL; @@ -784,11 +797,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - logbuf_lock_irq(); + printk_safe_enter_irq(); switch (whence) { case SEEK_SET: /* the first record */ - user->seq = prb_first_valid_seq(prb); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* @@ -796,16 +809,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
*/ - user->seq = clear_seq; + atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ - user->seq = prb_next_seq(prb); + atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } - logbuf_unlock_irq(); + printk_safe_exit_irq(); return ret; } @@ -820,15 +833,15 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - logbuf_lock_irq(); - if (prb_read_valid_info(prb, user->seq, &info, NULL)) { + printk_safe_enter_irq(); + if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ - if (info.seq != user->seq) + if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } - logbuf_unlock_irq(); + printk_safe_exit_irq(); return ret; } @@ -861,9 +874,9 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf)); - logbuf_lock_irq(); - user->seq = prb_first_valid_seq(prb); - logbuf_unlock_irq(); + printk_safe_enter_irq(); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); + printk_safe_exit_irq(); file->private_data = user; return 0; @@ -955,6 +968,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); + + VMCOREINFO_STRUCT_SIZE(latched_seq); + VMCOREINFO_OFFSET(latched_seq, val); } #endif @@ -1421,6 +1437,50 @@ static size_t get_record_print_text_size(struct printk_info *info, return ((prefix_len * line_count) + info->text_len + 1); } +/* + * Beginning with @start_seq, find the first record where it and all following + * records up to (but not including) @max_seq fit into @size. + * + * @max_seq is simply an upper bound and does not need to exist. If the caller + * does not require an upper bound, -1 can be used for @max_seq. 
+ */ +static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, + bool syslog, bool time) +{ + struct printk_info info; + unsigned int line_count; + size_t len = 0; + u64 seq; + + /* Determine the size of the records up to @max_seq. */ + prb_for_each_info(start_seq, prb, seq, &info, &line_count) { + if (info.seq >= max_seq) + break; + len += get_record_print_text_size(&info, line_count, syslog, time); + } + + /* + * Adjust the upper bound for the next loop to avoid subtracting + * lengths that were never added. + */ + if (seq < max_seq) + max_seq = seq; + + /* + * Move first record forward until length fits into the buffer. Ignore + * newest messages that were not counted in the above cycle. Messages + * might appear and get lost in the meantime. This is a best effort + * that prevents an infinite loop that could occur with a retry. + */ + prb_for_each_info(start_seq, prb, seq, &info, &line_count) { + if (len <= size || info.seq >= max_seq) + break; + len -= get_record_print_text_size(&info, line_count, syslog, time); + } + + return seq; +} + static int syslog_print(char __user *buf, int size) { struct printk_info info; @@ -1428,19 +1488,21 @@ static int syslog_print(char __user *buf, int size) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); while (size > 0) { size_t n; size_t skip; - logbuf_lock_irq(); + printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - logbuf_unlock_irq(); + raw_spin_unlock(&syslog_lock); + printk_safe_exit_irq(); break; } if (r.info->seq != syslog_seq) { @@ -1469,7 +1531,8 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - logbuf_unlock_irq(); + raw_spin_unlock(&syslog_lock); + printk_safe_exit_irq(); if (!n) break; 
@@ -1492,34 +1555,26 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; - unsigned int line_count; struct printk_record r; char *text; int len = 0; u64 seq; bool time; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; - logbuf_lock_irq(); + printk_safe_enter_irq(); /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - prb_for_each_info(clear_seq, prb, seq, &info, &line_count) - len += get_record_print_text_size(&info, line_count, true, time); + seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, + size, true, time); - /* move first record forward until length fits into the buffer */ - prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { - if (len <= size) - break; - len -= get_record_print_text_size(&info, line_count, true, time); - } - - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { @@ -1532,20 +1587,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear) break; } - logbuf_unlock_irq(); + printk_safe_exit_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - logbuf_lock_irq(); + printk_safe_enter_irq(); if (len < 0) break; } - if (clear) - clear_seq = seq; - logbuf_unlock_irq(); + if (clear) { + raw_spin_lock(&syslog_lock); + latched_seq_write(&clear_seq, seq); + raw_spin_unlock(&syslog_lock); + } + printk_safe_exit_irq(); kfree(text); return len; @@ -1553,9 +1611,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - logbuf_lock_irq(); - clear_seq = prb_next_seq(prb); - logbuf_unlock_irq(); + printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); + 
latched_seq_write(&clear_seq, prb_next_seq(prb)); + raw_spin_unlock(&syslog_lock); + printk_safe_exit_irq(); +} + +/* Return a consistent copy of @syslog_seq. */ +static u64 read_syslog_seq_irq(void) +{ + u64 seq; + + raw_spin_lock_irq(&syslog_lock); + seq = syslog_seq; + raw_spin_unlock_irq(&syslog_lock); + + return seq; } int do_syslog(int type, char __user *buf, int len, int source) @@ -1581,8 +1653,9 @@ int do_syslog(int type, char __user *buf, int len, int source) return 0; if (!access_ok(buf, len)) return -EFAULT; + error = wait_event_interruptible(log_wait, - prb_read_valid(prb, syslog_seq, NULL)); + prb_read_valid(prb, read_syslog_seq_irq(), NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1630,10 +1703,12 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - logbuf_lock_irq(); + printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - logbuf_unlock_irq(); + raw_spin_unlock(&syslog_lock); + printk_safe_exit_irq(); return 0; } if (info.seq != syslog_seq) { @@ -1661,7 +1736,8 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - logbuf_unlock_irq(); + raw_spin_unlock(&syslog_lock); + printk_safe_exit_irq(); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -2104,12 +2180,6 @@ asmlinkage int vprintk_emit(int facility, int level, } EXPORT_SYMBOL(vprintk_emit); -asmlinkage int vprintk(const char *fmt, va_list args) -{ - return vprintk_func(fmt, args); -} -EXPORT_SYMBOL(vprintk); - int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); @@ -2143,7 +2213,7 @@ asmlinkage __visible int printk(const char *fmt, ...) 
int r; va_start(args, fmt); - r = vprintk_func(fmt, args); + r = vprintk(fmt, args); va_end(args); return r; @@ -2152,8 +2222,7 @@ EXPORT_SYMBOL(printk); #else /* CONFIG_PRINTK */ -#define LOG_LINE_MAX 0 -#define PREFIX_MAX 0 +#define CONSOLE_LOG_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false @@ -2262,7 +2331,7 @@ static int __init console_setup(char *str) /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created - * for exacly this purpose. + * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, true); @@ -2471,7 +2540,7 @@ static inline int can_use_console(void) void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[LOG_LINE_MAX + PREFIX_MAX]; + static char text[CONSOLE_LOG_MAX]; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; @@ -2518,7 +2587,6 @@ again: size_t len; printk_safe_enter_irqsave(flags); - raw_spin_lock(&logbuf_lock); skip: if (!prb_read_valid(prb, console_seq, &r)) break; @@ -2562,7 +2630,6 @@ skip: console_msg_format & MSG_FORMAT_SYSLOG, printk_time); console_seq++; - raw_spin_unlock(&logbuf_lock); /* * While actively printing out messages, if another printk() @@ -2589,8 +2656,6 @@ skip: console_locked = 0; - raw_spin_unlock(&logbuf_lock); - up_console_sem(); /* @@ -2599,9 +2664,7 @@ skip: * there's a new owner and the console_unlock() from them will do the * flush, no worries. 
*/ - raw_spin_lock(&logbuf_lock); retry = prb_read_valid(prb, console_seq, NULL); - raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) @@ -2668,9 +2731,9 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) { unsigned long flags; - logbuf_lock_irqsave(flags); + printk_safe_enter_irqsave(flags); console_seq = prb_first_valid_seq(prb); - logbuf_unlock_irqrestore(flags); + printk_safe_exit_irqrestore(flags); } console_unlock(); } @@ -2898,9 +2961,7 @@ void register_console(struct console *newcon) /* * console_unlock(); will print out the buffered messages * for us. - */ - logbuf_lock_irqsave(flags); - /* + * * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to * the already-registered consoles. @@ -2911,8 +2972,11 @@ void register_console(struct console *newcon) */ exclusive_console = newcon; exclusive_console_stop_seq = console_seq; + + /* Get a consistent copy of @syslog_seq. */ + raw_spin_lock_irqsave(&syslog_lock, flags); console_seq = syslog_seq; - logbuf_unlock_irqrestore(flags); + raw_spin_unlock_irqrestore(&syslog_lock, flags); } console_unlock(); console_sysfs_notify(); @@ -3042,7 +3106,7 @@ void __init console_init(void) * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will - * get unregistred when the real preferred console is registered. + * get unregistered when the real preferred console is registered. 
*/ static int __init printk_late_init(void) { @@ -3276,7 +3340,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); void kmsg_dump(enum kmsg_dump_reason reason) { struct kmsg_dumper *dumper; - unsigned long flags; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { @@ -3293,26 +3356,15 @@ void kmsg_dump(enum kmsg_dump_reason reason) if (reason > max_reason) continue; - /* initialize iterator with data about the stored records */ - dumper->active = true; - - logbuf_lock_irqsave(flags); - dumper->cur_seq = clear_seq; - dumper->next_seq = prb_next_seq(prb); - logbuf_unlock_irqrestore(flags); - /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); - - /* reset iterator */ - dumper->active = false; } rcu_read_unlock(); } /** - * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) - * @dumper: registered kmsg dumper + * kmsg_dump_get_line - retrieve one kmsg log line + * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer @@ -3326,30 +3378,31 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. - * - * The function is similar to kmsg_dump_get_line(), but grabs no locks. */ -bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, + char *line, size_t size, size_t *len) { + u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; + unsigned long flags; size_t l = 0; bool ret = false; - prb_rec_init_rd(&r, &info, line, size); + if (iter->cur_seq < min_seq) + iter->cur_seq = min_seq; - if (!dumper->active) - goto out; + printk_safe_enter_irqsave(flags); + prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? 
*/ if (line) { - if (!prb_read_valid(prb, dumper->cur_seq, &r)) + if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { - if (!prb_read_valid_info(prb, dumper->cur_seq, + if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } @@ -3358,52 +3411,23 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, } - dumper->cur_seq = r.info->seq + 1; + iter->cur_seq = r.info->seq + 1; ret = true; out: + printk_safe_exit_irqrestore(flags); if (len) *len = l; return ret; } - -/** - * kmsg_dump_get_line - retrieve one kmsg log line - * @dumper: registered kmsg dumper - * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to - * @size: maximum size of the buffer - * @len: length of line placed into buffer - * - * Start at the beginning of the kmsg buffer, with the oldest kmsg - * record, and copy one record into the provided buffer. - * - * Consecutive calls will return the next available record moving - * towards the end of the buffer with the youngest messages. - * - * A return value of FALSE indicates that there are no more records to - * read. - */ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) -{ - unsigned long flags; - bool ret; - - logbuf_lock_irqsave(flags); - ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - logbuf_unlock_irqrestore(flags); - - return ret; -} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines - * @dumper: registered kmsg dumper + * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer - * @len: length of line placed into buffer + * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. 
@@ -3416,115 +3440,93 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * A return value of FALSE indicates that there are no more records to * read. */ -bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, - char *buf, size_t size, size_t *len) +bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out) { + u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; - unsigned int line_count; struct printk_record r; unsigned long flags; u64 seq; u64 next_seq; - size_t l = 0; + size_t len = 0; bool ret = false; bool time = printk_time; - prb_rec_init_rd(&r, &info, buf, size); - - if (!dumper->active || !buf || !size) + if (!buf || !size) goto out; - logbuf_lock_irqsave(flags); - if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) { - if (info.seq != dumper->cur_seq) { + if (iter->cur_seq < min_seq) + iter->cur_seq = min_seq; + + printk_safe_enter_irqsave(flags); + if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { + if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ - dumper->cur_seq = info.seq; + iter->cur_seq = info.seq; } } /* last entry */ - if (dumper->cur_seq >= dumper->next_seq) { - logbuf_unlock_irqrestore(flags); + if (iter->cur_seq >= iter->next_seq) { + printk_safe_exit_irqrestore(flags); goto out; } - /* calculate length of entire buffer */ - seq = dumper->cur_seq; - while (prb_read_valid_info(prb, seq, &info, &line_count)) { - if (r.info->seq >= dumper->next_seq) - break; - l += get_record_print_text_size(&info, line_count, syslog, time); - seq = r.info->seq + 1; - } - - /* move first record forward until length fits into the buffer */ - seq = dumper->cur_seq; - while (l >= size && prb_read_valid_info(prb, seq, - &info, &line_count)) { - if (r.info->seq >= dumper->next_seq) - break; - l -= get_record_print_text_size(&info, line_count, syslog, time); - seq = r.info->seq + 1; - } + /* + * Find first record that fits, including all 
following records, + * into the user-provided buffer for this dump. Pass in size-1 + * because this function (by way of record_print_text()) will + * not write more than size-1 bytes of text into @buf. + */ + seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, + size - 1, syslog, time); - /* last message in next interation */ + /* + * Next kmsg_dump_get_buffer() invocation will dump block of + * older records stored right before this one. + */ next_seq = seq; - /* actually read text into the buffer now */ - l = 0; - while (prb_read_valid(prb, seq, &r)) { - if (r.info->seq >= dumper->next_seq) - break; + prb_rec_init_rd(&r, &info, buf, size); - l += record_print_text(&r, syslog, time); + len = 0; + prb_for_each_record(seq, prb, seq, &r) { + if (r.info->seq >= iter->next_seq) + break; - /* adjust record to store to remaining buffer space */ - prb_rec_init_rd(&r, &info, buf + l, size - l); + len += record_print_text(&r, syslog, time); - seq = r.info->seq + 1; + /* Adjust record to store to remaining buffer space. */ + prb_rec_init_rd(&r, &info, buf + len, size - len); } - dumper->next_seq = next_seq; + iter->next_seq = next_seq; ret = true; - logbuf_unlock_irqrestore(flags); + printk_safe_exit_irqrestore(flags); out: - if (len) - *len = l; + if (len_out) + *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) - * @dumper: registered kmsg dumper - * - * Reset the dumper's iterator so that kmsg_dump_get_line() and - * kmsg_dump_get_buffer() can be called again and used multiple - * times within the same dumper.dump() callback. - * - * The function is similar to kmsg_dump_rewind(), but grabs no locks. 
- */ -void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) -{ - dumper->cur_seq = clear_seq; - dumper->next_seq = prb_next_seq(prb); -} - -/** * kmsg_dump_rewind - reset the iterator - * @dumper: registered kmsg dumper + * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ -void kmsg_dump_rewind(struct kmsg_dumper *dumper) +void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { unsigned long flags; - logbuf_lock_irqsave(flags); - kmsg_dump_rewind_nolock(dumper); - logbuf_unlock_irqrestore(flags); + printk_safe_enter_irqsave(flags); + iter->cur_seq = latched_seq_read_nolock(&clear_seq); + iter->next_seq = prb_next_seq(prb); + printk_safe_exit_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 2e9e3ed7d63e..7a1414622051 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -16,7 +16,7 @@ #include "internal.h" /* - * printk() could not take logbuf_lock in NMI context. Instead, + * In NMI and safe mode, printk() avoids taking locks. Instead, * it uses an alternative implementation that temporary stores * the strings into a per-CPU buffer. The content of the buffer * is later flushed into the main ring buffer via IRQ work. @@ -267,17 +267,9 @@ void printk_safe_flush(void) void printk_safe_flush_on_panic(void) { /* - * Make sure that we could access the main ring buffer. + * Make sure that we could access the safe buffers. * Do not risk a double release when more CPUs are up. */ - if (raw_spin_is_locked(&logbuf_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&logbuf_lock); - } - if (raw_spin_is_locked(&safe_read_lock)) { if (num_online_cpus() > 1) return; @@ -319,9 +311,7 @@ void noinstr printk_nmi_exit(void) * reordering. 
* * It has effect only when called in NMI context. Then printk() - * will try to store the messages into the main logbuf directly - * and use the per-CPU buffers only as a fallback when the lock - * is not available. + * will store the messages into the main logbuf directly. */ void printk_nmi_direct_enter(void) { @@ -367,7 +357,7 @@ void __printk_safe_exit(void) this_cpu_dec(printk_context); } -__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB /* Allow to pass printk() to kdb but avoid a recursion. */ @@ -376,20 +366,21 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) #endif /* - * Try to use the main logbuf even in NMI. But avoid calling console + * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && - raw_spin_trylock(&logbuf_lock)) { + if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) { + unsigned long flags; int len; + printk_safe_enter_irqsave(flags); len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); defer_console_output(); return len; } - /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ + /* Use extra buffer in NMI. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); @@ -420,3 +411,4 @@ void __init printk_safe_init(void) /* Flush pending messages that did not have scheduled IRQ works. 
*/ printk_safe_flush(); } +EXPORT_SYMBOL(vprintk); diff --git a/kernel/profile.c b/kernel/profile.c index 6f69a4195d56..c2ebddb5e974 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -430,7 +430,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98191218d891..b2890f6e6d6f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6384,6 +6384,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 50cbad89f7fa..6ee9c9bbe505 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -114,19 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, return true; } -static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static void sugov_deferred_update(struct sugov_policy *sg_policy) { - if (sugov_update_next_freq(sg_policy, time, next_freq)) - cpufreq_driver_fast_switch(sg_policy->policy, next_freq); -} - -static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) -{ - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); @@ -366,16 +355,19 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, sg_policy->cached_raw_freq = cached_freq; } + if 
(!sugov_update_next_freq(sg_policy, time, next_f)) + return; + /* * This code runs under rq->lock for the target CPU, so it won't run * concurrently on two different CPUs for the same target and it is not * necessary to acquire the lock in the fast switch case. */ if (sg_policy->policy->fast_switch_enabled) { - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); } else { raw_spin_lock(&sg_policy->update_lock); - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); raw_spin_unlock(&sg_policy->update_lock); } } @@ -454,12 +446,15 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); + if (!sugov_update_next_freq(sg_policy, time, next_f)) + goto unlock; + if (sg_policy->policy->fast_switch_enabled) - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); else - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); } - +unlock: raw_spin_unlock(&sg_policy->update_lock); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5f611658eeab..2c36a5fad589 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; - pc = preempt_count() - offset; + pc = irq_count() - offset; /* * We do not account for softirq time from ksoftirqd here. 
@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struct *prev) void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - unsigned int pc = preempt_count() - offset; + unsigned int pc = irq_count() - offset; if (pc & HARDIRQ_OFFSET) { vtime_account_hardirq(tsk); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1d60fc2c9987..1e63db4dbd9a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -817,7 +817,7 @@ static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, } /** - * seccomp_cache_prepare - emulate the filter to find cachable syscalls + * seccomp_cache_prepare - emulate the filter to find cacheable syscalls * @sfilter: The seccomp filter * * Returns 0 if successful or -errno if error occurred. diff --git a/kernel/signal.c b/kernel/signal.c index f2718350bf4b..e528f96eebc8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,7 +43,6 @@ #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> -#include <linux/livepatch.h> #include <linux/cgroup.h> #include <linux/audit.h> @@ -181,8 +180,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current) && - !klp_patch_pending(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } diff --git a/kernel/softirq.c b/kernel/softirq.c index bad14ca2b520..4992853ef53d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -13,6 +13,7 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/local_lock.h> #include <linux/mm.h> #include <linux/notifier.h> #include <linux/percpu.h> @@ -25,6 +26,7 @@ #include <linux/smpboot.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/wait_bit.h> #include <asm/softirq_stack.h> @@ -102,20 +104,204 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); #endif /* - * preempt_count and SOFTIRQ_OFFSET usage: - * - 
preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving - * softirq processing. - * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * SOFTIRQ_OFFSET usage: + * + * On !RT kernels 'count' is the preempt counter, on RT kernels this applies + * to a per CPU counter and to task::softirqs_disabled_cnt. + * + * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq + * processing. + * + * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) * on local_bh_disable or local_bh_enable. + * * This lets us distinguish between whether we are currently processing * softirq and whether we just have bh disabled. */ +#ifdef CONFIG_PREEMPT_RT + +/* + * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and + * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a + * softirq disabled section to be preempted. + * + * The per task counter is used for softirq_count(), in_softirq() and + * in_serving_softirqs() because these counts are only valid when the task + * holding softirq_ctrl::lock is running. + * + * The per CPU counter prevents pointless wakeups of ksoftirqd in case that + * the task which is in a softirq disabled section is preempted or blocks. + */ +struct softirq_ctrl { + local_lock_t lock; + int cnt; +}; + +static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { + .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), +}; + +/** + * local_bh_blocked() - Check for idle whether BH processing is blocked + * + * Returns false if the per CPU softirq::cnt is 0 otherwise true. + * + * This is invoked from the idle task to guard against false positive + * softirq pending warnings, which would happen when the task which holds + * softirq_ctrl::lock was the only running task on the CPU and blocks on + * some other lock. 
+ */ +bool local_bh_blocked(void) +{ + return __this_cpu_read(softirq_ctrl.cnt) != 0; +} + +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +{ + unsigned long flags; + int newcnt; + + WARN_ON_ONCE(in_hardirq()); + + /* First entry of a task into a BH disabled section? */ + if (!current->softirq_disable_cnt) { + if (preemptible()) { + local_lock(&softirq_ctrl.lock); + /* Required to meet the RCU bottomhalf requirements. */ + rcu_read_lock(); + } else { + DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt)); + } + } + + /* + * Track the per CPU softirq disabled state. On RT this is per CPU + * state to allow preemption of bottom half disabled sections. + */ + newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. + */ + current->softirq_disable_cnt = newcnt; + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } +} +EXPORT_SYMBOL(__local_bh_disable_ip); + +static void __local_bh_enable(unsigned int cnt, bool unlock) +{ + unsigned long flags; + int newcnt; + + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_on(_RET_IP_); + raw_local_irq_restore(flags); + } + + newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; + + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } +} + +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) +{ + bool preempt_on = preemptible(); + unsigned long flags; + u32 pending; + int curcnt; + + WARN_ON_ONCE(in_irq()); + lockdep_assert_irqs_enabled(); + + local_irq_save(flags); + curcnt = __this_cpu_read(softirq_ctrl.cnt); + + /* + * If this is not reenabling soft 
interrupts, no point in trying to + * run pending ones. + */ + if (curcnt != cnt) + goto out; + + pending = local_softirq_pending(); + if (!pending || ksoftirqd_running(pending)) + goto out; + + /* + * If this was called from non preemptible context, wake up the + * softirq daemon. + */ + if (!preempt_on) { + wakeup_softirqd(); + goto out; + } + + /* + * Adjust softirq count to SOFTIRQ_OFFSET which makes + * in_serving_softirq() become true. + */ + cnt = SOFTIRQ_OFFSET; + __local_bh_enable(cnt, false); + __do_softirq(); + +out: + __local_bh_enable(cnt, preempt_on); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__local_bh_enable_ip); + +/* + * Invoked from ksoftirqd_run() outside of the interrupt disabled section + * to acquire the per CPU local lock for reentrancy protection. + */ +static inline void ksoftirqd_run_begin(void) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + local_irq_disable(); +} + +/* Counterpart to ksoftirqd_run_begin() */ +static inline void ksoftirqd_run_end(void) +{ + __local_bh_enable(SOFTIRQ_OFFSET, true); + WARN_ON_ONCE(in_interrupt()); + local_irq_enable(); +} + +static inline void softirq_handle_begin(void) { } +static inline void softirq_handle_end(void) { } + +static inline bool should_wake_ksoftirqd(void) +{ + return !this_cpu_read(softirq_ctrl.cnt); +} + +static inline void invoke_softirq(void) +{ + if (should_wake_ksoftirqd()) + wakeup_softirqd(); +} + +#else /* CONFIG_PREEMPT_RT */ -#ifdef CONFIG_TRACE_IRQFLAGS /* - * This is for softirq.c-internal use, where hardirqs are disabled + * This one is for softirq.c-internal use, where hardirqs are disabled * legitimately: */ +#ifdef CONFIG_TRACE_IRQFLAGS void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -206,6 +392,32 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) } EXPORT_SYMBOL(__local_bh_enable_ip); +static inline void softirq_handle_begin(void) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); +} + +static inline 
void softirq_handle_end(void) +{ + __local_bh_enable(SOFTIRQ_OFFSET); + WARN_ON_ONCE(in_interrupt()); +} + +static inline void ksoftirqd_run_begin(void) +{ + local_irq_disable(); +} + +static inline void ksoftirqd_run_end(void) +{ + local_irq_enable(); +} + +static inline bool should_wake_ksoftirqd(void) +{ + return true; +} + static inline void invoke_softirq(void) { if (ksoftirqd_running(local_softirq_pending())) @@ -250,6 +462,8 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } +#endif /* !CONFIG_PREEMPT_RT */ + /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, * but break the loop if need_resched() is set or after 2 ms. @@ -318,7 +532,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) pending = local_softirq_pending(); - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + softirq_handle_begin(); in_hardirq = lockdep_softirq_start(); account_softirq_enter(current); @@ -354,8 +568,10 @@ restart: pending >>= softirq_bit; } - if (__this_cpu_read(ksoftirqd) == current) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && + __this_cpu_read(ksoftirqd) == current) rcu_softirq_qs(); + local_irq_disable(); pending = local_softirq_pending(); @@ -369,8 +585,7 @@ restart: account_softirq_exit(current); lockdep_softirq_end(in_hardirq); - __local_bh_enable(SOFTIRQ_OFFSET); - WARN_ON_ONCE(in_interrupt()); + softirq_handle_end(); current_restore_flags(old_flags, PF_MEMALLOC); } @@ -465,7 +680,7 @@ inline void raise_softirq_irqoff(unsigned int nr) * Otherwise we wake up ksoftirqd to make sure we * schedule the softirq soon. 
*/ - if (!in_interrupt()) + if (!in_interrupt() && should_wake_ksoftirqd()) wakeup_softirqd(); } @@ -531,6 +746,20 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); +static bool tasklet_clear_sched(struct tasklet_struct *t) +{ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { + wake_up_var(&t->state); + return true; + } + + WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n", + t->use_callback ? "callback" : "func", + t->use_callback ? (void *)t->callback : (void *)t->func); + + return false; +} + static void tasklet_action_common(struct softirq_action *a, struct tasklet_head *tl_head, unsigned int softirq_nr) @@ -550,13 +779,12 @@ static void tasklet_action_common(struct softirq_action *a, if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - if (t->use_callback) - t->callback(t); - else - t->func(t->data); + if (tasklet_clear_sched(t)) { + if (t->use_callback) + t->callback(t); + else + t->func(t->data); + } tasklet_unlock(t); continue; } @@ -606,21 +834,62 @@ void tasklet_init(struct tasklet_struct *t, } EXPORT_SYMBOL(tasklet_init); +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * Do not use in new code. Waiting for tasklets from atomic contexts is + * error prone and should be avoided. + */ +void tasklet_unlock_spin_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * Prevent a live lock when current preempted soft + * interrupt processing or prevents ksoftirqd from + * running. If the tasklet runs on a different CPU + * then this has no effect other than doing the BH + * disable/enable dance for nothing. 
+ */ + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } +} +EXPORT_SYMBOL(tasklet_unlock_spin_wait); +#endif + void tasklet_kill(struct tasklet_struct *t) { if (in_interrupt()) pr_notice("Attempt to kill tasklet from interrupt\n"); - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do { - yield(); - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); - } + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); + tasklet_unlock_wait(t); - clear_bit(TASKLET_STATE_SCHED, &t->state); + tasklet_clear_sched(t); } EXPORT_SYMBOL(tasklet_kill); +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +void tasklet_unlock(struct tasklet_struct *t) +{ + smp_mb__before_atomic(); + clear_bit(TASKLET_STATE_RUN, &t->state); + smp_mb__after_atomic(); + wake_up_var(&t->state); +} +EXPORT_SYMBOL_GPL(tasklet_unlock); + +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); +} +EXPORT_SYMBOL_GPL(tasklet_unlock_wait); +#endif + void __init softirq_init(void) { int cpu; @@ -643,53 +912,21 @@ static int ksoftirqd_should_run(unsigned int cpu) static void run_ksoftirqd(unsigned int cpu) { - local_irq_disable(); + ksoftirqd_run_begin(); if (local_softirq_pending()) { /* * We can safely run softirq on inline stack, as we are not deep * in the task stack here. */ __do_softirq(); - local_irq_enable(); + ksoftirqd_run_end(); cond_resched(); return; } - local_irq_enable(); + ksoftirqd_run_end(); } #ifdef CONFIG_HOTPLUG_CPU -/* - * tasklet_kill_immediate is called to remove a tasklet which can already be - * scheduled for execution on @cpu. - * - * Unlike tasklet_kill, this function removes the tasklet - * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. - * - * When this function is called, @cpu must be in the CPU_DEAD state. 
- */ -void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) -{ - struct tasklet_struct **i; - - BUG_ON(cpu_online(cpu)); - BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); - - if (!test_bit(TASKLET_STATE_SCHED, &t->state)) - return; - - /* CPU is dead, so no lock needed. */ - for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { - if (*i == t) { - *i = t->next; - /* If this was the tail element, move the tail ptr */ - if (*i == NULL) - per_cpu(tasklet_vec, cpu).tail = i; - return; - } - } - BUG(); -} - static int takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. */ diff --git a/kernel/sys.c b/kernel/sys.c index 2e2e3f378d97..3d62c9599dc0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -119,6 +119,12 @@ #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif +#ifndef PAC_SET_ENABLED_KEYS +# define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) +#endif +#ifndef PAC_GET_ENABLED_KEYS +# define PAC_GET_ENABLED_KEYS(a) (-EINVAL) +#endif #ifndef SET_TAGGED_ADDR_CTRL # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) #endif @@ -2497,6 +2503,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; + case PR_PAC_SET_ENABLED_KEYS: + if (arg4 || arg5) + return -EINVAL; + error = PAC_SET_ENABLED_KEYS(me, arg2, arg3); + break; + case PR_PAC_GET_ENABLED_KEYS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = PAC_GET_ENABLED_KEYS(me); + break; case PR_SET_TAGGED_ADDR_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4d94e2b5499d..bea9d08b1698 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -2,13 +2,13 @@ /* * Alarmtimer interface * - * This interface provides a timer which is similarto hrtimers, + * This interface provides a timer which is similar to hrtimers, * but triggers a RTC alarm if the box is suspend. 
* * This interface is influenced by the Android RTC Alarm timer * interface. * - * Copyright (C) 2010 IBM Corperation + * Copyright (C) 2010 IBM Corporation * * Author: John Stultz <john.stultz@linaro.org> */ @@ -811,7 +811,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) /** * alarm_timer_nsleep - alarmtimer nanosleep * @which_clock: clockid - * @flags: determins abstime or relative + * @flags: determines abstime or relative * @tsreq: requested sleep time (abs or rel) * * Handles clock_nanosleep calls against _ALARM clockids diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cce484a2cc7c..1d1a61371b5a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -38,7 +38,7 @@ * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is * multiplied with the calculated mult factor. Larger ranges may - * reduce the conversion accuracy by chosing smaller mult and shift + * reduce the conversion accuracy by choosing smaller mult and shift * factors. */ void @@ -518,7 +518,7 @@ static void clocksource_suspend_select(bool fallback) * the suspend time when resuming system. * * This function is called late in the suspend process from timekeeping_suspend(), - * that means processes are freezed, non-boot cpus and interrupts are disabled + * that means processes are frozen, non-boot cpus and interrupts are disabled * now. It is therefore possible to start the suspend timer without taking the * clocksource mutex. */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c9d968187ae..4a66725b1d4a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -683,7 +683,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. 
Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being - * set. So we'd effectivly block all timers until the T2 event + * set. So we'd effectively block all timers until the T2 event * fires. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) @@ -1019,7 +1019,7 @@ static void __remove_hrtimer(struct hrtimer *timer, * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is - * an superflous call to hrtimer_force_reprogram() on the + * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) @@ -1212,7 +1212,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for - * the timer callback to finish. Drop expiry_lock and reaquire it. That + * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, @@ -1398,7 +1398,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, int base; /* - * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). 
@@ -1430,7 +1430,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: The modes which are relevant for intitialization: + * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * @@ -1487,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * insufficient for that. * * The sequence numbers are required because otherwise we could still observe - * a false negative if the read side got smeared over multiple consequtive + * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ @@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search - * Tree, which can answer a stabbing querry for + * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that @@ -1822,7 +1822,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* - * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other @@ -1835,7 +1835,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. 
* - * OTOH, priviledged real-time user space applications rely on the + * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a5cffe2a1770..a492e4da69ba 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -44,7 +44,7 @@ static u64 jiffies_read(struct clocksource *cs) * the timer interrupt frequency HZ and it suffers * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the + * interrupt hardware to accurately tick at the * requested HZ value. It is also not recommended * for "tick-less" systems. */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5247afd7f345..406dccb79c2b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -544,7 +544,7 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, struct timespec64 *to_set, const struct timespec64 *now) { - /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ + /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; struct timespec64 delay = {.tv_sec = -1, .tv_nsec = set_offset_nsec}; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9abe15255bc4..3bb96a8b49c9 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -279,7 +279,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) * @tsk: Task for which cputime needs to be started * @samples: Storage for time samples * - * The thread group cputime accouting is avoided when there are no posix + * The thread group cputime accounting is avoided when there are no posix * CPU timers armed. Before starting a timer it's required to check whether * the time accounting is active. 
If not, a full update of the atomic * accounting store needs to be done and the accounting enabled. @@ -390,7 +390,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) /* * If posix timer expiry is handled in task work context then * timer::it_lock can be taken without disabling interrupts as all - * other locking happens in task context. This requires a seperate + * other locking happens in task context. This requires a separate * lock class key otherwise regular posix timer expiry would record * the lock class being taken in interrupt context and generate a * false positive warning. @@ -1216,7 +1216,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) check_process_timers(tsk, &firing); /* - * The above timer checks have updated the exipry cache and + * The above timer checks have updated the expiry cache and * because nothing can have queued or modified timers after * sighand lock was taken above it is guaranteed to be * consistent. So the next timer interrupt fastpath check diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bf540f5a4115..dd5697d7347b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1191,8 +1191,8 @@ SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, err = do_clock_adjtime(which_clock, &ktx); - if (err >= 0) - err = put_old_timex32(utp, &ktx); + if (err >= 0 && put_old_timex32(utp, &ktx)) + return -EFAULT; return err; } diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 77c63005dc4e..13b11eb62685 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -21,7 +21,6 @@ #define DEBUGFS_FILENAME "udelay_test" static DEFINE_MUTEX(udelay_test_lock); -static struct dentry *udelay_test_debugfs_file; static int udelay_test_usecs; static int udelay_test_iterations = DEFAULT_ITERATIONS; @@ -138,8 +137,8 @@ static const struct file_operations udelay_test_debugfs_ops = { static int __init udelay_test_init(void) { 
mutex_lock(&udelay_test_lock); - udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, - S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + debugfs_create_file(DEBUGFS_FILENAME, S_IRUSR, NULL, NULL, + &udelay_test_debugfs_ops); mutex_unlock(&udelay_test_lock); return 0; @@ -150,7 +149,7 @@ module_init(udelay_test_init); static void __exit udelay_test_exit(void) { mutex_lock(&udelay_test_lock); - debugfs_remove(udelay_test_debugfs_file); + debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL)); mutex_unlock(&udelay_test_lock); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index b5a65e212df2..797eb93103ad 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -53,7 +53,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * reasons. * * Each caller tries to arm the hrtimer on its own CPU, but if the - * hrtimer callbback function is currently running, then + * hrtimer callback function is currently running, then * hrtimer_start() cannot move it and the timer stays on the CPU on * which it is assigned at the moment. * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5a23829372c7..a44055228796 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -107,6 +107,19 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * If the system already runs in oneshot mode, switch the newly + * registered broadcast device to oneshot mode explicitly. + */ + if (tick_broadcast_oneshot_active()) { + tick_broadcast_switch_to_oneshot(); + return; + } + /* * Inform all cpus about this. 
We might be in a situation * where we did not switch to oneshot mode because the per cpu @@ -115,8 +128,7 @@ void tick_install_broadcast_device(struct clock_event_device *dev) * notification the systems stays stuck in periodic mode * forever. */ - if (dev->features & CLOCK_EVT_FEAT_ONESHOT) - tick_clock_notify(); + tick_clock_notify(); } /* @@ -157,7 +169,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) } /* - * Check, if the device is disfunctional and a place holder, which + * Check, if the device is dysfunctional and a placeholder, which * needs to be handled by the broadcast device. */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) @@ -391,7 +403,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) * - the broadcast device exists * - the broadcast device is not a hrtimer based one * - the broadcast device is in periodic mode to - * avoid a hickup during switch to oneshot mode + * avoid a hiccup during switch to oneshot mode */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 9d3a22510bab..e15bc0ef1912 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -348,12 +348,7 @@ void tick_check_new_device(struct clock_event_device *newdev) td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; - /* cpu local device ? 
*/ - if (!tick_check_percpu(curdev, newdev, cpu)) - goto out_bc; - - /* Preference decision */ - if (!tick_check_preferred(curdev, newdev)) + if (!tick_check_replacement(curdev, newdev)) goto out_bc; if (!try_module_get(newdev->owner)) diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f9745d47425a..475ecceda768 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force) } /** - * tick_resume_onshot - resume oneshot mode + * tick_resume_oneshot - resume oneshot mode */ void tick_resume_oneshot(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e10a4af88737..828b091501ca 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -751,7 +751,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request - * immeditate expiry, which rearms the hardware timer with a + * immediate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... 
*/ @@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(local_softirq_pending())) { static int ratelimit; - if (ratelimit < 10 && + if (ratelimit < 10 && !local_bh_blocked() && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); @@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void) * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * - * Called from power state control code with interrupts disabled + * Called from power state control code with interrupts disabled. + * + * The return value of this function and/or the value returned by it through the + * @delta_next pointer can be negative which must be taken into account by its + * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 4fb06527cf64..d952ae393423 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -29,7 +29,7 @@ enum tick_nohz_mode { * @inidle: Indicator that the CPU is in the tick idle mode * @tick_stopped: Indicator that the idle tick has been stopped * @idle_active: Indicator that the CPU is actively in the tick idle mode; - * it is resetted during irq handling phases. + * it is reset during irq handling phases. * @do_timer_lst: CPU was the last one doing do_timer before going idle * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick diff --git a/kernel/time/time.c b/kernel/time/time.c index 3985b2b32d08..29923b20e0e4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -571,7 +571,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. 
Note * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: + * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 85b98e727306..e6285288d765 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -76,7 +76,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, return ns; } -u64 timecounter_cyc2time(struct timecounter *tc, +u64 timecounter_cyc2time(const struct timecounter *tc, u64 cycle_tstamp) { u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6aee5768c86f..81fe2a33b80c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -596,14 +596,14 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); * careful cache layout of the timekeeper because the sequence count and * struct tk_read_base would then need two cache lines instead of one. * - * Access to the time keeper clock source is disabled accross the innermost + * Access to the time keeper clock source is disabled across the innermost * steps of suspend/resume. The accessors still work, but the timestamps * are frozen until time keeping is resumed which happens very early. * * For regular suspend/resume there is no observable difference vs. sched * clock, but it might affect some of the nasty low level debug printks. * - * OTOH, access to sched clock is not guaranteed accross suspend/resume on + * OTOH, access to sched clock is not guaranteed across suspend/resume on * all systems either so it depends on the hardware in use. 
* * If that turns out to be a real problem then this could be mitigated by @@ -899,7 +899,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** - * ktime_mono_to_any() - convert mononotic time to any other time + * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ @@ -1427,35 +1427,45 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) static int change_clocksource(void *data) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *new, *old; + struct clocksource *new, *old = NULL; unsigned long flags; + bool change = false; new = (struct clocksource *) data; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); /* * If the cs is in module, get a module reference. Succeeds * for built-in code (owner == NULL) as well. */ if (try_module_get(new->owner)) { - if (!new->enable || new->enable(new) == 0) { - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); - module_put(old->owner); - } else { + if (!new->enable || new->enable(new) == 0) + change = true; + else module_put(new->owner); - } } + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + if (change) { + old = tk->tkr_mono.clock; + tk_setup_internals(tk, new); + } + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (old) { + if (old->disable) + old->disable(old); + + module_put(old->owner); + } + return 0; } @@ -1948,7 +1958,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset - * Which simplfies to: + * Which simplifies 
to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { @@ -2336,7 +2346,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) /* * Validate if a timespec/timeval used to inject a time - * offset is valid. Offsets can be postive or negative, so + * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f475f1a027c8..d111adf4a0cb 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base) /* * No need to forward if we are close enough below jiffies. * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jffies. + * of jiffies to avoid endless requeuing to current jiffies. */ if ((long)(jnow - base->clk) < 1) return; @@ -1271,7 +1271,7 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the - * timer callback to finish. Drop expiry_lock and reaquire it. That allows + * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 88e6b8ed6ca5..f0d5062d9cbc 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -108,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk) /* * If the current clocksource is not VDSO capable, then spare the - * update of the high reolution parts. + * update of the high resolution parts. 
*/ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c777627212f..915fe8790f04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3545,7 +3545,11 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; - if (iter->fmt == static_fmt_buf) + /* + * iter->tr is NULL when used with tp_printk, which makes + * this get called where it is not safe to call krealloc(). + */ + if (!iter->tr || iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, @@ -3566,7 +3570,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) return fmt; p = fmt; @@ -4828,7 +4832,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; - if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); @@ -9692,7 +9696,7 @@ void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = - kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dc971a68dda4..e57cc0870892 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -63,8 +63,10 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') - return -EINVAL; + if (event[0] == '\0') { + ret = -EINVAL; + goto out; + } mutex_lock(&event_mutex); 
for_each_dyn_event_safe(pos, n) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index af612945a4d0..9a4b980d695b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -106,6 +106,7 @@ int create_user_ns(struct cred *new) if (!ns) goto fail_dec; + ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; @@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map) return 0; } +/** + * verify_root_map() - check the uid 0 mapping + * @file: idmapping file + * @map_ns: user namespace of the target process + * @new_map: requested idmap + * + * If a process requests mapping parent uid 0 into the new ns, verify that the + * process writing the map had the CAP_SETFCAP capability as the target process + * will be able to write fscaps that are valid in ancestor user namespaces. + * + * Return: true if the mapping is allowed, false if not. + */ +static bool verify_root_map(const struct file *file, + struct user_namespace *map_ns, + struct uid_gid_map *new_map) +{ + int idx; + const struct user_namespace *file_ns = file->f_cred->user_ns; + struct uid_gid_extent *extent0 = NULL; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent0 = &new_map->extent[idx]; + else + extent0 = &new_map->forward[idx]; + if (extent0->lower_first == 0) + break; + + extent0 = NULL; + } + + if (!extent0) + return true; + + if (map_ns == file_ns) { + /* The process unshared its ns and is writing to its own + * /proc/self/uid_map. User already has full capabilites in + * the new namespace. Verify that the parent had CAP_SETFCAP + * when it unshared. + * */ + if (!file_ns->parent_could_setfcap) + return false; + } else { + /* Process p1 is writing to uid_map of p2, who is in a child + * user namespace to p1's. 
Verify that the opener of the map + * file has CAP_SETFCAP against the parent of the new map + * namespace */ + if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) + return false; + } + + return true; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; - struct user_namespace *ns = seq->private; + struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; @@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) + if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ @@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; @@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; + + if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) + return false; + /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 71109065bd8e..107bc38b1945 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void) * update as well, the only side effect might be a cycle delay for * the softlockup check. 
*/ - for_each_cpu(cpu, &watchdog_allowed_mask) + for_each_cpu(cpu, &watchdog_allowed_mask) { per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; - wq_watchdog_touch(-1); + wq_watchdog_touch(cpu); + } } void touch_softlockup_watchdog_sync(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d150da252e8..b19d759e55a5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, */ lockdep_assert_irqs_disabled(); - debug_work_activate(work); /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && @@ -1494,6 +1493,7 @@ retry: worklist = &pwq->delayed_works; } + debug_work_activate(work); insert_work(pwq, work, worklist, work_flags); out: @@ -1630,7 +1630,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn); + WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) continue; /* get the latest of pool and touched timestamps */ + if (pool->cpu >= 0) + touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); + else + touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->watchdog_ts); - touched = READ_ONCE(wq_watchdog_touched); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; - if (pool->cpu >= 0) { - unsigned long cpu_touched = - READ_ONCE(per_cpu(wq_watchdog_touched_cpu, - pool->cpu)); - if (time_after(cpu_touched, ts)) - ts = cpu_touched; - } - /* did we stall? 
*/ if (time_after(jiffies, ts + thresh)) { lockup_detected = true; @@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu) { if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; - else - wq_watchdog_touched = jiffies; + + wq_watchdog_touched = jiffies; } static void wq_watchdog_set_thresh(unsigned long thresh) |