diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/verifier.c | 230 | ||||
-rw-r--r-- | kernel/entry/common.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace.c | 10 | ||||
-rw-r--r-- | kernel/trace/trace_dynevent.c | 6 | ||||
-rw-r--r-- | kernel/user_namespace.c | 65 |
5 files changed, 230 insertions, 83 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a738724a380..0399ac092b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5856,40 +5856,51 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) return &env->insn_aux_data[env->insn_idx]; } +enum { + REASON_BOUNDS = -1, + REASON_TYPE = -2, + REASON_PATHS = -3, + REASON_LIMIT = -4, + REASON_STACK = -5, +}; + static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - u32 *ptr_limit, u8 opcode, bool off_is_neg) + const struct bpf_reg_state *off_reg, + u32 *alu_limit, u8 opcode) { + bool off_is_neg = off_reg->smin_value < 0; bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off, max; + u32 max = 0, ptr_limit = 0; + + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return REASON_BOUNDS; switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the - * left direction, see BPF_REG_FP. + * left direction, see BPF_REG_FP. Also, unknown scalar + * offset where we would need to deal with min/max bounds is + * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - /* Indirect variable offset stack access is prohibited in - * unprivileged mode so it's not handled here. - */ - off = ptr_reg->off + ptr_reg->var_off.value; - if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off; - else - *ptr_limit = -off - 1; - return *ptr_limit >= max ? -ERANGE : 0; + ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; - } else { - off = ptr_reg->smin_value + ptr_reg->off; - *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; - } - return *ptr_limit >= max ? -ERANGE : 0; + ptr_limit = (mask_to_left ? + ptr_reg->smin_value : + ptr_reg->umax_value) + ptr_reg->off; + break; default: - return -EINVAL; + return REASON_TYPE; } + + if (ptr_limit >= max) + return REASON_LIMIT; + *alu_limit = ptr_limit; + return 0; } static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, @@ -5907,7 +5918,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, if (aux->alu_state && (aux->alu_state != alu_state || aux->alu_limit != alu_limit)) - return -EACCES; + return REASON_PATHS; /* Corresponding fixup done in fixup_bpf_calls(). */ aux->alu_state = alu_state; @@ -5926,14 +5937,22 @@ static int sanitize_val_alu(struct bpf_verifier_env *env, return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); } +static bool sanitize_needed(u8 opcode) +{ + return opcode == BPF_ADD || opcode == BPF_SUB; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, - bool off_is_neg) + struct bpf_insn_aux_data *tmp_aux, + const bool commit_window) { + struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_insn_aux_data *aux = cur_aux(env); + bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -5951,18 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; - alu_state |= ptr_is_dst_reg ? - BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - - err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); + err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); if (err < 0) return err; + if (commit_window) { + /* In commit phase we narrow the masking window based on + * the observed pointer move after the simulated operation. + */ + alu_state = tmp_aux->alu_state; + alu_limit = abs(tmp_aux->alu_limit - alu_limit); + } else { + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + } + err = update_alu_sanitation_state(aux, alu_state, alu_limit); if (err < 0) return err; do_sim: + /* If we're in commit phase, we're done here given we already + * pushed the truncated dst_reg into the speculative verification + * stack. + */ + if (commit_window) + return 0; + /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of * masking when off was not within expected range. If off @@ -5979,7 +6013,46 @@ do_sim: ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; - return !ret ? -EFAULT : 0; + return !ret ? REASON_STACK : 0; +} + +static int sanitize_err(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int reason, + const struct bpf_reg_state *off_reg, + const struct bpf_reg_state *dst_reg) +{ + static const char *err = "pointer arithmetic with it prohibited for !root"; + const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub"; + u32 dst = insn->dst_reg, src = insn->src_reg; + + switch (reason) { + case REASON_BOUNDS: + verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", + off_reg == dst_reg ? dst : src, err); + break; + case REASON_TYPE: + verbose(env, "R%d has pointer with unsupported alu operation, %s\n", + off_reg == dst_reg ? src : dst, err); + break; + case REASON_PATHS: + verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", + dst, op, err); + break; + case REASON_LIMIT: + verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", + dst, op, err); + break; + case REASON_STACK: + verbose(env, "R%d could not be pushed for speculative verification, %s\n", + dst, err); + break; + default: + verbose(env, "verifier internal error: unknown reason (%d)\n", + reason); + break; + } + + return -EACCES; } /* check that stack access falls within stack limits and that 'reg' doesn't @@ -6016,6 +6089,37 @@ static int check_stack_access_for_ptr_arithmetic( return 0; } +static int sanitize_check_bounds(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + const struct bpf_reg_state *dst_reg) +{ + u32 dst = insn->dst_reg; + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (env->bypass_spec_v1) + return 0; + + switch (dst_reg->type) { + case PTR_TO_STACK: + if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, + dst_reg->off + dst_reg->var_off.value)) + return -EACCES; + break; + case PTR_TO_MAP_VALUE: + if (check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + break; + default: + break; + } + + return 0; +} /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. @@ -6035,8 +6139,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - u32 dst = insn->dst_reg, src = insn->src_reg; + struct bpf_insn_aux_data tmp_aux = {}; u8 opcode = BPF_OP(insn->code); + u32 dst = insn->dst_reg; int ret; dst_reg = ®s[dst]; @@ -6084,13 +6189,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; - case PTR_TO_MAP_VALUE: - if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { - verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", - off_reg == dst_reg ? dst : src); - return -EACCES; - } - fallthrough; default: break; } @@ -6108,13 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* pointer types do not carry 32-bit bounds at the moment. */ __mark_reg32_unbounded(dst_reg); + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, + &tmp_aux, false); + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); + } + switch (opcode) { case BPF_ADD: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); - if (ret < 0) { - verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); - return ret; - } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -6165,11 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: - ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); - if (ret < 0) { - verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); - return ret; - } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -6250,21 +6345,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); - /* For unprivileged we require that resulting offset must be in bounds - * in order to be able to sanitize access later on. - */ - if (!env->bypass_spec_v1) { - if (dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, " - "prohibited for !root\n", dst); - return -EACCES; - } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access_for_ptr_arithmetic( - env, dst, dst_reg, dst_reg->off + - dst_reg->var_off.value)) { - return -EACCES; - } + if (sanitize_check_bounds(env, insn, dst_reg) < 0) + return -EACCES; + if (sanitize_needed(opcode)) { + ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, + &tmp_aux, true); + if (ret < 0) + return sanitize_err(env, insn, ret, off_reg, dst_reg); } return 0; @@ -6858,9 +6945,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s32 s32_min_val, s32_max_val; u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - u32 dst = insn->dst_reg; - int ret; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); + int ret; smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; @@ -6902,6 +6988,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + if (sanitize_needed(opcode)) { + ret = sanitize_val_alu(env, insn); + if (ret < 0) + return sanitize_err(env, insn, ret, NULL, NULL); + } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. * There are two classes of instructions: The first class we track both * alu32 and alu64 sign/unsigned bounds independently this provides the @@ -6918,21 +7010,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ switch (opcode) { case BPF_ADD: - ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to add from different pointers or scalars\n", dst); - return ret; - } scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - ret = sanitize_val_alu(env, insn); - if (ret < 0) { - verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); - return ret; - } scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8442e5c9cfa2..2003d69bd6d5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -422,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMT_DYNAMIC +#ifdef CONFIG_PREEMPT_DYNAMIC static_call(irqentry_exit_cond_resched)(); #else irqentry_exit_cond_resched(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c777627212f..c0c9aa5cd8e2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3545,7 +3545,11 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; - if (iter->fmt == static_fmt_buf) + /* + * iter->tr is NULL when used with tp_printk, which makes + * this get called where it is not safe to call krealloc(). + */ + if (!iter->tr || iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, @@ -3566,7 +3570,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) return fmt; p = fmt; @@ -9692,7 +9696,7 @@ void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = - kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dc971a68dda4..e57cc0870892 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -63,8 +63,10 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') - return -EINVAL; + if (event[0] == '\0') { + ret = -EINVAL; + goto out; + } mutex_lock(&event_mutex); for_each_dyn_event_safe(pos, n) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index af612945a4d0..9a4b980d695b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -106,6 +106,7 @@ int create_user_ns(struct cred *new) if (!ns) goto fail_dec; + ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; @@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map) return 0; } +/** + * verify_root_map() - check the uid 0 mapping + * @file: idmapping file + * @map_ns: user namespace of the target process + * @new_map: requested idmap + * + * If a process requests mapping parent uid 0 into the new ns, verify that the + * process writing the map had the CAP_SETFCAP capability as the target process + * will be able to write fscaps that are valid in ancestor user namespaces. + * + * Return: true if the mapping is allowed, false if not. + */ +static bool verify_root_map(const struct file *file, + struct user_namespace *map_ns, + struct uid_gid_map *new_map) +{ + int idx; + const struct user_namespace *file_ns = file->f_cred->user_ns; + struct uid_gid_extent *extent0 = NULL; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent0 = &new_map->extent[idx]; + else + extent0 = &new_map->forward[idx]; + if (extent0->lower_first == 0) + break; + + extent0 = NULL; + } + + if (!extent0) + return true; + + if (map_ns == file_ns) { + /* The process unshared its ns and is writing to its own + * /proc/self/uid_map. User already has full capabilites in + * the new namespace. Verify that the parent had CAP_SETFCAP + * when it unshared. + * */ + if (!file_ns->parent_could_setfcap) + return false; + } else { + /* Process p1 is writing to uid_map of p2, who is in a child + * user namespace to p1's. Verify that the opener of the map + * file has CAP_SETFCAP against the parent of the new map + * namespace */ + if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) + return false; + } + + return true; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; - struct user_namespace *ns = seq->private; + struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; @@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) + if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ @@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; @@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; + + if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) + return false; + /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ |