From d86ab9cff8b936aadde444d0e263a8db5ff0349b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs: CPU0 | CPU1 | | RT task scheduled | SCHED_CPUFREQ_RT is set | CPU1->last_update = now | freq transition to max | last_freq_update_time = now | more than TICK_NSEC nsecs | a small CFS wakes up | CPU0->last_update = now1 | delta_ns(CPU0) < TICK_NSEC* | CPU0's util is considered | delta_ns(CPU1) = | last_freq_update_time - | CPU1->last_update = 0 | < TICK_NSEC | CPU1 is still considered | CPU1->SCHED_CPUFREQ_RT is set | we stay at max (until CPU1 | exits from idle) | * delta_ns is actually negative as now1 > last_freq_update_time While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..622eed1b7658 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -245,11 +245,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -265,7 +264,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; continue; @@ -309,7 +308,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } -- cgit v1.3-14-g43fede From d1174416747d790d750742d0514915deeed93acf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:22:52 -0700 Subject: bpf: Track alignment of register values in the verifier. Currently if we add only constant values to pointers we can fully validate the alignment, and properly check if we need to reject the program on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS architectures. However, once an unknown value is introduced we only allow byte sized memory accesses which is too restrictive. Add logic to track the known minimum alignment of register values, and propagate this state into registers containing pointers. The most common paradigm that makes use of this new logic is computing the transport header using the IP header length field. For example: struct ethhdr *ep = skb->data; struct iphdr *iph = (struct iphdr *) (ep + 1); struct tcphdr *th; ... n = iph->ihl; th = ((void *)iph + (n * 4)); port = th->dest; The existing code will reject the load of th->dest because it cannot validate that the alignment is at least 2 once "n * 4" is added the the packet pointer. In the new code, the register holding "n * 4" will have a reg->min_align value of 4, because any value multiplied by 4 will be at least 4 byte aligned. (actually, the eBPF code emitted by the compiler in this case is most likely to use a shift left by 2, but the end result is identical) At the critical addition: th = ((void *)iph + (n * 4)); The register holding 'th' will start with reg->off value of 14. The pointer addition will transform that reg into something that looks like: reg->aux_off = 14 reg->aux_off_align = 4 Next, the verifier will look at the th->dest load, and it will see a load offset of 2, and first check: if (reg->aux_off_align % size) which will pass because aux_off_align is 4. reg_off will be computed: reg_off = reg->off; ... reg_off += reg->aux_off; plus we have off==2, and it will thus check: if ((NET_IP_ALIGN + reg_off + off) % size != 0) which evaluates to: if ((NET_IP_ALIGN + 14 + 2) % size != 0) On strict alignment architectures, NET_IP_ALIGN is 2, thus: if ((2 + 14 + 2) % size != 0) which passes. These pointer transformations and checks work regardless of whether the constant offset or the variable with known alignment is added first to the pointer register. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5efb4db44e1e..7c6a51924afc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,6 +40,9 @@ struct bpf_reg_state { */ s64 min_value; u64 max_value; + u32 min_align; + u32 aux_off; + u32 aux_off_align; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5b56c92f8e2..cc7b626fa447 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); + if (reg->min_align) + verbose(",min_align=%u", reg->min_align); + if (reg->aux_off) + verbose(",aux_off=%u", reg->aux_off); + if (reg->aux_off_align) + verbose(",aux_off_align=%u", reg->aux_off_align); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -466,6 +472,9 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[i].imm = 0; regs[i].min_value = BPF_REGISTER_MIN_RANGE; regs[i].max_value = BPF_REGISTER_MAX_RANGE; + regs[i].min_align = 0; + regs[i].aux_off = 0; + regs[i].aux_off_align = 0; } /* frame pointer */ @@ -492,6 +501,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].min_align = 0; } static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, @@ -779,17 +789,28 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, - int off, int size) + int off, int size, bool strict) { - if (reg->id && size != 1) { - verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); - return -EACCES; + int reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = reg->off; + if (reg->id) { + if (reg->aux_off_align % size) { + verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", + reg->aux_off_align, size); + return -EACCES; + } + reg_off += reg->aux_off; } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if ((NET_IP_ALIGN + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg->off, off, size); + NET_IP_ALIGN, reg_off, off, size); return -EACCES; } @@ -797,9 +818,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, } static int check_val_ptr_alignment(const struct bpf_reg_state *reg, - int size) + int size, bool strict) { - if (size != 1) { + if (strict && size != 1) { verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); return -EACCES; } @@ -810,13 +831,16 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, static int check_ptr_alignment(const struct bpf_reg_state *reg, int off, int size) { + bool strict = false; + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + strict = true; + switch (reg->type) { case PTR_TO_PACKET: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_pkt_ptr_alignment(reg, off, size); + return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE_ADJ: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_val_ptr_alignment(reg, size); + return check_val_ptr_alignment(reg, size, strict); default: if (off % size != 0) { verbose("misaligned access off %d size %d\n", @@ -883,6 +907,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; + state->regs[value_regno].aux_off = 0; + state->regs[value_regno].aux_off_align = 0; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -1455,6 +1481,8 @@ add_imm: */ dst_reg->off += imm; } else { + bool had_id; + if (src_reg->type == PTR_TO_PACKET) { /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ tmp_reg = *dst_reg; /* save r7 state */ @@ -1488,14 +1516,23 @@ add_imm: src_reg->imm); return -EACCES; } + + had_id = (dst_reg->id != 0); + /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range and off to zero */ + /* something was added to pkt_ptr, set range to zero */ + dst_reg->aux_off = dst_reg->off; dst_reg->off = 0; dst_reg->range = 0; + if (had_id) + dst_reg->aux_off_align = min(dst_reg->aux_off_align, + src_reg->min_align); + else + dst_reg->aux_off_align = src_reg->min_align; } return 0; } @@ -1669,6 +1706,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg) reg->min_value = BPF_REGISTER_MIN_RANGE; } +static u32 calc_align(u32 imm) +{ + if (!imm) + return 1U << 31; + return imm - ((imm - 1) & imm); +} + static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1676,8 +1720,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); + u32 dst_align, src_align; dst_reg = ®s[insn->dst_reg]; + src_align = 0; if (BPF_SRC(insn->code) == BPF_X) { check_reg_overflow(®s[insn->src_reg]); min_val = regs[insn->src_reg].min_value; @@ -1693,12 +1739,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, regs[insn->src_reg].type != UNKNOWN_VALUE) { min_val = BPF_REGISTER_MIN_RANGE; max_val = BPF_REGISTER_MAX_RANGE; + src_align = 0; + } else { + src_align = regs[insn->src_reg].min_align; } } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; + src_align = calc_align(insn->imm); } + dst_align = dst_reg->min_align; + /* We don't know anything about what was done to this register, mark it * as unknown. */ @@ -1723,18 +1775,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_value += min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value *= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_AND: /* Disallow AND'ing of negative numbers, ain't nobody got time @@ -1746,17 +1801,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, else dst_reg->min_value = 0; dst_reg->max_value = max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_LSH: /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - + dst_reg->min_align = 1; + } else { + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value <<= min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + dst_reg->min_align <<= min_val; + } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) @@ -1766,11 +1827,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* RSH by a negative number is undefined, and the BPF_RSH is an * unsigned shift, so make the appropriate casts. */ - if (min_val < 0 || dst_reg->min_value < 0) + if (min_val < 0 || dst_reg->min_value < 0) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; + } + if (min_val < 0) { + dst_reg->min_align = 1; + } else { + dst_reg->min_align >>= (u64) min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + } if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value >>= max_val; break; @@ -1872,6 +1941,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].imm = insn->imm; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; + regs[insn->dst_reg].min_align = calc_align(insn->imm); } } else if (opcode > BPF_END) { -- cgit v1.3-14-g43fede From c5fc9692d101d1318b0f53f9f691cd88ac029317 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:25:17 -0700 Subject: bpf: Do per-instruction state dumping in verifier when log_level > 1. If log_level > 1, do a state dump every instruction and emit it in a more compact way (without a leading newline). This will facilitate more sophisticated test cases which inspect the verifier log for register state. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc7b626fa447..ff2bfe1d656a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2926,8 +2926,12 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } - if (log_level && do_print_state) { - verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + if (log_level > 1 || (log_level && do_print_state)) { + if (log_level > 1) + verbose("%d:", insn_idx); + else + verbose("\nfrom %d to %d:", + prev_insn_idx, insn_idx); print_verifier_state(&env->cur_state); do_print_state = false; } -- cgit v1.3-14-g43fede From e07b98d9bffe410019dfcf62c3428d4a96c56a2c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:38:07 -0700 Subject: bpf: Add strict alignment flag for BPF_PROG_LOAD. Add a new field, "prog_flags", and an initial flag value BPF_F_STRICT_ALIGNMENT. When set, the verifier will enforce strict pointer alignment regardless of the setting of CONFIG_EFFICIENT_UNALIGNED_ACCESS. The verifier, in this mode, will also use a fixed value of "2" in place of NET_IP_ALIGN. This facilitates test cases that will exercise and validate this part of the verifier even when run on architectures where alignment doesn't matter. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 5 ++++- kernel/bpf/verifier.c | 23 +++++++++++++++++------ tools/build/feature/test-bpf.c | 1 + tools/include/uapi/linux/bpf.h | 11 +++++++++-- 6 files changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7c6a51924afc..d5093b52b485 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -90,6 +90,7 @@ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ + bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 945a1f5f63c5..94dfa9def355 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,13 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -177,6 +184,7 @@ union bpf_attr { __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..265a0d854e33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD kern_version +#define BPF_PROG_LOAD_LAST_FIELD prog_flags static int bpf_prog_load(union bpf_attr *attr) { @@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + return -EINVAL; + /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff2bfe1d656a..e74fb1b87855 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -791,6 +791,7 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { + int ip_align; int reg_off; /* Byte size accesses are always allowed. */ @@ -807,10 +808,14 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg_off + off) % size != 0) { + /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking + * we force this to 2 which is universally what architectures use + * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + */ + ip_align = strict ? 2 : NET_IP_ALIGN; + if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg_off, off, size); + ip_align, reg_off, off, size); return -EACCES; } @@ -828,10 +833,11 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_ptr_alignment(const struct bpf_reg_state *reg, +static int check_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size) { - bool strict = false; + bool strict = env->strict_alignment; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) strict = true; @@ -873,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (size < 0) return size; - err = check_ptr_alignment(reg, off, size); + err = check_ptr_alignment(env, reg, off, size); if (err) return err; @@ -3568,6 +3574,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } + if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + env->strict_alignment = true; + else + env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3673,6 +3683,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index ebc6dceddb58..7598361ef1f1 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -29,6 +29,7 @@ int main(void) attr.log_size = 0; attr.log_level = 0; attr.kern_version = 0; + attr.prog_flags = 0; /* * Test existence of __NR_bpf and BPF_PROG_LOAD. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e553529929f6..94dfa9def355 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -132,6 +132,13 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -177,6 +184,7 @@ union bpf_attr { __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -481,8 +489,7 @@ union bpf_attr { * u32 bpf_get_socket_uid(skb) * Get the owner uid of the socket stored inside sk_buff. * @skb: pointer to skb - * Return: uid of the socket owner on success or 0 if the socket pointer - * inside sk_buff is NULL + * Return: uid of the socket owner on success or overflowuid if failed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.3-14-g43fede From 6832a333ed4a7cc4fcb170c045d1d96d0976fdd4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 11 May 2017 19:30:02 -0700 Subject: bpf: Handle multiple variable additions into packet pointers in verifier. We must accumulate into reg->aux_off rather than use a plain assignment. Add a test for this situation to test_align. Reported-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_align.c | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e74fb1b87855..39f2dcbc4cbc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1531,7 +1531,7 @@ add_imm: dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->aux_off = dst_reg->off; + dst_reg->aux_off += dst_reg->off; dst_reg->off = 0; dst_reg->range = 0; if (had_id) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index ed242552e492..9644d4e069de 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -273,6 +273,20 @@ static struct bpf_align_test tests[] = { BPF_EXIT_INSN(), BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + /* Test multiple accumulations of unknown values + * into a packet pointer. + */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -314,6 +328,29 @@ static struct bpf_align_test tests[] = { * requirements. */ "23: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=pkt(id=2,off=18,r=18),aux_off_align=4 R5=pkt(id=2,off=14,r=18),aux_off_align=4 R6=inv54,min_align=4 R10=fp", + + /* Constant offset is added to R5 packet pointer, + * resulting in reg->off value of 14. + */ + "26: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv,aux_off_align=4 R5=pkt(id=0,off=14,r=8) R6=inv54,min_align=4 R10=fp", + /* Variable offset is added to R5, resulting in an + * auxiliary offset of 14, and an auxiliary alignment of 4. + */ + "27: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv,aux_off_align=4 R5=pkt(id=3,off=0,r=0),aux_off=14,aux_off_align=4 R6=inv54,min_align=4 R10=fp", + /* Constant is added to R5 again, setting reg->off to 4. */ + "28: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv,aux_off_align=4 R5=pkt(id=3,off=4,r=0),aux_off=14,aux_off_align=4 R6=inv54,min_align=4 R10=fp", + /* And once more we add a variable, which causes an accumulation + * of reg->off into reg->aux_off_align, with resulting value of + * 18. The auxiliary alignment stays at 4. + */ + "29: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv,aux_off_align=4 R5=pkt(id=4,off=0,r=0),aux_off=18,aux_off_align=4 R6=inv54,min_align=4 R10=fp", + /* At the time the word size load is performed from R5, + * it's total offset is NET_IP_ALIGN + reg->off (0) + + * reg->aux_off (18) which is 20. Then the variable offset + * is considered using reg->aux_off_align which is 4 and meets + * the load's requirements. + */ + "33: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=pkt(id=4,off=4,r=4),aux_off=18,aux_off_align=4 R5=pkt(id=4,off=0,r=4),aux_off=18,aux_off_align=4 R6=inv54,min_align=4 R10=fp", }, }, }; -- cgit v1.3-14-g43fede From b9a985db98961ae1ba0be169f19df1c567e4ffe0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 11 May 2017 18:21:01 -0500 Subject: pid_ns: Sleep in TASK_INTERRUPTIBLE in zap_pid_ns_processes The code can potentially sleep for an indefinite amount of time in zap_pid_ns_processes triggering the hung task timeout, and increasing the system average. This is undesirable. Sleep with a task state of TASK_INTERRUPTIBLE instead of TASK_UNINTERRUPTIBLE to remove these undesirable side effects. Apparently under heavy load this has been allowing Chrome to trigger the hung time task timeout error and cause ChromeOS to reboot. Reported-by: Vovo Yang Reported-by: Guenter Roeck Tested-by: Guenter Roeck Fixes: 6347e9009104 ("pidns: guarantee that the pidns init will be the last pidns process reaped") Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d1f3e9f558b8..74a5a7255b4d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * if reparented. */ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->nr_hashed == init_pids) break; schedule(); -- cgit v1.3-14-g43fede From 3fd37226216620c1a468afa999739d5016fbc349 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 May 2017 19:11:31 +0300 Subject: pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes() Imagine we have a pid namespace and a task from its parent's pid_ns, which made setns() to the pid namespace. The task is doing fork(), while the pid namespace's child reaper is dying. We have the race between them: Task from parent pid_ns Child reaper copy_process() .. alloc_pid() .. .. zap_pid_ns_processes() .. disable_pid_allocation() .. read_lock(&tasklist_lock) .. iterate over pids in pid_ns .. kill tasks linked to pids .. read_unlock(&tasklist_lock) write_lock_irq(&tasklist_lock); .. attach_pid(p, PIDTYPE_PID); .. .. .. So, just created task p won't receive SIGKILL signal, and the pid namespace will be in contradictory state. Only manual kill will help there, but does the userspace care about this? I suppose, the most users just inject a task into a pid namespace and wait a SIGCHLD from it. The patch fixes the problem. It simply checks for (pid_ns->nr_hashed & PIDNS_HASH_ADDING) in copy_process(). We do it under the tasklist_lock, and can't skip PIDNS_HASH_ADDING as noted by Oleg: "zap_pid_ns_processes() does disable_pid_allocation() and then takes tasklist_lock to kill the whole namespace. Given that copy_process() checks PIDNS_HASH_ADDING under write_lock(tasklist) they can't race; if copy_process() takes this lock first, the new child will be killed, otherwise copy_process() can't miss the change in ->nr_hashed." If allocation is disabled, we just return -ENOMEM like it's made for such cases in alloc_pid(). v2: Do not move disable_pid_allocation(), do not introduce a new variable in copy_process() and simplify the patch as suggested by Oleg Nesterov. Account the problem with double irq enabling found by Eric W. Biederman. Fixes: c876ad768215 ("pidns: Stop pid allocation when init dies") Signed-off-by: Kirill Tkhai CC: Andrew Morton CC: Ingo Molnar CC: Peter Zijlstra CC: Oleg Nesterov CC: Mike Rapoport CC: Michal Hocko CC: Andy Lutomirski CC: "Eric W. Biederman" CC: Andrei Vagin CC: Cyrill Gorcunov CC: Serge Hallyn Cc: stable@vger.kernel.org Acked-by: Oleg Nesterov Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 06d759ab4c62..aa1076c5e4a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1845,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process( */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1907,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); bad_fork_free_pid: cgroup_threadgroup_change_end(current); -- cgit v1.3-14-g43fede From 0bae5fd3330be0517fba697e6b228601d421fade Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Thu, 11 May 2017 10:31:24 +0530 Subject: PM / hibernate: Declare variables as static Fixing sparse warnings: 'symbol not declared. Should it be static?' Signed-off-by: Pushkar Jambhlekar Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a..a628cccafa4a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1422,7 +1422,7 @@ static unsigned int nr_meta_pages; * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) -- cgit v1.3-14-g43fede From 8663effb24f9430394d3bf1ed2dac42a771421d1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 14 Apr 2017 08:48:09 -0400 Subject: sched/core: Call __schedule() from do_idle() without enabling preemption I finally got around to creating trampolines for dynamically allocated ftrace_ops with using synchronize_rcu_tasks(). For users of the ftrace function hook callbacks, like perf, that allocate the ftrace_ops descriptor via kmalloc() and friends, ftrace was not able to optimize the functions being traced to use a trampoline because they would also need to be allocated dynamically. The problem is that they cannot be freed when CONFIG_PREEMPT is set, as there's no way to tell if a task was preempted on the trampoline. That was before Paul McKenney implemented synchronize_rcu_tasks() that would make sure all tasks (except idle) have scheduled out or have entered user space. While testing this, I triggered this bug: BUG: unable to handle kernel paging request at ffffffffa0230077 ... RIP: 0010:0xffffffffa0230077 ... Call Trace: schedule+0x5/0xe0 schedule_preempt_disabled+0x18/0x30 do_idle+0x172/0x220 What happened was that the idle task was preempted on the trampoline. As synchronize_rcu_tasks() ignores the idle thread, there's nothing that lets ftrace know that the idle task was preempted on a trampoline. The idle task shouldn't need to ever enable preemption. The idle task is simply a loop that calls schedule or places the cpu into idle mode. In fact, having preemption enabled is inefficient, because it can happen when idle is just about to call schedule anyway, which would cause schedule to be called twice. Once for when the interrupt came in and was returning back to normal context, and then again in the normal path that the idle loop is running in, which would be pointless, as it had already scheduled. The only reason schedule_preempt_disable() enables preemption is to be able to call sched_submit_work(), which requires preemption enabled. As this is a nop when the task is in the RUNNING state, and idle is always in the running state, there's no reason that idle needs to enable preemption. But that means it cannot use schedule_preempt_disable() as other callers of that function require calling sched_submit_work(). Adding a new function local to kernel/sched/ that allows idle to call the scheduler without enabling preemption, fixes the synchronize_rcu_tasks() issue, as well as removes the pointless spurious schedule calls caused by interrupts happening in the brief window where preemption is enabled just before it calls schedule. Reviewed: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170414084809.3dacde2a@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 25 +++++++++++++++++++++++++ kernel/sched/idle.c | 2 +- kernel/sched/sched.h | 2 ++ 3 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759f4bd52cd6..803c3bc274c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/* + * synchronize_rcu_tasks() makes sure that no task is stuck in preempted + * state (have scheduled out non-voluntarily) by making sure that all + * tasks have either left the run queue or have gone into user space. + * As idle tasks do not do either, they must not ever be preempted + * (schedule out non-voluntarily). + * + * schedule_idle() is similar to schedule_preempt_disable() except that it + * never enables preemption because it does not call sched_submit_work(). + */ +void __sched schedule_idle(void) +{ + /* + * As this skips calling sched_submit_work(), which the idle task does + * regardless because that function is a nop when the task is in a + * TASK_RUNNING state, make sure this isn't used someplace that the + * current task can be in any other state. Note, idle is always in the + * TASK_RUNNING state. + */ + WARN_ON_ONCE(current->state); + do { + __schedule(false); + } while (need_resched()); +} + #ifdef CONFIG_CONTEXT_TRACKING asmlinkage __visible void __sched schedule_user(void) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2a25a9ec2c6e..ef63adce0c9c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -265,7 +265,7 @@ static void do_idle(void) smp_mb__after_atomic(); sched_ttwu_pending(); - schedule_preempt_disabled(); + schedule_idle(); if (unlikely(klp_patch_pending(current))) klp_update_patch_state(current); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7808ab050599..6dda2aab731e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1467,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) } #endif +extern void schedule_idle(void); + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -- cgit v1.3-14-g43fede From 2c4569ca26986d18243f282dd727da27e9adae4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 May 2017 13:54:11 +0200 Subject: genirq: Fix chained interrupt data ordering irq_set_chained_handler_and_data() sets up the chained interrupt and then stores the handler data. That's racy against an immediate interrupt which gets handled before the store of the handler data happened. The handler will dereference a NULL pointer and crash. Cure it by storing handler data before installing the chained handler. Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..c94da688ee9b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -880,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, if (!desc) return; - __irq_do_set_handler(desc, handle, 1, NULL); desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } -- cgit v1.3-14-g43fede From b9ef0326c05a008c3c576bd4d676208b50c344d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 May 2017 11:14:35 -0400 Subject: tracing: Move postpone selftests to core from early_initcall I hit the following lockdep splat when booting with ftrace selftests enabled, as well as CONFIG_PREEMPT and LOCKDEP. Testing dynamic ftrace ops #1: (1 0 1 0 0) (1 1 2 0 0) (2 1 3 0 169) (2 2 4 0 50066) ------------[ cut here ]------------ WARNING: CPU: 0 PID: 13 at kernel/rcu/srcutree.c:202 check_init_srcu_struct+0x60/0x70 Modules linked in: CPU: 0 PID: 13 Comm: rcu_tasks_kthre Not tainted 4.12.0-rc1-test+ #587 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 task: ffff880119628040 task.stack: ffffc900006a4000 RIP: 0010:check_init_srcu_struct+0x60/0x70 RSP: 0000:ffffc900006a7d98 EFLAGS: 00010246 RAX: 0000000000000246 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff880119628040 RSI: 00000000ffffffff RDI: ffffffff81e5fb40 RBP: ffffc900006a7e20 R08: 00000023b403c000 R09: 0000000000000001 R10: ffffc900006a7e40 R11: 0000000000000000 R12: ffffffff81e5fb40 R13: 0000000000000286 R14: ffff880119628040 R15: ffffc900006a7e98 FS: 0000000000000000(0000) GS:ffff88011ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88011edff000 CR3: 0000000001e0f000 CR4: 00000000001406f0 Call Trace: ? __synchronize_srcu+0x6e/0x140 ? lock_acquire+0xdc/0x1d0 ? ktime_get_mono_fast_ns+0x5d/0xb0 synchronize_srcu+0x6f/0x110 ? synchronize_srcu+0x6f/0x110 rcu_tasks_kthread+0x20a/0x540 kthread+0x114/0x150 ? __rcu_read_unlock+0x70/0x70 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 Code: f6 83 70 06 00 00 03 49 89 c5 74 0d be 01 00 00 00 48 89 df e8 42 fa ff ff 4c 89 ee 4c 89 e7 e8 b7 42 75 00 5b 41 5c 41 5d 5d c3 <0f> ff eb aa 66 90 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 ---[ end trace 5c3f4206ce50f6ac ]--- What happens is that the selftests include a creating of a dynamically allocated ftrace_ops, which requires the use of synchronize_rcu_tasks() which uses srcu, and triggers the above warning. It appears that synchronize_rcu_tasks() is not set up at early_initcall(), but it is at core_initcall(). By moving the tests down to that location works out properly. Link: http://lkml.kernel.org/r/20170517111435.7388c033@gandalf.local.home Acked-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4536c449021..cdf97ce8cff2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1558,7 +1558,7 @@ static __init int init_trace_selftests(void) return 0; } -early_initcall(init_trace_selftests); +core_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { -- cgit v1.3-14-g43fede From 30e7d894c1478c88d50ce94ddcdbd7f9763d9cdd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 May 2017 10:19:49 +0200 Subject: tracing/kprobes: Enforce kprobes teardown after testing Enabling the tracer selftest triggers occasionally the warning in text_poke(), which warns when the to be modified page is not marked reserved. The reason is that the tracer selftest installs kprobes on functions marked __init for testing. These probes are removed after the tests, but that removal schedules the delayed kprobes_optimizer work, which will do the actual text poke. If the work is executed after the init text is freed, then the warning triggers. The bug can be reproduced reliably when the work delay is increased. Flush the optimizer work and wait for the optimizing/unoptimizing lists to become empty before returning from the kprobes tracer selftest. That ensures that all operations which were queued due to the probes removal have completed. Link: http://lkml.kernel.org/r/20170516094802.76a468bb@gandalf.local.home Signed-off-by: Thomas Gleixner Acked-by: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 6274de498 ("kprobes: Support delayed unoptimizing") Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 3 +++ kernel/kprobes.c | 2 +- kernel/trace/trace_kprobe.c | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 30f90c1a0aaf..541df0b5b815 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -349,6 +349,9 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +extern void wait_for_kprobe_optimizer(void); +#else +static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7367e0ec6f81..199243bba554 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -595,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8485f6738a87..c129fca6ec99 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1535,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else -- cgit v1.3-14-g43fede From cbab567c3dc7d6f443b4c84eab76e8967d5c1dee Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 16 May 2017 23:21:25 +0530 Subject: ftrace: Simplify glob handling in unregister_ftrace_function_probe_func() Handle a NULL glob properly and simplify the check. Link: http://lkml.kernel.org/r/5df74d4ffb4721db6d5a22fa08ca031d62ead493.1494956770.git.naveen.n.rao@linux.vnet.ibm.com Reviewed-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39dca4e86a94..c35c3e67d09a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4144,9 +4144,9 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, int i, ret = -ENODEV; int size; - if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) + if (!glob || !strlen(glob) || !strcmp(glob, "*")) func_g.search = NULL; - else if (glob) { + else { int not; func_g.type = filter_parse_regex(glob, strlen(glob), -- cgit v1.3-14-g43fede From a0e6369e4bac8844825ae1a66ccd122b290dcc86 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 16 May 2017 23:21:26 +0530 Subject: ftrace/instances: Clear function triggers when removing instances If instance directories are deleted while there are registered function triggers: # cd /sys/kernel/debug/tracing/instances # mkdir test # echo "schedule:enable_event:sched:sched_switch" > test/set_ftrace_filter # rmdir test Unable to handle kernel paging request for data at address 0x00000008 Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc0000000021edde8 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA pSeries Modules linked in: iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter fuse binfmt_misc pseries_rng rng_core vmx_crypto ib_iser rdma_cm iw_cm ib_cm ib_core libiscsi scsi_transport_iscsi ip_tables x_tables autofs4 btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c multipath virtio_net virtio_blk virtio_pci crc32c_vpmsum virtio_ring virtio CPU: 8 PID: 8694 Comm: rmdir Not tainted 4.11.0-nnr+ #113 task: c0000000bab52800 task.stack: c0000000baba0000 NIP: c0000000021edde8 LR: c0000000021f0590 CTR: c000000002119620 REGS: c0000000baba3870 TRAP: 0300 Not tainted (4.11.0-nnr+) MSR: 8000000000009033 CR: 22002422 XER: 20000000 CFAR: 00007fffabb725a8 DAR: 0000000000000008 DSISR: 40000000 SOFTE: 0 GPR00: c00000000220f750 c0000000baba3af0 c000000003157e00 0000000000000000 GPR04: 0000000000000040 00000000000000eb 0000000000000040 0000000000000000 GPR08: 0000000000000000 0000000000000113 0000000000000000 c00000000305db98 GPR12: c000000002119620 c00000000fd42c00 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 c0000000bab52e90 0000000000000000 GPR24: 0000000000000000 00000000000000eb 0000000000000040 c0000000baba3bb0 GPR28: c00000009cb06eb0 c0000000bab52800 c00000009cb06eb0 c0000000baba3bb0 NIP [c0000000021edde8] ring_buffer_lock_reserve+0x8/0x4e0 LR [c0000000021f0590] trace_event_buffer_lock_reserve+0xe0/0x1a0 Call Trace: [c0000000baba3af0] [c0000000021f96c8] trace_event_buffer_commit+0x1b8/0x280 (unreliable) [c0000000baba3b60] [c00000000220f750] trace_event_buffer_reserve+0x80/0xd0 [c0000000baba3b90] [c0000000021196b8] trace_event_raw_event_sched_switch+0x98/0x180 [c0000000baba3c10] [c0000000029d9980] __schedule+0x6e0/0xab0 [c0000000baba3ce0] [c000000002122230] do_task_dead+0x70/0xc0 [c0000000baba3d10] [c0000000020ea9c8] do_exit+0x828/0xd00 [c0000000baba3dd0] [c0000000020eaf70] do_group_exit+0x60/0x100 [c0000000baba3e10] [c0000000020eb034] SyS_exit_group+0x24/0x30 [c0000000baba3e30] [c00000000200bcec] system_call+0x38/0x54 Instruction dump: 60000000 60420000 7d244b78 7f63db78 4bffaa09 393efff8 793e0020 39200000 4bfffecc 60420000 3c4c00f7 3842a020 <81230008> 2f890000 409e02f0 a14d0008 ---[ end trace b917b8985d0e650b ]--- Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc0000000021edde8 Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc0000000021edde8 Faulting instruction address: 0xc0000000021edde8 To address this, let's clear all registered function probes before deleting the ftrace instance. Link: http://lkml.kernel.org/r/c5f1ca624043690bd94642bb6bffd3f2fc504035.1494956770.git.naveen.n.rao@linux.vnet.ibm.com Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++++++ kernel/trace/trace.c | 3 +++ kernel/trace/trace.h | 1 + 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c35c3e67d09a..74fdfe9ed3db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4256,6 +4256,14 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, return ret; } +void clear_ftrace_function_probes(struct trace_array *tr) +{ + struct ftrace_func_probe *probe, *n; + + list_for_each_entry_safe(probe, n, &tr->func_probes, list) + unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cdf97ce8cff2..664c44a6d48f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7550,6 +7550,9 @@ static int instance_rmdir(const char *name) } tracing_set_nop(tr); +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + clear_ftrace_function_probes(tr); +#endif event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 291a1bca5748..98e0845f7235 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -980,6 +980,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, extern int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops); +extern void clear_ftrace_function_probes(struct trace_array *tr); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); -- cgit v1.3-14-g43fede From 8a49f3e03c8ac52fe1b706fffb13142295fa0c47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 May 2017 21:53:32 -0400 Subject: ftrace: Remove #ifdef from code and add clear_ftrace_function_probes() stub No need to add ugly #ifdefs in the code. Having a standard stub file is much prettier. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 664c44a6d48f..fcc9a2d774c3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7550,9 +7550,7 @@ static int instance_rmdir(const char *name) } tracing_set_nop(tr); -#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) clear_ftrace_function_probes(tr); -#endif event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 98e0845f7235..39fd77330aab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -999,6 +999,10 @@ static inline __init int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } +static inline void clear_ftrace_function_probes(struct trace_array *tr) +{ +} + /* * The ops parameter passed in is usually undefined. * This must be a macro. -- cgit v1.3-14-g43fede From 545a028190dae4437aac4f86da7c8ab20857647c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 May 2017 14:58:35 -0400 Subject: kprobes: Document how optimized kprobes are removed from module unload Thomas discovered a bug where the kprobe trace tests had a race condition where the kprobe_optimizer called from a delayed work queue that does the optimizing and "unoptimizing" of a kprobe, can try to modify the text after it has been freed by the init code. The kprobe trace selftest is a special case, and Thomas and myself investigated to see if there's a chance that this could also be a bug with module unloading, as the code is not obvious to how it handles this. After adding lots of printks, I figured it out. Thomas suggested that this should be commented so that others will not have to go through this exercise again. Link: http://lkml.kernel.org/r/20170516145835.3827d3aa@gandalf.local.home Acked-by: Masami Hiramatsu Suggested-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 199243bba554..2d2d3a568e4e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2183,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb, * The vaddr this probe is installed will soon * be vfreed buy not synced to disk. Hence, * disarming the breakpoint isn't needed. + * + * Note, this will also move any optimized probes + * that are pending to be removed from their + * corresponding lists to the freeing_list and + * will not be touched by the delayed + * kprobe_optimizer work handler. */ kill_kprobe(p); } -- cgit v1.3-14-g43fede From 3c2ce60bdd3d57051bf85615deec04a694473840 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 May 2017 03:00:06 +0200 Subject: bpf: adjust verifier heuristics Current limits with regards to processing program paths do not really reflect today's needs anymore due to programs becoming more complex and verifier smarter, keeping track of more data such as const ALU operations, alignment tracking, spilling of PTR_TO_MAP_VALUE_ADJ registers, and other features allowing for smarter matching of what LLVM generates. This also comes with the side-effect that we result in fewer opportunities to prune search states and thus often need to do more work to prove safety than in the past due to different register states and stack layout where we mismatch. Generally, it's quite hard to determine what caused a sudden increase in complexity, it could be caused by something as trivial as a single branch somewhere at the beginning of the program where LLVM assigned a stack slot that is marked differently throughout other branches and thus causing a mismatch, where verifier then needs to prove safety for the whole rest of the program. Subsequently, programs with even less than half the insn size limit can get rejected. We noticed that while some programs load fine under pre 4.11, they get rejected due to hitting limits on more recent kernels. We saw that in the vast majority of cases (90+%) pruning failed due to register mismatches. In case of stack mismatches, majority of cases failed due to different stack slot types (invalid, spill, misc) rather than differences in spilled registers. This patch makes pruning more aggressive by also adding markers that sit at conditional jumps as well. Currently, we only mark jump targets for pruning. For example in direct packet access, these are usually error paths where we bail out. We found that adding these markers, it can reduce number of processed insns by up to 30%. Another option is to ignore reg->id in probing PTR_TO_MAP_VALUE_OR_NULL registers, which can help pruning slightly as well by up to 7% observed complexity reduction as stand-alone. Meaning, if a previous path with register type PTR_TO_MAP_VALUE_OR_NULL for map X was found to be safe, then in the current state a PTR_TO_MAP_VALUE_OR_NULL register for the same map X must be safe as well. Last but not least the patch also adds a scheduling point and bumps the current limit for instructions to be processed to a more adequate value. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39f2dcbc4cbc..1eddb713b815 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_INSNS 98304 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) @@ -2640,6 +2640,7 @@ peek_stack: env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ + env->explored_states[t] = STATE_LIST_MARK; ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -2798,6 +2799,12 @@ static bool states_equal(struct bpf_verifier_env *env, rcur->type != NOT_INIT)) continue; + /* Don't care about the reg->id in this case. */ + if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && + rcur->type == PTR_TO_MAP_VALUE_OR_NULL && + rold->map_ptr == rcur->map_ptr) + continue; + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && compare_ptrs_to_packet(rold, rcur)) continue; @@ -2932,6 +2939,9 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } + if (need_resched()) + cond_resched(); + if (log_level > 1 || (log_level && do_print_state)) { if (log_level > 1) verbose("%d:", insn_idx); -- cgit v1.3-14-g43fede From a33d7d94eed92b23fbbc7b0de06a41b2bbaa49e3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 12 May 2017 13:15:45 -0400 Subject: tracing: Make sure RCU is watching before calling a stack trace As stack tracing now requires "rcu watching", force RCU to be watching when recording a stack trace. Link: http://lkml.kernel.org/r/20170512172449.879684501@goodmis.org Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fcc9a2d774c3..1122f151466f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2568,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + if (rcu_is_watching()) { + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + return; + } + + /* + * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * but if the above rcu_is_watching() failed, then the NMI + * triggered someplace critical, and rcu_irq_enter() should + * not be called from NMI. + */ + if (unlikely(in_nmi())) + return; + + /* + * It is possible that a function is being traced in a + * location that RCU is not watching. A call to + * rcu_irq_enter() will make sure that it is, but there's + * a few internal rcu functions that could be traced + * where that wont work either. In those cases, we just + * do nothing. + */ + if (unlikely(rcu_irq_enter_disabled())) + return; + + rcu_irq_enter_irqson(); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + rcu_irq_exit_irqson(); } /** -- cgit v1.3-14-g43fede From 5f3394530fbe90d3bcd1c204618960bc50236578 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 19 May 2017 08:04:59 -0700 Subject: blktrace: fix integer parse sscanf is a very poor way to parse integer. For example, I input "discard" for act_mask, it gets 0xd and completely messes up. Using correct API to do integer parse. This patch also makes attributes accept any base of integer. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bd8ae8d5ae9c..193c5f5e3f79 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1662,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { + if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } - } else if (sscanf(buf, "%llu", &value) != 1) + } else if (kstrtoull(buf, 0, &value)) goto out; ret = -ENXIO; -- cgit v1.3-14-g43fede From e4eda884db7930cee434828759064b4711604078 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 22 May 2017 12:27:07 -0400 Subject: net: Make IP alignment calulations clearer. The assignmnet: ip_align = strict ? 2 : NET_IP_ALIGN; in compare_pkt_ptr_alignment() trips up Coverity because we can only get to this code when strict is true, therefore ip_align will always be 2 regardless of NET_IP_ALIGN's value. So just assign directly to '2' and explain the situation in the comment above. Reported-by: "Gustavo A. R. Silva" Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1eddb713b815..c72cd41f5b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -808,11 +808,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking - * we force this to 2 which is universally what architectures use - * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'. */ - ip_align = strict ? 2 : NET_IP_ALIGN; + ip_align = 2; if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", ip_align, reg_off, off, size); -- cgit v1.3-14-g43fede From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 7 +++++-- kernel/ptrace.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 422bc2e4cb6a..ef3eb8bbfee4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,7 +54,8 @@ extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, - struct task_struct *new_parent); + struct task_struct *new_parent, + const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 @@ -206,7 +207,7 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; - __ptrace_link(child, current->parent); + __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); @@ -215,6 +216,8 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) set_tsk_thread_flag(child, TIF_SIGPENDING); } + else + child->ptracer_cred = NULL; } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); -- cgit v1.3-14-g43fede From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:05 +0200 Subject: bpf: fix incorrect pruning decision when alignment must be tracked Currently, when we enforce alignment tracking on direct packet access, the verifier lets the following program pass despite doing a packet write with unaligned access: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (bf) r0 = r2 4: (07) r0 += 14 5: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 6: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 7: (63) *(u32 *)(r0 -4) = r0 8: (b7) r0 = 0 9: (95) exit from 6 to 8: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 8: (b7) r0 = 0 9: (95) exit from 5 to 10: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R10=fp 10: (07) r0 += 1 11: (05) goto pc-6 6: safe <----- here, wrongly found safe processed 15 insns However, if we enforce a pruning mismatch by adding state into r8 which is then being mismatched in states_equal(), we find that for the otherwise same program, the verifier detects a misaligned packet access when actually walking that path: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (b7) r8 = 1 4: (bf) r0 = r2 5: (07) r0 += 14 6: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 9: (b7) r0 = 0 10: (95) exit from 7 to 9: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 9: (b7) r0 = 0 10: (95) exit from 6 to 11: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 11: (07) r0 += 1 12: (b7) r8 = 0 13: (05) goto pc-7 <----- mismatch due to r8 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15) R3=pkt_end R7=inv,min_value=2 R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 misaligned packet access off 2+15+-4 size 4 The reason why we fail to see it in states_equal() is that the third test in compare_ptrs_to_packet() ... if (old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; ... will let the above pass. The situation we run into is that old->off <= cur->off (14 <= 15), meaning that prior walked paths went with smaller offset, which was later used in the packet access after successful packet range check and found to be safe already. For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14 as in above program to it, results in R0=pkt(id=0,off=14,r=0) before the packet range test. Now, testing this against R3=pkt_end with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14) for the case when we're within bounds. A write into the packet at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and aligned (2 is for NET_IP_ALIGN). After processing this with all fall-through paths, we later on check paths from branches. When the above skb->mark test is true, then we jump near the end of the program, perform r0 += 1, and jump back to the 'if r0 > r3 goto out' test we've visited earlier already. This time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune that part because this time we'll have a larger safe packet range, and we already found that with off=14 all further insn were already safe, so it's safe as well with a larger off. However, the problem is that the subsequent write into the packet with 2 + 15 -4 is then unaligned, and not caught by the alignment tracking. Note that min_align, aux_off, and aux_off_align were all 0 in this example. Since we cannot tell at this time what kind of packet access was performed in the prior walk and what minimal requirements it has (we might do so in the future, but that requires more complexity), fix it to disable this pruning case for strict alignment for now, and let the verifier do check such paths instead. With that applied, the test cases pass and reject the program due to misalignment. Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.") Reference: http://patchwork.ozlabs.org/patch/761909/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72cd41f5b8b..e37e06b1229d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, { bool strict = env->strict_alignment; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - strict = true; - switch (reg->type) { case PTR_TO_PACKET: return check_pkt_ptr_alignment(reg, off, size, strict); @@ -2696,7 +2693,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } - if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - else - env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), -- cgit v1.3-14-g43fede From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:06 +0200 Subject: bpf: properly reset caller saved regs after helper call and ld_abs/ind Currently, after performing helper calls, we clear all caller saved registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto specification. The way we reset these regs can affect pruning decisions in later paths, since we only reset register's imm to 0 and type to NOT_INIT. However, we leave out clearing of other variables such as id, min_value, max_value, etc, which can later on lead to pruning mismatches due to stale data. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e37e06b1229d..339c8a1371de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(®s[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - regs[i].min_align = 0; - regs[i].aux_off = 0; - regs[i].aux_off_align = 0; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet -- cgit v1.3-14-g43fede From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:08 +0200 Subject: bpf: fix wrong exposure of map_flags into fdinfo for lpm trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via attr->map_flags, since it does not support preallocation yet. We check the flag, but we never copy the flag into trie->map.map_flags, which is later on exposed into fdinfo and used by loaders such as iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test whether a pinned map has the same spec as the one from the BPF obj file and if not, bails out, which is currently the case for lpm since it exposes always 0 as flags. Also copy over flags in array_map_alloc() and stack_map_alloc(). They always have to be 0 right now, but we should make sure to not miss to copy them over at a later point in time when we add actual flags for them to use. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Jarno Rajahalme Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/stackmap.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; -- cgit v1.3-14-g43fede