aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks3
-rw-r--r--kernel/bpf/arraymap.c23
-rw-r--r--kernel/bpf/btf.c149
-rw-r--r--kernel/bpf/cgroup.c5
-rw-r--r--kernel/bpf/core.c315
-rw-r--r--kernel/bpf/disasm.c34
-rw-r--r--kernel/bpf/hashtab.c67
-rw-r--r--kernel/bpf/helpers.c96
-rw-r--r--kernel/bpf/local_storage.c16
-rw-r--r--kernel/bpf/lpm_trie.c1
-rw-r--r--kernel/bpf/map_in_map.c6
-rw-r--r--kernel/bpf/offload.c45
-rw-r--r--kernel/bpf/percpu_freelist.c41
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/stackmap.c8
-rw-r--r--kernel/bpf/syscall.c102
-rw-r--r--kernel/bpf/verifier.c973
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c58
-rw-r--r--kernel/cgroup/cgroup.c27
-rw-r--r--kernel/events/core.c16
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/futex.c32
-rw-r--r--kernel/irq/affinity.c121
-rw-r--r--kernel/irq/chip.c66
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/irqdesc.c42
-rw-r--r--kernel/irq/irqdomain.c16
-rw-r--r--kernel/irq/manage.c406
-rw-r--r--kernel/kthread.c8
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/rcu/Kconfig30
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched/core.c28
-rw-r--r--kernel/sched/psi.c2
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c66
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/time/Kconfig29
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/tick-broadcast.c1
-rw-r--r--kernel/time/timekeeping_debug.c11
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/trace/bpf_trace.c14
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace_kprobe.c10
-rw-r--r--kernel/trace/trace_probe_tmpl.h6
51 files changed, 2388 insertions, 616 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e299..fbba478ae522 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
def_bool y if ARCH_USE_QUEUED_SPINLOCKS
depends on SMP
+config BPF_ARCH_SPINLOCK
+ bool
+
config ARCH_USE_QUEUED_RWLOCKS
bool
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..c72e0d8e1e65 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+ char *val;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
+ if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ if (unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
- else
- memcpy(array->value +
- array->elem_size * (index & array->index_mask),
- value, map->value_size);
+ } else {
+ val = array->value +
+ array->elem_size * (index & array->index_mask);
+ if (map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, val, value, false);
+ else
+ copy_map_value(map, val, value);
+ }
return 0;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index befe570be5ba..bd3921b1514b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -157,7 +157,7 @@
*
*/
-#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
static bool btf_type_is_array(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -525,7 +530,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
/*
* Regular int is not a bit field and it must be either
- * u8/u16/u32/u64.
+ * u8/u16/u32/u64 or __int128.
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
@@ -538,7 +543,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
if (BITS_PER_BYTE_MASKED(nr_bits) ||
BTF_INT_OFFSET(int_data) ||
(nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
+ nr_bytes != (2 * sizeof(u64)))) {
return false;
}
@@ -1063,9 +1069,9 @@ static int btf_int_check_member(struct btf_verifier_env *env,
nr_copy_bits = BTF_INT_BITS(int_data) +
BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1119,9 +1125,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env,
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1168,9 +1174,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
- if (nr_bits > BITS_PER_U64) {
+ if (nr_bits > BITS_PER_U128) {
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
- BITS_PER_U64);
+ BITS_PER_U128);
return -EINVAL;
}
@@ -1211,31 +1217,93 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
+static void btf_int128_print(struct seq_file *m, void *data)
+{
+ /* data points to a __int128 number.
+ * Suppose
+ * int128_num = *(__int128 *)data;
+ * The below formulas shows what upper_num and lower_num represents:
+ * upper_num = int128_num >> 64;
+ * lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+ */
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = *(u64 *)data;
+ lower_num = *(u64 *)(data + 8);
+#else
+ upper_num = *(u64 *)(data + 8);
+ lower_num = *(u64 *)data;
+#endif
+ if (upper_num == 0)
+ seq_printf(m, "0x%llx", lower_num);
+ else
+ seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+ u16 right_shift_bits)
+{
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = print_num[0];
+ lower_num = print_num[1];
+#else
+ upper_num = print_num[1];
+ lower_num = print_num[0];
+#endif
+
+ /* shake out un-needed bits by shift/or operations */
+ if (left_shift_bits >= 64) {
+ upper_num = lower_num << (left_shift_bits - 64);
+ lower_num = 0;
+ } else {
+ upper_num = (upper_num << left_shift_bits) |
+ (lower_num >> (64 - left_shift_bits));
+ lower_num = lower_num << left_shift_bits;
+ }
+
+ if (right_shift_bits >= 64) {
+ lower_num = upper_num >> (right_shift_bits - 64);
+ upper_num = 0;
+ } else {
+ lower_num = (lower_num >> right_shift_bits) |
+ (upper_num << (64 - right_shift_bits));
+ upper_num = upper_num >> right_shift_bits;
+ }
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ print_num[0] = upper_num;
+ print_num[1] = lower_num;
+#else
+ print_num[0] = lower_num;
+ print_num[1] = upper_num;
+#endif
+}
+
static void btf_bitfield_seq_show(void *data, u8 bits_offset,
u8 nr_bits, struct seq_file *m)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
u8 nr_copy_bits;
- u64 print_num;
+ u64 print_num[2] = {};
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
- print_num = 0;
- memcpy(&print_num, data, nr_copy_bytes);
+ memcpy(print_num, data, nr_copy_bytes);
#ifdef __BIG_ENDIAN_BITFIELD
left_shift_bits = bits_offset;
#else
- left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+ left_shift_bits = BITS_PER_U128 - nr_copy_bits;
#endif
- right_shift_bits = BITS_PER_U64 - nr_bits;
-
- print_num <<= left_shift_bits;
- print_num >>= right_shift_bits;
+ right_shift_bits = BITS_PER_U128 - nr_bits;
- seq_printf(m, "0x%llx", print_num);
+ btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+ btf_int128_print(m, print_num);
}
@@ -1250,7 +1318,7 @@ static void btf_int_bits_seq_show(const struct btf *btf,
/*
* bits_offset is at most 7.
- * BTF_INT_OFFSET() cannot exceed 64 bits.
+ * BTF_INT_OFFSET() cannot exceed 128 bits.
*/
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1274,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
}
switch (nr_bits) {
+ case 128:
+ btf_int128_print(m, data);
+ break;
case 64:
if (sign)
seq_printf(m, "%lld", *(s64 *)data);
@@ -1459,7 +1530,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
/* "typedef void new_void", "const void"...etc */
if (!btf_type_is_void(next_type) &&
- !btf_type_is_fwd(next_type)) {
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1979,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_member *member;
+ u32 i, off = -ENOENT;
+
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ if (!__btf_type_is_struct(member_type))
+ continue;
+ if (member_type->size != sizeof(struct bpf_spin_lock))
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+ "bpf_spin_lock"))
+ continue;
+ if (off != -ENOENT)
+ /* only one 'struct bpf_spin_lock' is allowed */
+ return -E2BIG;
+ off = btf_member_bit_offset(t, member);
+ if (off % 8)
+ /* valid C code cannot generate such BTF */
+ return -EINVAL;
+ off /= 8;
+ if (off % __alignof__(struct bpf_spin_lock))
+ /* valid struct bpf_spin_lock will be 4 byte aligned */
+ return -EINVAL;
+ }
+ return off;
+}
+
static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ab612fe9862f..4e807973aa80 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -230,6 +230,7 @@ cleanup:
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
* @type: Type of attach operation
+ * @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
@@ -363,7 +364,7 @@ cleanup:
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 unused_flags)
+ enum bpf_attach_type type)
{
struct list_head *progs = &cgrp->bpf.progs[type];
enum bpf_cgroup_storage_type stype;
@@ -572,7 +573,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- bpf_prog_run_save_cb);
+ __bpf_prog_run_save_cb);
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f908b9356025..3f08c257858e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
@@ -104,6 +104,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
return fp;
}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ struct bpf_prog *prog;
+ int cpu;
+
+ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
+ if (!prog)
+ return NULL;
+
+ prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->aux->stats) {
+ kfree(prog->aux);
+ vfree(prog);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_prog_stats *pstats;
+
+ pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ u64_stats_init(&pstats->syncp);
+ }
+ return prog;
+}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
@@ -231,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->aux);
+ if (fp->aux) {
+ free_percpu(fp->aux->stats);
+ kfree(fp->aux);
+ }
vfree(fp);
}
@@ -307,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s32 delta = end_new - end_old;
s64 imm = insn->imm;
- if (curr < pos && curr + imm + 1 > pos)
+ if (curr < pos && curr + imm + 1 >= end_old)
imm += delta;
- else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ else if (curr >= end_new && curr + imm + 1 < end_new)
imm -= delta;
if (imm < imm_min || imm > imm_max)
return -ERANGE;
@@ -324,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 delta = end_new - end_old;
s32 off = insn->off;
- if (curr < pos && curr + off + 1 > pos)
+ if (curr < pos && curr + off + 1 >= end_old)
off += delta;
- else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ else if (curr >= end_new && curr + off + 1 < end_new)
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
@@ -341,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
- const bool probe_pass)
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+ s32 end_new, const bool probe_pass)
{
- u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
+ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
struct bpf_insn *insn = prog->insnsi;
int ret = 0;
@@ -356,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
* do any other adjustments. Therefore skip the patchlet.
*/
if (probe_pass && i == pos) {
- i += delta + 1;
- insn++;
+ i = end_new;
+ insn = prog->insnsi + end_old;
}
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP ||
+ if ((BPF_CLASS(code) != BPF_JMP &&
+ BPF_CLASS(code) != BPF_JMP32) ||
BPF_OP(code) == BPF_EXIT)
continue;
/* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
- ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
} else {
- ret = bpf_adj_delta_to_off(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_off(insn, pos, end_old,
+ end_new, i, probe_pass);
}
if (ret)
break;
@@ -421,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* we afterwards may not fail anymore.
*/
if (insn_adj_cnt > cnt_max &&
- bpf_adj_branches(prog, off, insn_delta, true))
+ bpf_adj_branches(prog, off, off + 1, off + len, true))
return NULL;
/* Several new instructions need to be inserted. Make room
@@ -453,13 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* the ship has sailed to reverse to the original state. An
* overflow cannot happen at this point.
*/
- BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
bpf_adj_linfo(prog_adj, off, insn_delta);
return prog_adj;
}
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+ /* Branch offsets can't overflow when program is shrinking, no need
+ * to call bpf_adj_branches(..., true) here
+ */
+ memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+ sizeof(struct bpf_insn) * (prog->len - off - cnt));
+ prog->len -= cnt;
+
+ return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+}
+
void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
@@ -934,6 +978,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
+ off);
+ break;
+
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -1130,6 +1195,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_2(JMP, CALL), \
/* Exit instruction. */ \
INSN_2(JMP, EXIT), \
+ /* 32-bit Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP32, JEQ, X), \
+ INSN_3(JMP32, JNE, X), \
+ INSN_3(JMP32, JGT, X), \
+ INSN_3(JMP32, JLT, X), \
+ INSN_3(JMP32, JGE, X), \
+ INSN_3(JMP32, JLE, X), \
+ INSN_3(JMP32, JSGT, X), \
+ INSN_3(JMP32, JSLT, X), \
+ INSN_3(JMP32, JSGE, X), \
+ INSN_3(JMP32, JSLE, X), \
+ INSN_3(JMP32, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP32, JEQ, K), \
+ INSN_3(JMP32, JNE, K), \
+ INSN_3(JMP32, JGT, K), \
+ INSN_3(JMP32, JLT, K), \
+ INSN_3(JMP32, JGE, K), \
+ INSN_3(JMP32, JLE, K), \
+ INSN_3(JMP32, JSGT, K), \
+ INSN_3(JMP32, JSLT, K), \
+ INSN_3(JMP32, JSGE, K), \
+ INSN_3(JMP32, JSLE, K), \
+ INSN_3(JMP32, JSET, K), \
/* Jump instructions. */ \
/* Register based. */ \
INSN_3(JMP, JEQ, X), \
@@ -1202,8 +1292,9 @@ bool bpf_opcode_in_insntable(u8 code)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
- * @ctx: is the data we are operating on
+ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
+ * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
@@ -1390,145 +1481,49 @@ select_insn:
out:
CONT;
}
- /* JMP */
JMP_JA:
insn += insn->off;
CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_X:
- if (DST < SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_K:
- if (DST < IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_X:
- if (DST <= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_K:
- if (DST <= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_X:
- if (((s64) DST) < ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_K:
- if (((s64) DST) < ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_X:
- if (((s64) DST) <= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_K:
- if (((s64) DST) <= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
JMP_EXIT:
return BPF_R0;
-
+ /* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP) \
+ JMP_##OPCODE##_X: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_X: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP_##OPCODE##_K: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_K: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT;
+ COND_JMP(u, JEQ, ==)
+ COND_JMP(u, JNE, !=)
+ COND_JMP(u, JGT, >)
+ COND_JMP(u, JLT, <)
+ COND_JMP(u, JGE, >=)
+ COND_JMP(u, JLE, <=)
+ COND_JMP(u, JSET, &)
+ COND_JMP(s, JSGT, >)
+ COND_JMP(s, JSLT, <)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -2036,6 +2031,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -2101,6 +2098,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+int sysctl_bpf_stats_enabled __read_mostly;
+
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d6b76377cb6e..de73f55e42fd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = {
[BPF_STX] = "stx",
[BPF_ALU] = "alu",
[BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
+ [BPF_JMP32] = "jmp32",
[BPF_ALU64] = "alu64",
};
@@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
- insn->code, insn->dst_reg,
- class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
+ class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
- verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
insn->imm);
}
} else if (class == BPF_STX) {
@@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP32 || class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s %c%d goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ class == BPF_JMP32 ? 'w' : 'r',
insn->src_reg, insn->off);
} else {
- verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s 0x%x goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..fed15cf94dca 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
if (htab_is_prealloc(htab)) {
- pcpu_freelist_push(&htab->freelist, &l->fnode);
+ __pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
l->htab = htab;
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
BITS_PER_LONG == 64;
}
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
- u32 size = htab->map.value_size;
-
- if (percpu || fd_htab_map_needs_adjust(htab))
- size = round_up(size, 8);
- return size;
-}
-
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab_size_value(htab, percpu);
+ u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -748,7 +739,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else {
struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
+ l = __pcpu_freelist_pop(&htab->freelist);
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ check_and_init_map_lock(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
+ size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
- } else {
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
memcpy(l_new->key + round_up(key_size, 8), value, size);
+ } else {
+ copy_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8),
+ value);
}
l_new->hash = hash;
@@ -805,11 +804,11 @@ dec_count:
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
- if (l_old && map_flags == BPF_NOEXIST)
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
- if (!l_old && map_flags == BPF_EXIST)
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
@@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
b = __select_bucket(htab, hash);
head = &b->head;
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!map_value_has_spin_lock(map)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and lookup again.
+ * 99.9% chance that the element won't be found,
+ * but second lookup under lock has to be done.
+ */
+ }
+
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
@@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
+ */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ ret = 0;
+ goto err;
+ }
+
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
if (IS_ERR(l_new)) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e74..a411fc17d265 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -221,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg2_type = ARG_CONST_SIZE,
};
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+ union {
+ __u32 val;
+ arch_spinlock_t lock;
+ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+
+ arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+ do {
+ atomic_cond_read_relaxed(l, !VAL);
+ } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bpf_spin_lock(lock);
+ __this_cpu_write(irqsave_flags, flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+ .func = bpf_spin_lock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ flags = __this_cpu_read(irqsave_flags);
+ __bpf_spin_unlock(lock);
+ local_irq_restore(flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+ .func = bpf_spin_unlock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+ bool lock_src)
+{
+ struct bpf_spin_lock *lock;
+
+ if (lock_src)
+ lock = src + map->spin_lock_off;
+ else
+ lock = dst + map->spin_lock_off;
+ preempt_disable();
+ ____bpf_spin_lock(lock);
+ copy_map_value(map, dst, src);
+ ____bpf_spin_unlock(lock);
+ preempt_enable();
+}
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 07a34ef562a0..6b572e2de7fb 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (flags != BPF_ANY && flags != BPF_EXIST)
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_NOEXIST))
+ return -EINVAL;
+
+ if (unlikely((flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
if (!storage)
return -ENOENT;
+ if (flags & BPF_F_LOCK) {
+ copy_map_value_locked(map, storage->buf->data, value, false);
+ return 0;
+ }
+
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
map->value_size,
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
@@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
+ check_and_init_map_lock(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -483,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
storage->buf = kmalloc_node(size, flags, map->numa_node);
if (!storage->buf)
goto enomem;
+ check_and_init_map_lock(map, storage->buf->data);
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..93a5cbbde421 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
}
if (!node || node->prefixlen != key->prefixlen ||
+ node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM)) {
ret = -ENOENT;
goto out;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 52378d3e34b3..3dff41403583 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
return ERR_PTR(-EINVAL);
}
+ if (map_value_has_spin_lock(inner_map)) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
if (inner_map->ops == &array_map_ops)
@@ -53,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
+ inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 54cf2b9c44a4..ba635209ae9a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
+ void *priv;
};
struct bpf_offload_netdev {
@@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
return ret;
}
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+ struct bpf_insn *insn)
+{
+ const struct bpf_prog_offload_ops *ops;
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ ops = offload->offdev->ops;
+ if (!offload->opt_failed && ops->replace_insn)
+ ret = ops->replace_insn(env, off, insn);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+ ret = offload->offdev->ops->remove_insns(env, off, cnt);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
@@ -634,7 +670,7 @@ unlock:
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
struct bpf_offload_dev *
-bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
int err;
@@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
return ERR_PTR(-ENOMEM);
offdev->ops = ops;
+ offdev->priv = priv;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
@@ -665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
kfree(offdev);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+ return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist);
}
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
- struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
- __pcpu_freelist_push(head, node);
+ ___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __pcpu_freelist_push(s, node);
+ local_irq_restore(flags);
}
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
for_each_possible_cpu(cpu) {
again:
head = per_cpu_ptr(s->freelist, cpu);
- __pcpu_freelist_push(head, buf);
+ ___pcpu_freelist_push(head, buf);
i++;
buf += elem_size;
if (i == nr_elems)
@@ -74,14 +84,12 @@ again:
local_irq_restore(flags);
}
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- unsigned long flags;
int orig_cpu, cpu;
- local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock_irqrestore(&head->lock, flags);
+ raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu) {
- local_irq_restore(flags);
+ if (cpu == orig_cpu)
return NULL;
- }
}
}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ret = __pcpu_freelist_pop(s);
+ local_irq_restore(flags);
+ return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
struct pcpu_freelist_node *next;
};
+/* pcpu_freelist_* do spin_lock_irqsave. */
void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems);
int pcpu_freelist_init(struct pcpu_freelist *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d43b14535827..950ab2f28922 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry)
struct stack_map_irq_work *work;
work = container_of(entry, struct stack_map_irq_work, irq_work);
- up_read(work->sem);
+ up_read_non_owner(work->sem);
work->sem = NULL;
}
@@ -338,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
} else {
work->sem = &current->mm->mmap_sem;
irq_work_queue(&work->irq_work);
+ /*
+ * The irq_work will release the mmap_sem with
+ * up_read_non_owner(). The rwsem_release() is called
+ * here to release the lock from lockdep's perspective.
+ */
+ rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..bc34cf9fe9ee 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
@@ -478,6 +478,22 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
+ map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+ if (map_value_has_spin_lock(map)) {
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ return -ENOTSUPP;
+ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+ map->value_size) {
+ WARN_ONCE(1,
+ "verifier bug spin_lock_off %d value_size %d\n",
+ map->spin_lock_off, map->value_size);
+ return -EFAULT;
+ }
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -542,6 +558,8 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
+ } else {
+ map->spin_lock_off = -EINVAL;
}
err = security_bpf_map_alloc(map);
@@ -559,12 +577,12 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
- * bpf_map_put() is needed because the above
+ * bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return err;
}
@@ -664,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
@@ -680,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -690,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -713,8 +740,13 @@ static int map_lookup_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ goto done;
+ }
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -740,11 +772,20 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -ENOENT;
} else {
err = 0;
- memcpy(value, ptr, value_size);
+ if (attr->flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
}
rcu_read_unlock();
}
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+done:
if (err)
goto free_value;
@@ -800,6 +841,12 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -1236,24 +1283,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+ struct bpf_prog_stats *stats)
+{
+ u64 nsecs = 0, cnt = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct bpf_prog_stats *st;
+ unsigned int start;
+ u64 tnsecs, tcnt;
+
+ st = per_cpu_ptr(prog->aux->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&st->syncp);
+ tnsecs = st->nsecs;
+ tcnt = st->cnt;
+ } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ nsecs += tnsecs;
+ cnt += tcnt;
+ }
+ stats->nsecs = nsecs;
+ stats->cnt = cnt;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+ struct bpf_prog_stats stats;
+ bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
- "prog_id:\t%u\n",
+ "prog_id:\t%u\n"
+ "run_time_ns:\t%llu\n"
+ "run_cnt:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
- prog->aux->id);
+ prog->aux->id,
+ stats.nsecs,
+ stats.cnt);
}
#endif
@@ -1978,7 +2055,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return fd;
}
@@ -2075,6 +2152,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info = {};
u32 info_len = attr->info.info_len;
+ struct bpf_prog_stats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -2114,6 +2192,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
if (err)
return err;
+ bpf_prog_get_stats(prog, &stats);
+ info.run_time_ns = stats.nsecs;
+ info.run_cnt = stats.cnt;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 56674a7c3778..a7b96bf0e654 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -213,6 +213,7 @@ struct bpf_call_arg_meta {
s64 msize_smax_value;
u64 msize_umax_value;
int ptr_id;
+ int func_id;
};
static DEFINE_MUTEX(bpf_verifier_lock);
@@ -330,10 +331,19 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
type == PTR_TO_PACKET_META;
}
+static bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_TCP_SOCK;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL;
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
static bool type_is_refcounted(enum bpf_reg_type type)
@@ -351,6 +361,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
return type_is_refcounted(reg->type);
}
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return reg->type == PTR_TO_MAP_VALUE &&
+ map_value_has_spin_lock(reg->map_ptr);
+}
+
static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
{
return type_is_refcounted_or_null(reg->type);
@@ -370,6 +386,12 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
+static bool is_acquire_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -385,6 +407,10 @@ static const char * const reg_type_str[] = {
[PTR_TO_FLOW_KEYS] = "flow_keys",
[PTR_TO_SOCKET] = "sock",
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
};
static char slot_type_char[] = {
@@ -611,13 +637,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
}
/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
int i, last_idx;
- if (!ptr_id)
- return -EFAULT;
-
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
@@ -629,21 +652,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
return 0;
}
}
- return -EFAULT;
-}
-
-/* variation on the above for cases where we expect that there must be an
- * outstanding reference for the specified ptr_id.
- */
-static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
-{
- struct bpf_func_state *state = cur_func(env);
- int err;
-
- err = __release_reference_state(state, ptr_id);
- if (WARN_ON_ONCE(err != 0))
- verbose(env, "verifier internal error: can't release reference\n");
- return err;
+ return -EINVAL;
}
static int transfer_reference_state(struct bpf_func_state *dst,
@@ -712,6 +721,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
+ dst_state->active_spin_lock = src->active_spin_lock;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1095,7 +1105,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
- if (BPF_CLASS(code) != BPF_JMP)
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
@@ -1201,6 +1211,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return true;
default:
return false;
@@ -1483,6 +1497,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
if (err)
verbose(env, "R%d max value is outside of the array range\n",
regno);
+
+ if (map_value_has_spin_lock(reg->map_ptr)) {
+ u32 lock = reg->map_ptr->spin_lock_off;
+
+ /* if any part of struct bpf_spin_lock can be touched by
+ * load/store reject this program.
+ * To check that [x1, x2) overlaps with [y1, y2)
+ * it is sufficient to check x1 < y2 && y1 < x2.
+ */
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+ lock < reg->umax_value + off + size) {
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -1617,12 +1646,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
return 0;
}
-static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size, enum bpf_access_type t)
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+ u32 regno, int off, int size,
+ enum bpf_access_type t)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- struct bpf_insn_access_aux info;
+ struct bpf_insn_access_aux info = {};
+ bool valid;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
@@ -1630,13 +1661,31 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
return -EACCES;
}
- if (!bpf_sock_is_valid_access(off, size, t, &info)) {
- verbose(env, "invalid bpf_sock access off=%d size=%d\n",
- off, size);
- return -EACCES;
+ switch (reg->type) {
+ case PTR_TO_SOCK_COMMON:
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_SOCKET:
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_TCP_SOCK:
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+ break;
+ default:
+ valid = false;
}
- return 0;
+
+ if (valid) {
+ env->insn_aux_data[insn_idx].ctx_field_size =
+ info.ctx_field_size;
+ return 0;
+ }
+
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
+ regno, reg_type_str[reg->type], off, size);
+
+ return -EACCES;
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -1662,8 +1711,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
const struct bpf_reg_state *reg = reg_state(env, regno);
- return reg->type == PTR_TO_CTX ||
- reg->type == PTR_TO_SOCKET;
+ return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_sk_pointer(reg->type);
}
static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -1774,6 +1829,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_SOCKET:
pointer_desc = "sock ";
break;
+ case PTR_TO_SOCK_COMMON:
+ pointer_desc = "sock_common ";
+ break;
+ case PTR_TO_TCP_SOCK:
+ pointer_desc = "tcp_sock ";
+ break;
default:
break;
}
@@ -1977,11 +2038,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE)
+ if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
- else
+ } else {
mark_reg_known_zero(env, regs,
value_regno);
+ if (reg_type_may_be_null(reg_type))
+ regs[value_regno].id = ++env->id_gen;
+ }
regs[value_regno].type = reg_type;
}
@@ -2027,12 +2091,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_flow_keys_access(env, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_SOCKET) {
+ } else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
- verbose(env, "cannot write into socket\n");
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
- err = check_sock_access(env, regno, off, size, t);
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else {
@@ -2076,7 +2141,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg)) {
+ is_flow_key_reg(env, insn->dst_reg) ||
+ is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -2192,6 +2258,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+ bool is_lock)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "R%d is not a pointer to map_value\n", regno);
+ return -EINVAL;
+ }
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env,
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_spin_lock(map)) {
+ if (map->spin_lock_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
+ map->name);
+ else if (map->spin_lock_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+ val + reg->off);
+ return -EINVAL;
+ }
+ if (is_lock) {
+ if (cur->active_spin_lock) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = reg->id;
+ } else {
+ if (!cur->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
+ return -EINVAL;
+ }
+ if (cur->active_spin_lock != reg->id) {
+ verbose(env, "bpf_spin_unlock of different lock\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = 0;
+ }
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -2258,6 +2409,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_ctx_reg(env, reg, regno);
if (err < 0)
return err;
+ } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
+ expected_type = PTR_TO_SOCK_COMMON;
+ /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
+ if (!type_is_sk_pointer(type))
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_SOCKET) {
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
@@ -2268,6 +2424,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EFAULT;
}
meta->ptr_id = reg->id;
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ if (process_spin_lock(env, regno, true))
+ return -EACCES;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ if (process_spin_lock(env, regno, false))
+ return -EACCES;
+ } else {
+ verbose(env, "verifier internal error\n");
+ return -EFAULT;
+ }
} else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
@@ -2661,7 +2828,7 @@ static int release_reference(struct bpf_verifier_env *env,
for (i = 0; i <= vstate->curframe; i++)
release_reg_references(env, vstate->frame[i], meta->ptr_id);
- return release_reference_state(env, meta->ptr_id);
+ return release_reference_state(cur_func(env), meta->ptr_id);
}
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -2887,6 +3054,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return err;
}
+ meta.func_id = func_id;
/* check args */
err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
@@ -2926,8 +3094,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
} else if (is_release_function(func_id)) {
err = release_reference(env, &meta);
- if (err)
+ if (err) {
+ verbose(env, "func %s#%d reference has not been acquired before\n",
+ func_id_name(func_id), func_id);
return err;
+ }
}
regs = cur_regs(env);
@@ -2969,17 +3140,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].map_ptr = meta.map_ptr;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ if (map_value_has_spin_lock(meta.map_ptr))
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
}
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
- int id = acquire_reference_state(env, insn_idx);
- if (id < 0)
- return id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- regs[BPF_REG_0].id = id;
+ if (is_acquire_function(func_id)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For release_reference() */
+ regs[BPF_REG_0].id = id;
+ } else {
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -3239,6 +3423,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
@@ -4031,11 +4219,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* 0 - branch will not be taken and fall-through to next insn
* -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+ bool is_jmp32)
{
+ struct bpf_reg_state reg_lo;
+ s64 sval;
+
if (__is_pointer_value(false, reg))
return -1;
+ if (is_jmp32) {
+ reg_lo = *reg;
+ reg = &reg_lo;
+ /* For JMP32, only low 32 bits are compared, coerce_reg_to_size
+ * could truncate high bits and update umin/umax according to
+ * information of low bits.
+ */
+ coerce_reg_to_size(reg, 4);
+ /* smin/smax need special handling. For example, after coerce,
+ * if smin_value is 0x00000000ffffffffLL, the value is -1 when
+ * used as operand to JMP32. It is a negative number from s32's
+ * point of view, while it is a positive number when seen as
+ * s64. The smin/smax are kept as s64, therefore, when used with
+ * JMP32, they need to be transformed into s32, then sign
+ * extended back to s64.
+ *
+ * Also, smin/smax were copied from umin/umax. If umin/umax has
+ * different sign bit, then min/max relationship doesn't
+ * maintain after casting into s32, for this case, set smin/smax
+ * to safest range.
+ */
+ if ((reg->umax_value ^ reg->umin_value) &
+ (1ULL << 31)) {
+ reg->smin_value = S32_MIN;
+ reg->smax_value = S32_MAX;
+ }
+ reg->smin_value = (s64)(s32)reg->smin_value;
+ reg->smax_value = (s64)(s32)reg->smax_value;
+
+ val = (u32)val;
+ sval = (s64)(s32)val;
+ } else {
+ sval = (s64)val;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
@@ -4058,9 +4285,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGT:
- if (reg->smin_value > (s64)val)
+ if (reg->smin_value > sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLT:
@@ -4070,9 +4297,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLT:
- if (reg->smax_value < (s64)val)
+ if (reg->smax_value < sval)
return 1;
- else if (reg->smin_value >= (s64)val)
+ else if (reg->smin_value >= sval)
return 0;
break;
case BPF_JGE:
@@ -4082,9 +4309,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGE:
- if (reg->smin_value >= (s64)val)
+ if (reg->smin_value >= sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLE:
@@ -4094,9 +4321,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLE:
- if (reg->smax_value <= (s64)val)
+ if (reg->smax_value <= sval)
return 1;
- else if (reg->smin_value > (s64)val)
+ else if (reg->smin_value > sval)
return 0;
break;
}
@@ -4104,6 +4331,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return -1;
}
+/* Generate min value of the high 32-bit from TNUM info. */
+static u64 gen_hi_min(struct tnum var)
+{
+ return var.value & ~0xffffffffULL;
+}
+
+/* Generate max value of the high 32-bit from TNUM info. */
+static u64 gen_hi_max(struct tnum var)
+{
+ return (var.value | var.mask) & ~0xffffffffULL;
+}
+
+/* Return true if VAL is compared with a s64 sign extended from s32, and they
+ * are with the same signedness.
+ */
+static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
+{
+ return ((s32)sval >= 0 &&
+ reg->smin_value >= 0 && reg->smax_value <= S32_MAX) ||
+ ((s32)sval < 0 &&
+ reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4111,8 +4361,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
/* If the dst_reg is a pointer, we can't learn anything about its
* variable offset from the compare (unless src_reg were a pointer into
* the same object, but we don't bother with that.
@@ -4122,19 +4374,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
+ * if it is true we know the value for sure. Likewise for
+ * BPF_JNE.
*/
- __mark_reg_known(false_reg, val);
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4142,38 +4406,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- false_reg->umax_value = min(false_reg->umax_value, val);
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- break;
- case BPF_JSGT:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- break;
- case BPF_JLT:
- false_reg->umin_value = max(false_reg->umin_value, val);
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- break;
- case BPF_JSLT:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- break;
case BPF_JGE:
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
- true_reg->umin_value = max(true_reg->umin_value, val);
+ case BPF_JGT:
+ {
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSGE:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ case BPF_JSGT:
+ {
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
+
+ /* If the full s64 was not sign-extended from s32 then don't
+ * deduct further info.
+ */
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
case BPF_JLE:
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
- true_reg->umax_value = min(true_reg->umax_value, val);
+ case BPF_JLT:
+ {
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSLE:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ case BPF_JSLT:
+ {
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
default:
break;
}
@@ -4196,24 +4483,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
- */
- __mark_reg_known(false_reg, val);
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4221,38 +4518,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- false_reg->umin_value = max(false_reg->umin_value, val);
- break;
- case BPF_JSGT:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- break;
- case BPF_JLT:
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- false_reg->umax_value = min(false_reg->umax_value, val);
- break;
- case BPF_JSLT:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- break;
case BPF_JGE:
- true_reg->umax_value = min(true_reg->umax_value, val);
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ case BPF_JGT:
+ {
+ u64 false_umin = opcode == BPF_JGT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSGE:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ case BPF_JSGT:
+ {
+ s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
case BPF_JLE:
- true_reg->umin_value = max(true_reg->umin_value, val);
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ case BPF_JLT:
+ {
+ u64 false_umax = opcode == BPF_JLT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSLE:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ case BPF_JSLT:
+ {
+ s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
default:
break;
}
@@ -4343,8 +4660,13 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
reg->type = PTR_TO_SOCKET;
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
+ reg->type = PTR_TO_SOCK_COMMON;
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+ reg->type = PTR_TO_TCP_SOCK;
}
- if (is_null || !reg_is_refcounted(reg)) {
+ if (is_null || !(reg_is_refcounted(reg) ||
+ reg_may_point_to_spin_lock(reg))) {
/* We don't need id from this point onwards anymore,
* thus we should better reset it, so that state
* pruning has chances to take effect.
@@ -4366,7 +4688,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
int i, j;
if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
- __release_reference_state(state, id);
+ release_reference_state(state, id);
for (i = 0; i < MAX_BPF_REG; i++)
mark_ptr_or_null_reg(state, &regs[i], id, is_null);
@@ -4390,6 +4712,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
if (BPF_SRC(insn->code) != BPF_X)
return false;
+ /* Pointers are always 64-bit. */
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
+ return false;
+
switch (BPF_OP(insn->code)) {
case BPF_JGT:
if ((dst_reg->type == PTR_TO_PACKET &&
@@ -4482,16 +4808,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs;
u8 opcode = BPF_OP(insn->code);
+ bool is_jmp32;
int err;
- if (opcode > BPF_JSLE) {
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
+ /* Only conditional jumps are expected to reach here. */
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
@@ -4507,7 +4835,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
}
@@ -4518,9 +4846,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
dst_reg = &regs[insn->dst_reg];
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (BPF_SRC(insn->code) == BPF_K) {
- int pred = is_branch_taken(dst_reg, insn->imm, opcode);
+ int pred = is_branch_taken(dst_reg, insn->imm, opcode,
+ is_jmp32);
if (pred == 1) {
/* only follow the goto, ignore fall-through */
@@ -4548,30 +4878,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state lo_reg0 = *dst_reg;
+ struct bpf_reg_state lo_reg1 = *src_reg;
+ struct bpf_reg_state *src_lo, *dst_lo;
+
+ dst_lo = &lo_reg0;
+ src_lo = &lo_reg1;
+ coerce_reg_to_size(dst_lo, 4);
+ coerce_reg_to_size(src_lo, 4);
+
if (dst_reg->type == SCALAR_VALUE &&
- regs[insn->src_reg].type == SCALAR_VALUE) {
- if (tnum_is_const(regs[insn->src_reg].var_off))
+ src_reg->type == SCALAR_VALUE) {
+ if (tnum_is_const(src_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(src_lo->var_off)))
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].var_off.value,
- opcode);
- else if (tnum_is_const(dst_reg->var_off))
+ dst_reg,
+ is_jmp32
+ ? src_lo->var_off.value
+ : src_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (tnum_is_const(dst_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(dst_lo->var_off)))
reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- &regs[insn->src_reg],
- dst_reg->var_off.value, opcode);
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ src_reg,
+ is_jmp32
+ ? dst_lo->var_off.value
+ : dst_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (!is_jmp32 &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
/* Comparing for equality, we can combine knowledge */
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
- &regs[insn->src_reg],
- &regs[insn->dst_reg], opcode);
+ src_reg, dst_reg, opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, opcode);
+ dst_reg, insn->imm, opcode, is_jmp32);
}
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
- if (BPF_SRC(insn->code) == BPF_K &&
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * NOTE: these optimizations below are related with pointer comparison
+ * which will never be JMP32.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
reg_type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
@@ -4713,6 +5064,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4900,7 +5256,8 @@ peek_stack:
goto check_state;
t = insn_stack[cur_stack - 1];
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
@@ -4997,13 +5354,14 @@ static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- u32 i, nfuncs, urec_size, min_size, prev_offset;
+ u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
struct bpf_func_info *krecord;
const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
void __user *urecord;
+ u32 prev_offset = 0;
int ret = 0;
nfuncs = attr->func_info_cnt;
@@ -5447,8 +5805,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * We don't care about the 'id' value, because nothing
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ * 'id' is not compared, since it's only used for maps with
+ * bpf_spin_lock inside map element and in such cases if
+ * the rest of the prog is valid for one map element then
+ * it's valid for all map elements regardless of the key
+ * used in bpf_map_lookup()
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
@@ -5496,6 +5857,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
@@ -5651,6 +6016,9 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
+ if (old->active_spin_lock != cur->active_spin_lock)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -5813,6 +6181,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_CTX:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return false;
default:
return true;
@@ -6055,7 +6427,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -6063,11 +6435,18 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL) ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock &&
+ (insn->src_reg == BPF_PSEUDO_CALL ||
+ insn->imm != BPF_FUNC_spin_unlock)) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
@@ -6079,7 +6458,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -6091,11 +6471,17 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock is missing\n");
+ return -EINVAL;
+ }
+
if (state->curframe) {
/* exit from nested function */
env->prev_insn_idx = env->insn_idx;
@@ -6193,6 +6579,19 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -6215,6 +6614,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if ((is_tracing_prog_type(prog->type) ||
+ prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+ map_value_has_spin_lock(map)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -6431,6 +6837,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
* have dead code too. Therefore replace all dead at-run-time code
@@ -6457,6 +7010,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
+static bool insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return true;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ op = BPF_OP(code);
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+ const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (memcmp(&insn[i], &ja, sizeof(ja)))
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ i--;
+ }
+
+ return 0;
+}
+
/* convert load instructions that access fields of a context type into a
* sequence of instructions that access fields of the underlying structure:
* struct __sk_buff -> struct sk_buff
@@ -6549,8 +7187,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = ops->convert_ctx_access;
break;
case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
convert_ctx_access = bpf_sock_convert_ctx_access;
break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
default:
continue;
}
@@ -6678,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
+ /* BPF_PROG_RUN doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and not reachable via prog_get_next_id
+ * func[i]->aux->stats will never be accessed and stays NULL
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
goto out_free;
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
@@ -6917,7 +7564,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
- if (!aux->alu_state)
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
continue;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
@@ -7147,7 +7795,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
{
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
- int ret = -EINVAL;
+ int i, len, ret = -EINVAL;
+ bool is_priv;
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -7161,12 +7810,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
return -ENOMEM;
log = &env->log;
+ len = (*prog)->len;
env->insn_aux_data =
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
- (*prog)->len));
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
+ for (i = 0; i < len; i++)
+ env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
@@ -7194,6 +7845,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
+ is_priv = capable(CAP_SYS_ADMIN);
+ env->allow_ptr_leaks = is_priv;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -7211,8 +7865,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -7242,8 +7894,17 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
- if (ret == 0)
- sanitize_dead_code(env);
+ if (is_priv) {
+ if (ret == 0)
+ opt_hard_wire_dead_code_branches(env);
+ if (ret == 0)
+ ret = opt_remove_dead_code(env);
+ if (ret == 0)
+ ret = opt_remove_nops(env);
+ } else {
+ if (ret == 0)
+ sanitize_dead_code(env);
+ }
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c950864016e2..c9a35f09e4b9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -198,7 +198,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 583b969b0c0e..f94a7229974e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1116,13 +1116,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
void *data, unsigned long magic,
struct cgroup_namespace *ns)
{
- struct super_block *pinned_sb = NULL;
struct cgroup_sb_opts opts;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
- bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1184,29 +1182,6 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
if (root->flags ^ opts.flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
ret = 0;
goto out_unlock;
}
@@ -1232,15 +1207,20 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
ret = -ENOMEM;
goto out_unlock;
}
- new_root = true;
init_cgroup_root(root, &opts);
- ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
if (ret)
cgroup_free_root(root);
out_unlock:
+ if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
mutex_unlock(&cgroup_mutex);
out_free:
kfree(opts.release_agent);
@@ -1252,25 +1232,13 @@ out_free:
dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
CGROUP_SUPER_MAGIC, ns);
- /*
- * There's a race window after we release cgroup_mutex and before
- * allocating a superblock. Make sure a concurrent process won't
- * be able to re-use the root during this window by delaying the
- * initialization of root refcnt.
- */
- if (new_root) {
- mutex_lock(&cgroup_mutex);
- percpu_ref_reinit(&root->cgrp.self.refcnt);
- mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ struct super_block *sb = dentry->d_sb;
+ dput(dentry);
+ deactivate_locked_super(sb);
+ msleep(10);
+ dentry = ERR_PTR(restart_syscall());
}
-
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb)
- deactivate_super(pinned_sb);
-
return dentry;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f31bd61c9466..cef98502b124 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1927,7 +1927,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1944,7 +1944,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
- ref_flags, GFP_KERNEL);
+ 0, GFP_KERNEL);
if (ret)
goto out;
@@ -2033,7 +2033,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_namespace *ns)
{
struct dentry *dentry;
- bool new_sb;
+ bool new_sb = false;
dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
@@ -2043,6 +2043,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = dentry->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
@@ -2053,12 +2054,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(dentry);
+ if (IS_ERR(nsdentry))
+ deactivate_locked_super(sb);
dentry = nsdentry;
}
- if (IS_ERR(dentry) || !new_sb)
+ if (!new_sb)
cgroup_put(&root->cgrp);
return dentry;
@@ -2118,18 +2121,16 @@ static void cgroup_kill_sb(struct super_block *sb)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
- * If @root doesn't have any mounts or children, start killing it.
+ * If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
-
+ cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -5399,7 +5400,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
@@ -5996,7 +5997,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ede6918050..26d6edab051a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4963,6 +4963,11 @@ static void __perf_event_period(struct perf_event *event,
}
}
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
u64 value;
@@ -4979,6 +4984,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -9391,6 +9399,11 @@ static int perf_pmu_nop_int(struct pmu *pmu)
return 0;
}
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
@@ -9691,6 +9704,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4a9937076331..5ab4fe3b1dcc 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -734,6 +734,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
+ if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ goto fail;
+
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
diff --git a/kernel/futex.c b/kernel/futex.c
index 26fb8a0909ad..ca58fc0a496e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2221,11 +2221,11 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
* decrement the counter at queue_unlock() when some error has
* occurred and we don't end up adding the task to the list.
*/
- hb_waiters_inc(hb);
+ hb_waiters_inc(hb); /* implies smp_mb(); (A) */
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock); /* implies smp_mb(); (A) */
+ spin_lock(&hb->lock);
return hb;
}
@@ -2861,35 +2861,39 @@ retry_private:
* and BUG when futex_unlock_pi() interleaves with this.
*
* Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling rt_mutex_start_proxy_lock(). This still fully
- * serializes against futex_unlock_pi() as that does the exact same
- * lock handoff sequence.
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
if (ret) {
if (ret == 1)
ret = 0;
-
- spin_lock(q.lock_ptr);
- goto no_block;
+ goto cleanup;
}
-
if (unlikely(to))
hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+cleanup:
spin_lock(q.lock_ptr);
/*
- * If we failed to acquire the lock (signal/timeout), we must
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
- * wait lists consistent.
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
*
* In particular; it is important that futex_unlock_pi() can not
* observe this inconsistency.
@@ -3013,6 +3017,10 @@ retry:
* there is no point where we hold neither; and therefore
* wake_futex_pi() must observe a state consistent with what we
* observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 45b68b4ea48b..f18cd5aa33e8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
#include <linux/cpu.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- int cpus_per_vec)
+ unsigned int cpus_per_vec)
{
const struct cpumask *siblmsk;
int cpu, sibl;
@@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
}
static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
+ unsigned int startvec,
+ unsigned int numvecs,
+ unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, done = 0;
- int last_affv = firstvec + numvecs;
- int curvec = startvec;
+ unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int last_affv = firstvec + numvecs;
+ unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
if (!cpumask_weight(cpu_mask))
@@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask,
- &masks[curvec].mask,
- node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+ node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = firstvec;
}
- done = numvecs;
- goto out;
+ return numvecs;
}
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign, vecs_per_node;
+ unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
curvec = firstvec;
--nodes;
}
-
-out:
return done;
}
@@ -174,19 +172,24 @@ out:
* 2) spread other possible CPUs on these vectors
*/
static int irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
- cpumask_var_t *node_to_cpumask,
+ unsigned int startvec, unsigned int numvecs,
+ unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- int curvec = startvec, nr_present, nr_others;
- int ret = -ENOMEM;
+ unsigned int curvec = startvec, nr_present, nr_others;
+ cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return ret;
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail;
+ goto fail_nmsk;
+
+ node_to_cpumask = alloc_node_to_cpumask();
+ if (!node_to_cpumask)
+ goto fail_npresmsk;
ret = 0;
/* Stabilize the cpumasks */
@@ -217,13 +220,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
if (nr_present < numvecs)
WARN_ON(nr_present + nr_others < numvecs);
+ free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
free_cpumask_var(npresmsk);
- fail:
+ fail_nmsk:
free_cpumask_var(nmsk);
return ret;
}
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+ affd->nr_sets = 1;
+ affd->set_size[0] = affvecs;
+}
+
/**
* irq_create_affinity_masks - Create affinity masks for multiqueue spreading
* @nvecs: The total number of vectors
@@ -232,50 +244,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
* Returns the irq_affinity_desc pointer or NULL if allocation failed.
*/
struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
- int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
- int curvec, usedvecs;
- cpumask_var_t *node_to_cpumask;
+ unsigned int affvecs, curvec, usedvecs, i;
struct irq_affinity_desc *masks = NULL;
- int i, nr_sets;
/*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
+ * Determine the number of vectors which need interrupt affinities
+ * assigned. If the pre/post request exhausts the available vectors
+ * then nothing to do here except for invoking the calc_sets()
+ * callback so the device driver can adjust to the situation. If there
+ * is only a single vector, then managing the queue is pointless as
+ * well.
*/
- if (nvecs == affd->pre_vectors + affd->post_vectors)
+ if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ else
+ affvecs = 0;
+
+ /*
+ * Simple invocations do not provide a calc_sets() callback. Install
+ * the generic one.
+ */
+ if (!affd->calc_sets)
+ affd->calc_sets = default_calc_sets;
+
+ /* Recalculate the sets */
+ affd->calc_sets(affd, affvecs);
+
+ if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
return NULL;
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
+ /* Nothing to assign? */
+ if (!affvecs)
return NULL;
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
- goto outnodemsk;
+ return NULL;
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
/*
* Spread on present CPUs starting from affd->pre_vectors. If we
* have multiple sets, build each sets affinity mask separately.
*/
- nr_sets = affd->nr_sets;
- if (!nr_sets)
- nr_sets = 1;
-
- for (i = 0, usedvecs = 0; i < nr_sets; i++) {
- int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
int ret;
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
- curvec, node_to_cpumask, masks);
+ curvec, masks);
if (ret) {
kfree(masks);
- masks = NULL;
- goto outnodemsk;
+ return NULL;
}
curvec += this_vecs;
usedvecs += this_vecs;
@@ -293,8 +317,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
masks[i].is_managed = 1;
-outnodemsk:
- free_node_to_cpumask(node_to_cpumask);
return masks;
}
@@ -304,25 +326,22 @@ outnodemsk:
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+ const struct irq_affinity *affd)
{
- int resv = affd->pre_vectors + affd->post_vectors;
- int vecs = maxvec - resv;
- int set_vecs;
+ unsigned int resv = affd->pre_vectors + affd->post_vectors;
+ unsigned int set_vecs;
if (resv > minvec)
return 0;
- if (affd->nr_sets) {
- int i;
-
- for (i = 0, set_vecs = 0; i < affd->nr_sets; i++)
- set_vecs += affd->sets[i];
+ if (affd->calc_sets) {
+ set_vecs = maxvec - resv;
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
put_online_cpus();
}
- return resv + min(set_vecs, vecs);
+ return resv + min(set_vecs, maxvec - resv);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 34e969069488..99b7dd6982a4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -730,6 +730,37 @@ out:
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
+ * handle_fasteoi_nmi - irq handler for NMI interrupt lines
+ * @desc: the interrupt description structure for this irq
+ *
+ * A simple NMI-safe handler, considering the restrictions
+ * from request_nmi.
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void handle_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ /*
+ * NMIs cannot be shared, there is only one action.
+ */
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
+
+/**
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
@@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
+/**
+ * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
+ * as a percpu pointer.
+ */
+void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 6f636136cccc..516c00a5e867 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
};
static void
@@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = {
BIT_MASK_DESCR(IRQS_WAITING),
BIT_MASK_DESCR(IRQS_PENDING),
BIT_MASK_DESCR(IRQS_SUSPENDED),
+ BIT_MASK_DESCR(IRQS_NMI),
};
@@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc)) {
- /* Can't do level, sorry */
+ if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
+ /* Can't do level nor NMIs, sorry */
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
@@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void)
int irq;
root_dir = debugfs_create_dir("irq", NULL);
- if (!root_dir)
- return -ENOMEM;
irq_domain_debugfs_init(root_dir);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 38554bc35375..6df5ddfdb0f8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
__irq_wake_thread(desc, action);
- /* Fall through to add to randomness */
+ /* Fall through - to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ca6afa267070..70c3053bc1f6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -49,6 +49,7 @@ enum {
* IRQS_WAITING - irq is waiting
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
+ * IRQS_NMI - irq line is used to deliver NMIs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -60,6 +61,7 @@ enum {
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
+ IRQS_NMI = 0x00002000,
};
#include "debug.h"
@@ -242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
#undef __irqd_to_state
-static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
static inline int irq_desc_get_node(struct irq_desc *desc)
{
return irq_common_data_get_node(&desc->irq_common_data);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index ef8ad36cadcf..13539e12cd80 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
@@ -669,6 +670,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
set_irq_regs(old_regs);
return ret;
}
+
+#ifdef CONFIG_IRQ_DOMAIN
+/**
+ * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
+ struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq;
+ int ret = 0;
+
+ nmi_enter();
+
+ irq = irq_find_mapping(domain, hwirq);
+
+ /*
+ * ack_bad_irq is not NMI-safe, just report
+ * an invalid interrupt.
+ */
+ if (likely(irq))
+ generic_handle_irq(irq);
+ else
+ ret = -EINVAL;
+
+ nmi_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
#endif
/* Dynamic interrupt handling */
@@ -919,11 +955,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- int cpu;
unsigned int sum = 0;
+ int cpu;
if (!desc || !desc->kstat_irqs)
return 0;
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc))
+ return desc->tot_count;
+
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8b0be4bd6565..3bf9793d8825 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+/**
+ * irq_get_default_host() - Retrieve the "default" irq domain
+ *
+ * Returns: the default domain, if any.
+ *
+ * Modern code should never use this. This should only be used on
+ * systems that cannot implement a firmware->fwnode mapping (which
+ * both DT and ACPI provide).
+ */
+struct irq_domain *irq_get_default_host(void)
+{
+ return irq_default_domain;
+}
+
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
@@ -1749,8 +1763,6 @@ void __init irq_domain_debugfs_init(struct dentry *root)
struct irq_domain *d;
domain_dir = debugfs_create_dir("domains", root);
- if (!domain_dir)
- return;
debugfs_create_file("default", 0444, domain_dir, NULL,
&irq_domain_debug_fops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 84b54a17b95d..9ec34a2a6638 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -341,7 +341,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc)
+ if (!desc || desc->istate & IRQS_NMI)
return -EINVAL;
/* Complete initialisation of *notify */
@@ -553,6 +553,21 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
+/**
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are
+ * nested.
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ */
+void disable_nmi_nosync(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+}
+
void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
@@ -609,6 +624,20 @@ out:
}
EXPORT_SYMBOL(enable_irq);
+/**
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
+ *
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ */
+void enable_nmi(unsigned int irq)
+{
+ enable_irq(irq);
+}
+
static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -644,6 +673,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
if (!desc)
return -EINVAL;
+ /* Don't use NMIs as wake up interrupts please */
+ if (desc->istate & IRQS_NMI) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
@@ -666,6 +701,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
+
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -726,6 +763,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -1128,6 +1166,39 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static bool irq_supports_nmi(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /* Only IRQs directly managed by the root irqchip can be set as NMI */
+ if (d->parent_data)
+ return false;
+#endif
+ /* Don't support NMIs for chips behind a slow bus */
+ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
+ return false;
+
+ return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
+}
+
+static int irq_nmi_setup(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL;
+}
+
+static void irq_nmi_teardown(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_nmi_teardown)
+ c->irq_nmi_teardown(d);
+}
+
static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
@@ -1302,9 +1373,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
+ * Interrupt lines used for NMIs cannot be shared.
*/
unsigned int oldtype;
+ if (desc->istate & IRQS_NMI) {
+ pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1756,6 +1835,59 @@ const void *free_irq(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL(free_irq);
+/* This function must be called with desc->lock held */
+static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
+{
+ const char *devname = NULL;
+
+ desc->istate &= ~IRQS_NMI;
+
+ if (!WARN_ON(desc->action == NULL)) {
+ irq_pm_remove_action(desc, desc->action);
+ devname = desc->action->name;
+ unregister_handler_proc(irq, desc->action);
+
+ kfree(desc->action);
+ desc->action = NULL;
+ }
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown(desc);
+
+ irq_release_resources(desc);
+
+ irq_chip_pm_put(&desc->irq_data);
+ module_put(desc->owner);
+
+ return devname;
+}
+
+const void *free_nmi(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ const void *devname;
+
+ if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ return NULL;
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return NULL;
+
+ /* NMI still enabled */
+ if (WARN_ON(desc->depth == 0))
+ disable_nmi_nosync(irq);
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ irq_nmi_teardown(desc);
+ devname = __cleanup_nmi(irq, desc);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return devname;
+}
+
/**
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
@@ -1925,6 +2057,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
+/**
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. It sets up the IRQ line
+ * to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivering must produce per cpu
+ * interrupts and have auto enabling setting disabled.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail and return a negative value.
+ */
+int request_nmi(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *name, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ /* NMI cannot be shared, used for Polling */
+ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
+ return -EINVAL;
+
+ if (!(irqflags & IRQF_PERCPU))
+ return -EINVAL;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || irq_settings_can_autoenable(desc) ||
+ !irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;
+ action->name = name;
+ action->dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return -EINVAL;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
unsigned int cpu = smp_processor_id();
@@ -1959,6 +2186,11 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+void enable_percpu_nmi(unsigned int irq, unsigned int type)
+{
+ enable_percpu_irq(irq, type);
+}
+
/**
* irq_percpu_is_enabled - Check whether the per cpu irq is enabled
* @irq: Linux irq number to check for
@@ -1998,6 +2230,11 @@ void disable_percpu_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
+void disable_percpu_nmi(unsigned int irq)
+{
+ disable_percpu_irq(irq);
+}
+
/*
* Internal function to unregister a percpu irqaction.
*/
@@ -2029,6 +2266,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
/* Found it - now remove it from the list of entries: */
desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -2082,6 +2321,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
}
EXPORT_SYMBOL_GPL(free_percpu_irq);
+void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ return;
+
+ kfree(__free_percpu_irq(irq, dev_id));
+}
+
/**
* setup_percpu_irq - setup a per-cpu interrupt
* @irq: Interrupt line to setup
@@ -2172,6 +2424,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ *
+ * Interrupt lines requested for NMI delivering should have auto enabling
+ * setting disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
+ const char *name, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc) ||
+ irq_settings_can_autoenable(desc) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ /* The line cannot already be NMI */
+ if (desc->istate & IRQS_NMI)
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
+ | IRQF_NOBALANCING;
+ action->name = name;
+ action->percpu_dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->istate |= IRQS_NMI;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
+ *
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int prepare_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ int ret = 0;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return -EINVAL;
+
+ if (WARN(!(desc->istate & IRQS_NMI),
+ KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
+ irq)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = irq_nmi_setup(desc);
+ if (ret) {
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
+ goto out;
+ }
+
+out:
+ irq_put_desc_unlock(desc, flags);
+ return ret;
+}
+
+/**
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be
+ * removed
+ *
+ * This call undoes the setup done by prepare_percpu_nmi().
+ *
+ * IRQ line should not be enabled for the current CPU.
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ */
+void teardown_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ goto out;
+
+ irq_nmi_teardown(desc);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+
+/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d771b5..65234c89d85b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,6 +101,12 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+bool __kthread_should_park(struct task_struct *k)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+EXPORT_SYMBOL_GPL(__kthread_should_park);
+
/**
* kthread_should_park - should this kthread park now?
*
@@ -114,7 +120,7 @@ EXPORT_SYMBOL(kthread_should_stop);
*/
bool kthread_should_park(void)
{
- return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+ return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 581edcc63c26..978d63a8261c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
rt_mutex_set_owner(lock, NULL);
}
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
+ lockdep_assert_held(&lock->wait_lock);
+
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
@@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
ret = 0;
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
@@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
*
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
* Returns:
* 0 - task blocked on lock
* 1 - acquired the lock for task, caller should wake it up
* <0 - error
*
- * Special API call for FUTEX_REQUEUE_PI support.
+ * Special API call for PI-futex support.
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
* @lock: the rt_mutex we were woken on
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
*
* Unless we acquired the lock; we're still enqueued on the wait-list and can
* in fact still be granted ownership until we're removed. Therefore we can
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 939a2056c87a..37301430970e 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -87,36 +87,6 @@ config RCU_STALL_COMMON
config RCU_NEED_SEGCBLIST
def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
diff --git a/kernel/relay.c b/kernel/relay.c
index 04f248644e06..9e0f52375487 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
+ if (IS_ERR(dentry))
+ dentry = NULL;
kfree(tmpname);
@@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
dentry = chan->cb->create_buf_file(NULL, NULL,
S_IRUSR, buf,
&chan->is_global);
- if (WARN_ON(dentry))
+ if (IS_ERR_OR_NULL(dentry))
goto free_buf;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c09d0bd9377a..43f44539b88f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6161,6 +6161,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c3484785b179..0e97ca9306ef 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group)
expires = group->next_update;
if (now < expires)
goto out;
- if (now - expires > psi_period)
+ if (now - expires >= psi_period)
missed_periods = div_u64(now - expires, psi_period);
/*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e815781ed751..a43c601ac252 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
+ preempt_disable();
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
@@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
*match = f;
}
}
+ preempt_enable();
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
diff --git a/kernel/signal.c b/kernel/signal.c
index af27629918cf..5d53183e2705 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
}
EXPORT_SYMBOL_GPL(dequeue_signal);
+static int dequeue_synchronous_signal(kernel_siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+ struct sigpending *pending = &tsk->pending;
+ struct sigqueue *q, *sync = NULL;
+
+ /*
+ * Might a synchronous signal be in the queue?
+ */
+ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
+ return 0;
+
+ /*
+ * Return the first synchronous signal in the queue.
+ */
+ list_for_each_entry(q, &pending->list, list) {
+ /* Synchronous signals have a postive si_code */
+ if ((q->info.si_code > SI_USER) &&
+ (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
+ sync = q;
+ goto next;
+ }
+ }
+ return 0;
+next:
+ /*
+ * Check if there is another siginfo for the same signal.
+ */
+ list_for_each_entry_continue(q, &pending->list, list) {
+ if (q->info.si_signo == sync->info.si_signo)
+ goto still_pending;
+ }
+
+ sigdelset(&pending->signal, sync->info.si_signo);
+ recalc_sigpending();
+still_pending:
+ list_del_init(&sync->list);
+ copy_siginfo(info, &sync->info);
+ __sigqueue_free(sync);
+ return info->si_signo;
+}
+
/*
* Tell a process that it has a new active signal..
*
@@ -1057,10 +1099,9 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
result = TRACE_SIGNAL_DELIVERED;
/*
- * Skip useless siginfo allocation for SIGKILL SIGSTOP,
- * and kernel threads.
+ * Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD))
+ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
goto out_set;
/*
@@ -2394,6 +2435,14 @@ relock:
goto relock;
}
+ /* Has this task already been marked for death? */
+ if (signal_group_exit(signal)) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ recalc_sigpending();
+ goto fatal;
+ }
+
for (;;) {
struct k_sigaction *ka;
@@ -2407,7 +2456,15 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ /*
+ * Signals generated by the execution of an instruction
+ * need to be delivered before any other pending signals
+ * so that the instruction pointer in the signal stack
+ * frame points to the faulting instruction.
+ */
+ signr = dequeue_synchronous_signal(&ksig->info);
+ if (!signr)
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
@@ -2489,6 +2546,7 @@ relock:
continue;
}
+ fatal:
spin_unlock_irq(&sighand->siglock);
/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d28813306b2c..10277429ed84 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending)
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && (tsk->state == TASK_RUNNING);
+ return tsk && (tsk->state == TASK_RUNNING) &&
+ !__kthread_should_park(tsk);
}
/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..7578e21a711b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -224,6 +224,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#endif
static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_BPF_SYSCALL
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
@@ -1229,6 +1234,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &one,
},
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &sysctl_bpf_stats_enabled,
+ .maxlen = sizeof(sysctl_bpf_stats_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_bpf_stats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
{
@@ -3260,6 +3274,29 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
+#ifdef CONFIG_BPF_SYSCALL
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, bpf_stats = *(int *)table->data;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &bpf_stats;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ *(int *)table->data = bpf_stats;
+ if (bpf_stats)
+ static_branch_enable(&bpf_stats_enabled_key);
+ else
+ static_branch_disable(&bpf_stats_enabled_key);
+ }
+ return ret;
+}
+#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 58b981f4bb5d..e2c038d6c13c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -117,6 +117,35 @@ config NO_HZ_FULL
endchoice
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+ other dependencies to provide in order to make the full
+ dynticks working.
+
+ This option stands for testing when an arch implements the
+ context tracking backend but doesn't yet fullfill all the
+ requirements to make the full dynticks feature working.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0f5f96075110..41dfff23c1f9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -364,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 80f955210861..0a426f4e3125 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
int i;
u64 delta, incr;
- if (timer->it.cpu.incr == 0)
+ if (!timer->it_interval)
return;
if (now < timer->it.cpu.expires)
return;
- incr = timer->it.cpu.incr;
+ incr = timer->it_interval;
delta = now + incr - timer->it.cpu.expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
@@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
*/
wake_up_process(timer->it_process);
timer->it.cpu.expires = 0;
- } else if (timer->it.cpu.incr == 0) {
+ } else if (!timer->it_interval) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
@@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
*/
ret = 0;
- old_incr = timer->it.cpu.incr;
+ old_incr = timer->it_interval;
old_expires = timer->it.cpu.expires;
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
@@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
- timer->it_interval = ns_to_ktime(timer->it.cpu.incr);
+ timer->it_interval = timespec64_to_ktime(new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
if (!timer->it.cpu.expires)
return;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 803fa67aace9..ee834d4fb814 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -375,6 +375,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
+ /* fall through */
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 86489950d690..b73e8850e58d 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);
static int __init tk_debug_sleep_time_init(void)
{
- struct dentry *d;
-
- d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
- &tk_debug_sleep_time_fops);
- if (!d) {
- pr_err("Failed to create sleep_time debug file\n");
- return -ENOMEM;
- }
-
+ debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
return 0;
}
late_initcall(tk_debug_sleep_time_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 444156debfa0..167e71f9ed3c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -647,7 +647,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8b068adb9da1..f1a86a0d881d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = __bpf_probe_register(btp, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return __bpf_probe_register(btp, prog);
}
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c521b7347482..c4238b441624 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3384,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
const char tgid_space[] = " ";
const char space[] = " ";
+ print_event_info(buf, m);
+
seq_printf(m, "# %s _-----=> irqs-off\n",
tgid ? tgid_space : space);
seq_printf(m, "# %s / _----=> need-resched\n",
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d5fb09ebba8b..9eaf07f99212 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -861,22 +861,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- mm_segment_t old_fs;
int ret, len = 0;
u8 c;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- pagefault_disable();
-
do {
- ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ ret = probe_mem_read(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
- pagefault_enable();
- set_fs(old_fs);
-
return (ret < 0) ? ret : len;
}
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 5c56afc17cf8..4737bb8c07a3 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
ret = process_fetch_insn(arg->code, regs, dl, base);
- if (unlikely(ret < 0 && arg->dynamic))
+ if (unlikely(ret < 0 && arg->dynamic)) {
*dl = make_data_loc(0, dyndata - base);
- else
+ } else {
dyndata += ret;
+ maxlen -= ret;
+ }
}
}