From cf2c6f08632f127fcab808224f80ea1d3709f242 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:36:20 -0700 Subject: bpf: Sync tools/include/uapi/linux/bpf.h Commit 47316f4a3053 missed updating tools/.../bpf.h. Sync it. Fixes: 47316f4a3053 ("bpf: Support input xdp_md context in BPF_PROG_TEST_RUN") Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf9252c7381e..b46a383e8db7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -324,9 +324,6 @@ union bpf_iter_link_info { * **BPF_PROG_TYPE_SK_LOOKUP** * *data_in* and *data_out* must be NULL. * - * **BPF_PROG_TYPE_XDP** - * *ctx_in* and *ctx_out* must be NULL. - * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * -- cgit v1.2.3-59-g8ed1b From f170acda7ffaf0473d06e1e17c12cd9fd63904f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 14 Jul 2021 21:43:17 +0900 Subject: bpf: Fix a typo of reuseport map in bpf.h. Fix s/BPF_MAP_TYPE_REUSEPORT_ARRAY/BPF_MAP_TYPE_REUSEPORT_SOCKARRAY/ typo in bpf.h. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210714124317.67526-1-kuniyu@amazon.co.jp --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b46a383e8db7..bafb6282032b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3246,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b46a383e8db7..bafb6282032b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3246,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return -- cgit v1.2.3-59-g8ed1b From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how BPF program might look like: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. */ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The amount of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs explicit 'map' argument because inner maps are dynamic and not known at load time. While the bpf_timer_set_callback() is receiving hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcnting of bpf program to make sure that program doesn't get freed while the timer is armed. This approach relies on "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping prog refcnt when user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of user space process will not leave timers running forever unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't have user references (is not held by open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if given map element had it allocated. "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' has 1 byte alignment of 8 byte padding. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 + include/uapi/linux/bpf.h | 73 ++++++++++ kernel/bpf/helpers.c | 324 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 109 ++++++++++++++ kernel/trace/bpf_trace.c | 2 +- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 73 ++++++++++ 7 files changed, 585 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4afbff308ca3..125240b7cefb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +void bpf_timer_cancel_and_free(void *timer); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -314,6 +316,7 @@ enum bpf_arg_type { ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 38be3cfc2f58..74b16593983d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -999,6 +999,322 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +/* BPF map elements can contain 'struct bpf_timer'. + * Such map owns all of its BPF timers. + * 'struct bpf_timer' is allocated as part of map element allocation + * and it's zero initialized. + * That space is used to keep 'struct bpf_timer_kern'. + * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and + * remembers 'struct bpf_map *' pointer it's part of. + * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. + * bpf_timer_start() arms the timer. + * If user space reference to a map goes to zero at this point + * ops->map_release_uref callback is responsible for cancelling the timers, + * freeing their memory, and decrementing prog's refcnts. + * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. + * Inner maps can contain bpf timers as well. ops->map_release_uref is + * freeing the timers when inner map is replaced or deleted by user space. + */ +struct bpf_hrtimer { + struct hrtimer timer; + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; +}; + +/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_timer_kern { + struct bpf_hrtimer *timer; + /* bpf_spin_lock is used here instead of spinlock_t to make + * sure that it always fits into space resereved by struct bpf_timer + * regardless of LOCKDEP and spinlock debug flags. + */ + struct bpf_spin_lock lock; +} __attribute__((aligned(8))); + +static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); + +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) +{ + struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); + struct bpf_map *map = t->map; + void *value = t->value; + void *callback_fn; + void *key; + u32 idx; + int ret; + + callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + if (!callback_fn) + goto out; + + /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and + * cannot be preempted by another bpf_timer_cb() on the same cpu. + * Remember the timer this callback is servicing to prevent + * deadlock if callback_fn() calls bpf_timer_cancel() or + * bpf_map_delete_elem() on the same timer. + */ + this_cpu_write(hrtimer_running, t); + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, + (u64)(long)value, 0, 0); + WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + + this_cpu_write(hrtimer_running, NULL); +out: + return HRTIMER_NORESTART; +} + +BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_hrtimer *t; + int ret = 0; + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + + if (in_nmi()) + return -EOPNOTSUPP; + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (t) { + ret = -EBUSY; + goto out; + } + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + ret = -EPERM; + goto out; + } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); + if (!t) { + ret = -ENOMEM; + goto out; + } + t->value = (void *)timer - map->timer_off; + t->map = map; + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + timer->timer = t; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_init_proto = { + .func = bpf_timer_init, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + struct bpf_prog *prev, *prog = aux->prog; + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (!atomic64_read(&t->map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. Otherwise timer might still be + * running even when bpf prog is detached and user space + * is gone, since map_release_uref won't ever be called. + */ + ret = -EPERM; + goto out; + } + prev = t->prog; + if (prev != prog) { + /* Bump prog refcnt once. Every bpf_timer_set_callback() + * can pick different callback_fn-s within the same prog. + */ + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto out; + } + if (prev) + /* Drop prev prog refcnt when swapping with new prog */ + bpf_prog_put(prev); + t->prog = prog; + } + rcu_assign_pointer(t->callback_fn, callback_fn); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_set_callback_proto = { + .func = bpf_timer_set_callback, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_PTR_TO_FUNC, +}; + +BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t || !t->prog) { + ret = -EINVAL; + goto out; + } + hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_start_proto = { + .func = bpf_timer_start, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static void drop_prog_refcnt(struct bpf_hrtimer *t) +{ + struct bpf_prog *prog = t->prog; + + if (prog) { + bpf_prog_put(prog); + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + } +} + +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (this_cpu_read(hrtimer_running) == t) { + /* If bpf callback_fn is trying to bpf_timer_cancel() + * its own timer the hrtimer_cancel() will deadlock + * since it waits for callback_fn to finish + */ + ret = -EDEADLK; + goto out; + } + drop_prog_refcnt(t); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + /* Cancel the timer and wait for associated callback to finish + * if it was running. + */ + ret = ret ?: hrtimer_cancel(&t->timer); + return ret; +} + +static const struct bpf_func_proto bpf_timer_cancel_proto = { + .func = bpf_timer_cancel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, +}; + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_timer_kern *timer = val; + struct bpf_hrtimer *t; + + /* Performance optimization: read timer->timer without lock first. */ + if (!READ_ONCE(timer->timer)) + return; + + __bpf_spin_lock_irqsave(&timer->lock); + /* re-read it under lock */ + t = timer->timer; + if (!t) + goto out; + drop_prog_refcnt(t); + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. + */ + timer->timer = NULL; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + if (!t) + return; + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call kfree(t) + * right after for both preallocated and non-preallocated maps. + * The timer->timer = NULL was already done and no code path can + * see address 't' anymore. + * + * Check that bpf_map_delete/update_elem() wasn't called from timer + * callback_fn. In such case don't call hrtimer_cancel() (since it will + * deadlock) and don't call hrtimer_try_to_cancel() (since it will just + * return -1). Though callback_fn is still running on this cpu it's + * safe to do kfree(t) because bpf_timer_cb() read everything it needed + * from 't'. The bpf subprog callback_fn won't be able to access 't', + * since timer->timer = NULL was already done. The timer will be + * effectively cancelled because bpf_timer_cb() will return + * HRTIMER_NORESTART. + */ + if (this_cpu_read(hrtimer_running) != t) + hrtimer_cancel(&t->timer); + kfree(t); +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1065,6 +1381,14 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_timer_init: + return &bpf_timer_init_proto; + case BPF_FUNC_timer_set_callback: + return &bpf_timer_set_callback_proto; + case BPF_FUNC_timer_start: + return &bpf_timer_start_proto; + case BPF_FUNC_timer_cancel: + return &bpf_timer_cancel_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3dbb3b40b754..e8645c819803 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4656,6 +4656,38 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return 0; } +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", + map->name); + return -EINVAL; + } + if (val) { + /* This restriction will be removed in the next patch */ + verbose(env, "bpf_timer field can only be first in the map value element\n"); + return -EINVAL; + } + if (meta->map_ptr) { + verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + return -EFAULT; + } + meta->map_ptr = map; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -4788,6 +4820,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4819,6 +4852,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, + [ARG_PTR_TO_TIMER] = &timer_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4948,6 +4982,10 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { + verbose(env, "Map pointer doesn't match bpf_timer.\n"); + return -EINVAL; + } meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: @@ -5000,6 +5038,9 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_TIMER) { + if (process_timer_func(env, regno, meta)) + return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { @@ -5742,6 +5783,34 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_timer_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6069,6 +6138,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_timer_set_callback) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_timer_callback_state); + if (err < 0) + return -EINVAL; + } + if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) @@ -12591,6 +12667,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_timer_set_callback) { + /* The verifier will process callback_fn as many times as necessary + * with different maps and the register states prepared by + * set_timer_callback_state will be accurate. + * + * The following use case is valid: + * map1 is shared by prog1, prog2, prog3. + * prog1 calls bpf_timer_init for some map1 elements + * prog2 calls bpf_timer_set_callback for some map1 elements. + * Those that were not bpf_timer_init-ed will return -EINVAL. + * prog3 calls bpf_timer_start for some map1 elements. + * Those that were not both bpf_timer_init-ed and + * bpf_timer_set_callback-ed will return -EINVAL. + */ + struct bpf_insn ld_addrs[2] = { + BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), + }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64bd2d84367f..6c77d25137e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_snprintf: return &bpf_snprintf_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 2d94025b38e9..00ac7b79cddb 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -547,6 +547,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', ] known_types = { '...', @@ -594,6 +595,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3-59-g8ed1b From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:55 +0200 Subject: bpf: Add bpf_get_func_ip helper for tracing programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs, specifically for all trampoline attach types. The trampoline's caller IP address is stored in (ctx - 8) address. so there's no reason to actually call the helper, but rather fixup the call instruction and return [ctx - 8] value directly. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 72 insertions(+) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 344ee67265cc..ceef190514e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6161,6 +6161,27 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, return err; } +static int check_get_func_ip(struct bpf_verifier_env *env) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + int func_id = BPF_FUNC_get_func_ip; + + if (type == BPF_PROG_TYPE_TRACING) { + if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && + eatype != BPF_MODIFY_RETURN) { + verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + func_id_name(func_id), func_id); + return -ENOTSUPP; + } + return 0; + } + + verbose(env, "func %s#%d not supported for program type %d\n", + func_id_name(func_id), func_id, type); + return -ENOTSUPP; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6439,6 +6460,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; + if (func_id == BPF_FUNC_get_func_ip) { + if (check_get_func_ip(env)) + return -ENOTSUPP; + env->prog->call_get_func_ip = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12632,6 +12659,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); + enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -12998,6 +13026,21 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_ip inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ip) { + /* Load IP address from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c77d25137e0..3e71503eeb23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -948,6 +948,19 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { + .func = bpf_get_func_ip_tracing, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1058,6 +1071,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_tracing; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-59-g8ed1b From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:56 +0200 Subject: bpf: Add bpf_get_func_ip helper for kprobe programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs, so it's now possible to call bpf_get_func_ip from both kprobe and kretprobe programs. Taking the caller's address from 'struct kprobe::addr', which is defined for both kprobe and kretprobe. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 2 ++ kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceef190514e4..97216f799ba8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6175,6 +6175,8 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } return 0; + } else if (type == BPF_PROG_TYPE_KPROBE) { + return 0; } verbose(env, "func %s#%d not supported for program type %d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e71503eeb23..0b113716bc7a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -961,6 +961,20 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) +{ + struct kprobe *kp = kprobe_running(); + + return kp ? (u64) kp->addr : 0; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { + .func = bpf_get_func_ip_kprobe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1092,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_kprobe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ -- cgit v1.2.3-59-g8ed1b From 3a755cd8b7c601f756cbbf908b84f7cc8c04a02b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 2 Aug 2021 11:02:19 +0800 Subject: bonding: add new option lacp_active Add an option lacp_active, which is similar with team's runner.active. This option specifies whether to send LACPDU frames periodically. If set on, the LACPDU frames are sent along with the configured lacp_rate setting. If set off, the LACPDU frames acts as "speak when spoken to". Note, the LACPDU state frames still will be sent when init or unbind port. v2: remove module parameter Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 12 ++++++++++++ drivers/net/bonding/bond_3ad.c | 11 ++++++----- drivers/net/bonding/bond_main.c | 1 + drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_procfs.c | 2 ++ drivers/net/bonding/bond_sysfs.c | 25 ++++++++++++++++++++----- include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/net/bonding.h | 1 + include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 12 files changed, 89 insertions(+), 10 deletions(-) (limited to 'tools/include') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 62f2aab8eaec..31cfd7d674a6 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -501,6 +501,18 @@ fail_over_mac This option was added in bonding version 3.2.0. The "follow" policy was added in bonding version 3.3.0. +lacp_active + Option specifying whether to send LACPDU frames periodically. + + off or 0 + LACPDU frames acts as "speak when spoken to". + + on or 1 + LACPDU frames are sent along the configured links + periodically. See lacp_rate for more details. + + The default is on. + lacp_rate Option specifying the rate in which we'll ask our link partner diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 6908822d9773..a4a202b9a0a2 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -96,7 +96,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port); +static void ad_periodic_machine(struct port *port, struct bond_params bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); @@ -1294,10 +1294,11 @@ static void ad_tx_machine(struct port *port) /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at + * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ -static void ad_periodic_machine(struct port *port) +static void ad_periodic_machine(struct port *port, struct bond_params bond_params) { periodic_states_t last_state; @@ -1306,8 +1307,8 @@ static void ad_periodic_machine(struct port *port) /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || - (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) - ) { + (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || + !bond_params.lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -2341,7 +2342,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port); + ad_periodic_machine(port, bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 616ebbb08ca6..3ba5f4871162 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5478,6 +5478,7 @@ static int bond_check_params(struct bond_params *params) params->downdelay = downdelay; params->peer_notif_delay = 0; params->use_carrier = use_carrier; + params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 0561ece1ba45..5d54e11d18fa 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -100,6 +100,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MIN_LINKS] = { .type = NLA_U32 }, [IFLA_BOND_LP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_PACKETS_PER_SLAVE] = { .type = NLA_U32 }, + [IFLA_BOND_AD_LACP_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_AD_LACP_RATE] = { .type = NLA_U8 }, [IFLA_BOND_AD_SELECT] = { .type = NLA_U8 }, [IFLA_BOND_AD_INFO] = { .type = NLA_NESTED }, @@ -387,6 +388,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BOND_AD_LACP_ACTIVE]) { + int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]); + + bond_opt_initval(&newval, lacp_active); + err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval); + if (err) + return err; + } + if (data[IFLA_BOND_AD_LACP_RATE]) { int lacp_rate = nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]); @@ -490,6 +501,7 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIN_LINKS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_LP_INTERVAL */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PACKETS_PER_SLAVE */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_ACTIVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_RATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_SELECT */ nla_total_size(sizeof(struct nlattr)) + /* IFLA_BOND_AD_INFO */ @@ -622,6 +634,10 @@ static int bond_fill_info(struct sk_buff *skb, packets_per_slave)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_AD_LACP_ACTIVE, + bond->params.lacp_active)) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_AD_LACP_RATE, bond->params.lacp_fast)) goto nla_put_failure; diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 0cf25de6f46d..a8fde3bc458f 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -58,6 +58,8 @@ static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_lacp_active_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_select_set(struct bonding *bond, @@ -135,6 +137,12 @@ static const struct bond_opt_value bond_intmax_tbl[] = { { NULL, -1, 0} }; +static const struct bond_opt_value bond_lacp_active[] = { + { "off", 0, 0}, + { "on", 1, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0} +}; + static const struct bond_opt_value bond_lacp_rate_tbl[] = { { "slow", AD_LACP_SLOW, 0}, { "fast", AD_LACP_FAST, 0}, @@ -283,6 +291,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_intmax_tbl, .set = bond_option_updelay_set }, + [BOND_OPT_LACP_ACTIVE] = { + .id = BOND_OPT_LACP_ACTIVE, + .name = "lacp_active", + .desc = "Send LACPDU frames with configured lacp rate or acts as speak when spoken to", + .flags = BOND_OPTFLAG_IFDOWN, + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_lacp_active, + .set = bond_option_lacp_active_set + }, [BOND_OPT_LACP_RATE] = { .id = BOND_OPT_LACP_RATE, .name = "lacp_rate", @@ -1333,6 +1350,16 @@ static int bond_option_pps_set(struct bonding *bond, return 0; } +static int bond_option_lacp_active_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", + newval->string, newval->value); + bond->params.lacp_active = newval->value; + + return 0; +} + static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 0fb1da361bb1..f3e3bfd72556 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -133,6 +133,8 @@ static void bond_info_show_master(struct seq_file *seq) struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); + seq_printf(seq, "LACP active: %s\n", + (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 5f9e9a240226..b9e9842fed94 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -339,10 +339,24 @@ static ssize_t bonding_show_peer_notif_delay(struct device *d, static DEVICE_ATTR(peer_notif_delay, 0644, bonding_show_peer_notif_delay, bonding_sysfs_store_option); -/* Show the LACP interval. */ -static ssize_t bonding_show_lacp(struct device *d, - struct device_attribute *attr, - char *buf) +/* Show the LACP activity and interval. */ +static ssize_t bonding_show_lacp_active(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + const struct bond_opt_value *val; + + val = bond_opt_get_val(BOND_OPT_LACP_ACTIVE, bond->params.lacp_active); + + return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_active); +} +static DEVICE_ATTR(lacp_active, 0644, + bonding_show_lacp_active, bonding_sysfs_store_option); + +static ssize_t bonding_show_lacp_rate(struct device *d, + struct device_attribute *attr, + char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; @@ -352,7 +366,7 @@ static ssize_t bonding_show_lacp(struct device *d, return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_fast); } static DEVICE_ATTR(lacp_rate, 0644, - bonding_show_lacp, bonding_sysfs_store_option); + bonding_show_lacp_rate, bonding_sysfs_store_option); static ssize_t bonding_show_min_links(struct device *d, struct device_attribute *attr, @@ -738,6 +752,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_downdelay.attr, &dev_attr_updelay.attr, &dev_attr_peer_notif_delay.attr, + &dev_attr_lacp_active.attr, &dev_attr_lacp_rate.attr, &dev_attr_ad_select.attr, &dev_attr_xmit_hash_policy.attr, diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c8696a230b7d..38785d48baff 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -303,6 +303,7 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond, int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 9d382f2f0bc5..e64833a674eb 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -64,6 +64,7 @@ enum { BOND_OPT_AD_USER_PORT_KEY, BOND_OPT_NUM_PEER_NOTIF_ALIAS, BOND_OPT_PEER_NOTIF_DELAY, + BOND_OPT_LACP_ACTIVE, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 625d9c72dee3..46df47004803 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -129,6 +129,7 @@ struct bond_params { int updelay; int downdelay; int peer_notif_delay; + int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 49b22afab78f..5310003523ce 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -855,6 +855,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d208b2af697f..eb15f319aa57 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -653,6 +653,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; -- cgit v1.2.3-59-g8ed1b From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/linux/bpf_types.h | 3 ++ include/linux/trace_events.h | 3 ++ include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 105 ++++++++++++++++++++++++++++++++++++++--- kernel/events/core.c | 10 ++-- tools/include/uapi/linux/bpf.h | 2 + 6 files changed, 112 insertions(+), 13 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ae3ac3a2018c..9c81724e4b98 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif +#ifdef CONFIG_PERF_EVENTS +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) +#endif diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..8ac92560d3a3 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_free_bpf_prog(struct perf_event *event); + void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a2068e39d23..80c03bedd6e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = { .fill_link_info = bpf_raw_tp_link_fill_link_info, }; +#ifdef CONFIG_PERF_EVENTS +struct bpf_perf_link { + struct bpf_link link; + struct file *perf_file; +}; + +static void bpf_perf_link_release(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + struct perf_event *event = perf_link->perf_file->private_data; + + perf_event_free_bpf_prog(event); + fput(perf_link->perf_file); +} + +static void bpf_perf_link_dealloc(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + + kfree(perf_link); +} + +static const struct bpf_link_ops bpf_perf_link_lops = { + .release = bpf_perf_link_release, + .dealloc = bpf_perf_link_dealloc, +}; + +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_perf_link *link; + struct perf_event *event; + struct file *perf_file; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + perf_file = perf_event_get(attr->link_create.target_fd); + if (IS_ERR(perf_file)) + return PTR_ERR(perf_file); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_file; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + link->perf_file = perf_file; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_file; + } + + event = perf_file->private_data; + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_file; + } + /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out_put_file: + fput(perf_file); + return err; +} +#endif /* CONFIG_PERF_EVENTS */ + #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) @@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (ret) goto out; - if (prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_EXT: ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; - } - - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + if (attr->link_create.attach_type != BPF_PERF_EVENT) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; + default: + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } + break; } switch (ptype) { @@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif +#ifdef CONFIG_PERF_EVENTS + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_KPROBE: + ret = bpf_perf_link_attach(attr, prog); + break; #endif default: ret = -EINVAL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07718bd41c..9fd65667bcb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4697,7 +4697,6 @@ errout: } static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; @@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr return perf_event_attach_bpf_prog(event, prog); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -10066,12 +10064,12 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3-59-g8ed1b From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- drivers/media/rc/bpf-lirc.c | 4 ++-- include/linux/bpf.h | 16 +++++++++++++++- include/linux/perf_event.h | 1 + include/linux/trace_events.h | 6 +++--- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/core.c | 29 ++++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 21 ++++++++++++++------- kernel/trace/bpf_trace.c | 8 +++++--- tools/include/uapi/linux/bpf.h | 7 +++++++ 10 files changed, 73 insertions(+), 28 deletions(-) (limited to 'tools/include') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index bb5a9dc78f1b..3eff08d7b8e5 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array); if (ret < 0) goto unlock; @@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) } old_array = lirc_rcu_dereference(raw->progs); - ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array); /* * Do not use bpf_prog_array_delete_safe() as we would end up * with a dummy entry in the array, and the we would free the diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 344e0d4d8ef6..83c3cc5e90df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + union { + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u64 bpf_cookie; + }; }; struct bpf_prog_array { @@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; @@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx { const struct bpf_prog_array_item *prog_item; }; +struct bpf_trace_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; @@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; u32 ret = 1; migrate_disable(); @@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, array = rcu_dereference(array_rcu); if (unlikely(!array)) goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { + run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } + bpf_reset_run_ctx(old_run_ctx); out: rcu_read_unlock(); migrate_enable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..fe156a8170aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -762,6 +762,7 @@ struct perf_event { #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; + u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8ac92560d3a3..8e0631a4b046 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); -int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); @@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c } static inline int -perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ee2ec27c3d4..91f24c7b38a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; - int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. @@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; + new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; - for (; existing->prog; existing++) - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog || + existing->prog == &dummy_bpf_prog.prog) + continue; + + new->prog = existing->prog; + new->bpf_cookie = existing->bpf_cookie; + new++; + } } - if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + if (include_prog) { + new->prog = include_prog; + new->bpf_cookie = bpf_cookie; + new++; + } + new->prog = NULL; *new_array = array; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 80c03bedd6e6..7420e1334ab2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } event = perf_file->private_data; - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9fd65667bcb2..2d1e63dd97f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -9936,7 +9936,9 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog } event->prog = prog; + event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; @@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog); + return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) return -EACCES; } - return perf_event_attach_bpf_prog(event, prog); + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) @@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91867b14b222..57879d28f824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, - struct bpf_prog *prog) + struct bpf_prog *prog, + u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; @@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; + event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); @@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); - ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret == -ENOENT) goto unlock; if (ret < 0) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; -- cgit v1.2.3-59-g8ed1b From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs.So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/linux/bpf.h | 3 --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c3cc5e90df..f4c16f19f83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; -const struct bpf_func_proto *bpf_tracing_func_proto( - enum bpf_func_id func_id, const struct bpf_prog *prog); - const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 57879d28f824..cbc73c08c4a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; -const struct bpf_func_proto * +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-59-g8ed1b From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/linux/bpf-cgroup.h | 182 ++++++++++++++++++++++++++++------------- include/uapi/linux/bpf.h | 2 +- kernel/bpf/cgroup.c | 156 +++++++++++++++++++++-------------- net/ipv4/af_inet.c | 6 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 6 +- net/ipv6/udp.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 8 files changed, 226 insertions(+), 132 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a74cd1c3bd87..2746fd804216 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,9 +23,73 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE -extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -67,15 +131,15 @@ struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; @@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, \ &__unused_flags); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, \ &__unused_flags); \ release_sock(sk); \ @@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ @@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ }) @@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8e9d99e2ade4..03145d45e3d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; - unsigned int type; + unsigned int atype; mutex_lock(&cgroup_mutex); - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp; list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p; @@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; @@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp, /* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); @@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, } static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ cleanup: } static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; - err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; } - activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -520,7 +527,7 @@ cleanup: * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp, /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false; + atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL; @@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err; + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; @@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] = 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; - effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } - err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { @@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; @@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); @@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; @@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e4d758c2585..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d92c90d97763..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1475,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { -- cgit v1.2.3-59-g8ed1b From f2e85d4a751663514c1e84ea65f334ce1ca13a28 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:50 +0530 Subject: tools: include: Add ethtool_drvinfo definition to UAPI header Instead of copying the whole header in, just add the struct definitions we need for now. In the future it can be synced as a copy of in-tree header if required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-3-memxor@gmail.com --- tools/include/uapi/linux/ethtool.h | 53 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h index c86c3e942df9..47afae3895ec 100644 --- a/tools/include/uapi/linux/ethtool.h +++ b/tools/include/uapi/linux/ethtool.h @@ -48,4 +48,57 @@ struct ethtool_channels { __u32 combined_count; }; +#define ETHTOOL_FWVERS_LEN 32 +#define ETHTOOL_BUSINFO_LEN 32 +#define ETHTOOL_EROMVERS_LEN 32 + +/** + * struct ethtool_drvinfo - general driver and device information + * @cmd: Command number = %ETHTOOL_GDRVINFO + * @driver: Driver short name. This should normally match the name + * in its bus driver structure (e.g. pci_driver::name). Must + * not be an empty string. + * @version: Driver version string; may be an empty string + * @fw_version: Firmware version string; may be an empty string + * @erom_version: Expansion ROM version string; may be an empty string + * @bus_info: Device bus address. This should match the dev_name() + * string for the underlying bus device, if there is one. May be + * an empty string. + * @reserved2: Reserved for future use; see the note on reserved space. + * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and + * %ETHTOOL_SPFLAGS commands; also the number of strings in the + * %ETH_SS_PRIV_FLAGS set + * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS + * command; also the number of strings in the %ETH_SS_STATS set + * @testinfo_len: Number of results returned by the %ETHTOOL_TEST + * command; also the number of strings in the %ETH_SS_TEST set + * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM + * and %ETHTOOL_SEEPROM commands, in bytes + * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS + * command, in bytes + * + * Users can use the %ETHTOOL_GSSET_INFO command to get the number of + * strings in any string set (from Linux 2.6.34). + * + * Drivers should set at most @driver, @version, @fw_version and + * @bus_info in their get_drvinfo() implementation. The ethtool + * core fills in the other fields using other driver operations. + */ +struct ethtool_drvinfo { + __u32 cmd; + char driver[32]; + char version[32]; + char fw_version[ETHTOOL_FWVERS_LEN]; + char bus_info[ETHTOOL_BUSINFO_LEN]; + char erom_version[ETHTOOL_EROMVERS_LEN]; + char reserved2[12]; + __u32 n_priv_flags; + __u32 n_stats; + __u32 testinfo_len; + __u32 eedump_len; + __u32 regdump_len; +}; + +#define ETHTOOL_GDRVINFO 0x00000003 + #endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3-59-g8ed1b From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 36 insertions(+) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 609674f409ed..c227b7d4f56c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e54f3dc209f..580e14ee7ff9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .ret_btf_id = &btf_task_struct_ids[0], }; +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-59-g8ed1b From 49b99da2c9ce13ffcd93fe3a0f5670791c1d76f7 Mon Sep 17 00:00:00 2001 From: Rocco Yue Date: Fri, 27 Aug 2021 23:04:12 +0800 Subject: ipv6: add IFLA_INET6_RA_MTU to expose mtu value The kernel provides a "/proc/sys/net/ipv6/conf//mtu" file, which can temporarily record the mtu value of the last received RA message when the RA mtu value is lower than the interface mtu, but this proc has following limitations: (1) when the interface mtu (/sys/class/net//mtu) is updeated, mtu6 (/proc/sys/net/ipv6/conf//mtu) will be updated to the value of interface mtu; (2) mtu6 (/proc/sys/net/ipv6/conf//mtu) only affect ipv6 connection, and not affect ipv4. Therefore, when the mtu option is carried in the RA message, there will be a problem that the user sometimes cannot obtain RA mtu value correctly by reading mtu6. After this patch set, if a RA message carries the mtu option, you can send a netlink msg which nlmsg_type is RTM_GETLINK, and then by parsing the attribute of IFLA_INET6_RA_MTU to get the mtu value carried in the RA message received on the inet6 device. In addition, you can also get a link notification when ra_mtu is updated so it doesn't have to poll. In this way, if the MTU values that the device receives from the network in the PCO IPv4 and the RA IPv6 procedures are different, the user can obtain the correct ipv6 ra_mtu value and compare the value of ra_mtu and ipv4 mtu, then the device can use the lower MTU value for both IPv4 and IPv6. Signed-off-by: Rocco Yue Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski --- include/net/if_inet6.h | 2 ++ include/uapi/linux/if_link.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 17 +++++++++++------ tools/include/uapi/linux/if_link.h | 1 + 5 files changed, 25 insertions(+), 6 deletions(-) (limited to 'tools/include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 42235c178b06..653e7d0f65cb 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -210,6 +210,8 @@ struct inet6_dev { unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; + + unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8aad65b69054..eebd3894fe89 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -417,6 +417,7 @@ enum { IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ __IFLA_INET6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8381288a0d6e..17756f3ed33b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -394,6 +394,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; ndev->cnf.mtu6 = dev->mtu; + ndev->ra_mtu = 0; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (!ndev->nd_parms) { kfree(ndev); @@ -3849,6 +3850,7 @@ restart: } idev->tstamp = jiffies; + idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ if (unregister) { @@ -5543,6 +5545,7 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + nla_total_size(4) /* IFLA_INET6_RA_MTU */ + 0; } @@ -5651,6 +5654,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; + if (idev->ra_mtu && + nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -5767,6 +5774,9 @@ update_lft: static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 }, [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, + [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT, + .reject_message = + "IFLA_INET6_RA_MTU can not be set" }, }; static int check_addr_gen_mode(int mode) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c467c6419893..4b098521a44c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1391,12 +1391,6 @@ skip_defrtr: } } - /* - * Send a notify if RA changed managed/otherconf flags or timer settings - */ - if (send_ifinfo_notify) - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); - skip_linkparms: /* @@ -1496,6 +1490,11 @@ skip_routeinfo: memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); + if (in6_dev->ra_mtu != mtu) { + in6_dev->ra_mtu = mtu; + send_ifinfo_notify = true; + } + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { @@ -1519,6 +1518,12 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: + /* Send a notify if RA changed managed/otherconf flags or + * timer settings or ra_mtu value + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + fib6_info_release(rt); if (neigh) neigh_release(neigh); diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index eb15f319aa57..b3610fdd1fee 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -230,6 +230,7 @@ enum { IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ __IFLA_INET6_MAX }; -- cgit v1.2.3-59-g8ed1b