aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 13:38:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 13:38:03 -0700
commit0326074ff4652329f2a1a9c8685104576bd8d131 (patch)
tree9a7574c7ccb05bf4c7cb34fc5a65457bb8f495cb /kernel
parentMerge tag 'landlock-6.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux (diff)
parenteth: pse: add missing static inlines (diff)
downloadlinux-dev-0326074ff4652329f2a1a9c8685104576bd8d131.tar.xz
linux-dev-0326074ff4652329f2a1a9c8685104576bd8d131.zip
Merge tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Introduce and use a single page frag cache for allocating small skb heads, clawing back the 10-20% performance regression in UDP flood test from previous fixes. - Run packets which already went thru HW coalescing thru SW GRO. This significantly improves TCP segment coalescing and simplifies deployments as different workloads benefit from HW or SW GRO. - Shrink the size of the base zero-copy send structure. - Move TCP init under a new slow / sleepable version of DO_ONCE(). BPF: - Add BPF-specific, any-context-safe memory allocator. - Add helpers/kfuncs for PKCS#7 signature verification from BPF programs. - Define a new map type and related helpers for user space -> kernel communication over a ring buffer (BPF_MAP_TYPE_USER_RINGBUF). - Allow targeting BPF iterators to loop through resources of one task/thread. - Add ability to call selected destructive functions. Expose crash_kexec() to allow BPF to trigger a kernel dump. Use CAP_SYS_BOOT check on the loading process to judge permissions. - Enable BPF to collect custom hierarchical cgroup stats efficiently by integrating with the rstat framework. - Support struct arguments for trampoline based programs. Only structs with size <= 16B and x86 are supported. - Invoke cgroup/connect{4,6} programs for unprivileged ICMP ping sockets (instead of just TCP and UDP sockets). - Add a helper for accessing CLOCK_TAI for time sensitive network related programs. - Support accessing network tunnel metadata's flags. - Make TCP SYN ACK RTO tunable by BPF programs with TCP Fast Open. - Add support for writing to Netfilter's nf_conn:mark. Protocols: - WiFi: more Extremely High Throughput (EHT) and Multi-Link Operation (MLO) work (802.11be, WiFi 7). - vsock: improve support for SO_RCVLOWAT. - SMC: support SO_REUSEPORT. - Netlink: define and document how to use netlink in a "modern" way. Support reporting missing attributes via extended ACK. - IPSec: support collect metadata mode for xfrm interfaces. - TCPv6: send consistent autoflowlabel in SYN_RECV state and RST packets. - TCP: introduce optional per-netns connection hash table to allow better isolation between namespaces (opt-in, at the cost of memory and cache pressure). - MPTCP: support TCP_FASTOPEN_CONNECT. - Add NEXT-C-SID support in Segment Routing (SRv6) End behavior. - Adjust IP_UNICAST_IF sockopt behavior for connected UDP sockets. - Open vSwitch: - Allow specifying ifindex of new interfaces. - Allow conntrack and metering in non-initial user namespace. - TLS: support the Korean ARIA-GCM crypto algorithm. - Remove DECnet support. Driver API: - Allow selecting the conduit interface used by each port in DSA switches, at runtime. - Ethernet Power Sourcing Equipment and Power Device support. - Add tc-taprio support for queueMaxSDU parameter, i.e. setting per traffic class max frame size for time-based packet schedules. - Support PHY rate matching - adapting between differing host-side and link-side speeds. - Introduce QUSGMII PHY mode and 1000BASE-KX interface mode. - Validate OF (device tree) nodes for DSA shared ports; make phylink-related properties mandatory on DSA and CPU ports. Enforcing more uniformity should allow transitioning to phylink. - Require that flash component name used during update matches one of the components for which version is reported by info_get(). - Remove "weight" argument from driver-facing NAPI API as much as possible. It's one of those magic knobs which seemed like a good idea at the time but is too indirect to use in practice. - Support offload of TLS connections with 256 bit keys. New hardware / drivers: - Ethernet: - Microchip KSZ9896 6-port Gigabit Ethernet Switch - Renesas Ethernet AVB (EtherAVB-IF) Gen4 SoCs - Analog Devices ADIN1110 and ADIN2111 industrial single pair Ethernet (10BASE-T1L) MAC+PHY. - Rockchip RV1126 Gigabit Ethernet (a version of stmmac IP). - Ethernet SFPs / modules: - RollBall / Hilink / Turris 10G copper SFPs - HALNy GPON module - WiFi: - CYW43439 SDIO chipset (brcmfmac) - CYW89459 PCIe chipset (brcmfmac) - BCM4378 on Apple platforms (brcmfmac) Drivers: - CAN: - gs_usb: HW timestamp support - Ethernet PHYs: - lan8814: cable diagnostics - Ethernet NICs: - Intel (100G): - implement control of FCS/CRC stripping - port splitting via devlink - L2TPv3 filtering offload - nVidia/Mellanox: - tunnel offload for sub-functions - MACSec offload, w/ Extended packet number and replay window offload - significantly restructure, and optimize the AF_XDP support, align the behavior with other vendors - Huawei: - configuring DSCP map for traffic class selection - querying standard FEC statistics - querying SerDes lane number via ethtool - Marvell/Cavium: - egress priority flow control - MACSec offload - AMD/SolarFlare: - PTP over IPv6 and raw Ethernet - small / embedded: - ax88772: convert to phylink (to support SFP cages) - altera: tse: convert to phylink - ftgmac100: support fixed link - enetc: standard Ethtool counters - macb: ZynqMP SGMII dynamic configuration support - tsnep: support multi-queue and use page pool - lan743x: Rx IP & TCP checksum offload - igc: add xdp frags support to ndo_xdp_xmit - Ethernet high-speed switches: - Marvell (prestera): - support SPAN port features (traffic mirroring) - nexthop object offloading - Microchip (sparx5): - multicast forwarding offload - QoS queuing offload (tc-mqprio, tc-tbf, tc-ets) - Ethernet embedded switches: - Marvell (mv88e6xxx): - support RGMII cmode - NXP (felix): - standardized ethtool counters - Microchip (lan966x): - QoS queuing offload (tc-mqprio, tc-tbf, tc-cbs, tc-ets) - traffic policing and mirroring - link aggregation / bonding offload - QUSGMII PHY mode support - Qualcomm 802.11ax WiFi (ath11k): - cold boot calibration support on WCN6750 - support to connect to a non-transmit MBSSID AP profile - enable remain-on-channel support on WCN6750 - Wake-on-WLAN support for WCN6750 - support to provide transmit power from firmware via nl80211 - support to get power save duration for each client - spectral scan support for 160 MHz - MediaTek WiFi (mt76): - WiFi-to-Ethernet bridging offload for MT7986 chips - RealTek WiFi (rtw89): - P2P support" * tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1864 commits) eth: pse: add missing static inlines once: rename _SLOW to _SLEEPABLE net: pse-pd: add regulator based PSE driver dt-bindings: net: pse-dt: add bindings for regulator based PoDL PSE controller ethtool: add interface to interact with Ethernet Power Equipment net: mdiobus: search for PSE nodes by parsing PHY nodes. net: mdiobus: fwnode_mdiobus_register_phy() rework error handling net: add framework to support Ethernet PSE and PDs devices dt-bindings: net: phy: add PoDL PSE property net: marvell: prestera: Propagate nh state from hw to kernel net: marvell: prestera: Add neighbour cache accounting net: marvell: prestera: add stub handler neighbour events net: marvell: prestera: Add heplers to interact with fib_notifier_info net: marvell: prestera: Add length macros for prestera_ip_addr net: marvell: prestera: add delayed wq and flush wq on deinit net: marvell: prestera: Add strict cleanup of fib arbiter net: marvell: prestera: Add cleanup of allocated fib_nodes net: marvell: prestera: Add router nexthops ABI eth: octeon: fix build after netif_napi_add() changes net/mlx5: E-Switch, Return EBUSY if can't get mode lock ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c33
-rw-r--r--kernel/bpf/bpf_iter.c10
-rw-r--r--kernel/bpf/bpf_local_storage.c10
-rw-r--r--kernel/bpf/bpf_lsm.c23
-rw-r--r--kernel/bpf/bpf_task_storage.c8
-rw-r--r--kernel/bpf/btf.c287
-rw-r--r--kernel/bpf/cgroup.c185
-rw-r--r--kernel/bpf/cgroup_iter.c282
-rw-r--r--kernel/bpf/core.c10
-rw-r--r--kernel/bpf/cpumap.c6
-rw-r--r--kernel/bpf/devmap.c6
-rw-r--r--kernel/bpf/dispatcher.c27
-rw-r--r--kernel/bpf/hashtab.c206
-rw-r--r--kernel/bpf/helpers.c120
-rw-r--r--kernel/bpf/local_storage.c5
-rw-r--r--kernel/bpf/lpm_trie.c4
-rw-r--r--kernel/bpf/memalloc.c635
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/percpu_freelist.c48
-rw-r--r--kernel/bpf/queue_stack_maps.c2
-rw-r--r--kernel/bpf/ringbuf.c253
-rw-r--r--kernel/bpf/syscall.c46
-rw-r--r--kernel/bpf/task_iter.c224
-rw-r--r--kernel/bpf/trampoline.c68
-rw-r--r--kernel/bpf/verifier.c588
-rw-r--r--kernel/cgroup/cgroup.c5
-rw-r--r--kernel/cgroup/rstat.c48
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/taskstats.c1
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/bpf_trace.c211
-rw-r--r--kernel/trace/ftrace.c3
33 files changed, 2691 insertions, 686 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70..341c94f208f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -24,6 +24,9 @@ endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 624527401d4d..832b2659e96e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -338,8 +339,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ check_and_free_fields(array, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -383,7 +385,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ check_and_free_fields(array, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -421,8 +424,20 @@ static void array_map_free(struct bpf_map *map)
int i;
if (map_value_has_kptrs(map)) {
- for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+ }
bpf_map_free_kptr_off_tab(map);
}
@@ -608,9 +623,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
pptr = v;
size = array->elem_size;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += size;
}
ctx.value = info->percpu_value_buf;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 24b755eca0b3..5dc307bdeaeb 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -202,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -689,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
if (prog->aux->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 8ce40fd869f6..802fc15b0d73 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
- __this_cpu_inc(*busy_counter);
+ this_cpu_inc(*busy_counter);
}
bpf_selem_unlink(selem, false);
if (busy_counter) {
- __this_cpu_dec(*busy_counter);
+ this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
@@ -582,7 +582,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
synchronize_rcu();
kvfree(smap->buckets);
- kfree(smap);
+ bpf_map_area_free(smap);
}
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
@@ -610,7 +610,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
unsigned int i;
u32 nbuckets;
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
@@ -623,7 +623,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap->buckets) {
- kfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 761998fda762..d6c9b3705f24 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks)
*/
BTF_SET_START(bpf_lsm_current_hooks)
/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
BTF_SET_END(bpf_lsm_current_hooks)
/* List of LSM hooks that trigger while the socket is properly locked.
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
/* List of LSM hooks that trigger while the socket is _not_ locked,
@@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
* in the early init phase.
*/
BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
#ifdef CONFIG_CGROUP_BPF
@@ -189,6 +195,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -212,15 +226,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
- case BPF_FUNC_get_local_storage:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_local_storage_proto : NULL;
- case BPF_FUNC_set_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_set_retval_proto : NULL;
- case BPF_FUNC_get_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_retval_proto : NULL;
#ifdef CONFIG_NET
case BPF_FUNC_setsockopt:
if (prog->expected_attach_type != BPF_LSM_CGROUP)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e9014dc62682..6f290623347e 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e64447659f3..eba603cec2c5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -208,7 +208,7 @@ enum btf_kfunc_hook {
};
enum {
- BTF_KFUNC_SET_MAX_CNT = 32,
+ BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
};
@@ -818,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
* Regular int is not a bit field and it must be either
@@ -1396,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
@@ -1412,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -3128,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -4854,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
@@ -4911,11 +4910,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
-
- return 0;
+ return btf_check_sec_info(env, btf_data_size);
}
static int btf_check_type_tags(struct btf_verifier_env *env,
@@ -5328,6 +5323,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
return btf_type_is_int(t);
}
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -5347,7 +5370,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = get_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -5398,7 +5421,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -5417,7 +5440,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_is_any_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -5425,7 +5448,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
@@ -5509,11 +5532,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
return true;
}
@@ -5864,26 +5887,25 @@ again:
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf_type_by_id(btf, 0);
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_is_any_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
@@ -5902,8 +5924,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
@@ -5917,10 +5941,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || __btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
@@ -5933,10 +5957,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
return -EINVAL;
}
if (ret == 0) {
@@ -5946,6 +5972,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
}
m->nr_args = nargs;
return 0;
@@ -6167,14 +6194,41 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
return true;
}
+static bool btf_is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const struct btf_type *t;
+ const char *param_name;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok,
- u32 kfunc_flags)
+ struct bpf_kfunc_arg_meta *kfunc_meta,
+ bool processing_call)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- bool rel = false, kptr_get = false, trusted_arg = false;
+ bool rel = false, kptr_get = false, trusted_args = false;
+ bool sleepable = false;
struct bpf_verifier_log *log = &env->log;
u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
@@ -6207,11 +6261,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (is_kfunc) {
+ if (is_kfunc && kfunc_meta) {
/* Only kfunc can be release func */
- rel = kfunc_flags & KF_RELEASE;
- kptr_get = kfunc_flags & KF_KPTR_GET;
- trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
+ rel = kfunc_meta->flags & KF_RELEASE;
+ kptr_get = kfunc_meta->flags & KF_KPTR_GET;
+ trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
+ sleepable = kfunc_meta->flags & KF_SLEEPABLE;
}
/* check that BTF function arguments match actual types that the
@@ -6221,9 +6276,42 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
+ bool obj_ptr = false;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (btf_type_is_scalar(t)) {
+ if (is_kfunc && kfunc_meta) {
+ bool is_buf_size = false;
+
+ /* check for any const scalar parameter of name "rdonly_buf_size"
+ * or "rdwr_buf_size"
+ */
+ if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+ "rdonly_buf_size")) {
+ kfunc_meta->r0_rdonly = true;
+ is_buf_size = true;
+ } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+ "rdwr_buf_size"))
+ is_buf_size = true;
+
+ if (is_buf_size) {
+ if (kfunc_meta->r0_size) {
+ bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ bpf_log(log, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ kfunc_meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ }
+
if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", regno);
@@ -6236,10 +6324,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* These register types have special constraints wrt ref_obj_id
+ * and offset checks. The rest of trusted args don't.
+ */
+ obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID ||
+ reg2btf_ids[base_type(reg->type)];
+
/* Check if argument must be a referenced pointer, args + i has
* been verified to be a pointer (after skipping modifiers).
+ * PTR_TO_CTX is ok without having non-zero ref_obj_id.
*/
- if (is_kfunc && trusted_arg && !reg->ref_obj_id) {
+ if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) {
bpf_log(log, "R%d must be referenced\n", regno);
return -EINVAL;
}
@@ -6248,12 +6343,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
/* Trusted args have the same offset checks as release arguments */
- if (trusted_arg || (rel && reg->ref_obj_id))
+ if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id))
arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
+ if (is_kfunc && reg->ref_obj_id) {
+ /* Ensure only one argument is referenced PTR_TO_BTF_ID */
+ if (ref_obj_id) {
+ bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id, ref_obj_id);
+ return -EFAULT;
+ }
+ ref_regno = regno;
+ ref_obj_id = reg->ref_obj_id;
+ }
+
/* kptr_get is only true for kfunc */
if (i == 0 && kptr_get) {
struct bpf_map_value_off_desc *off_desc;
@@ -6326,16 +6432,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- /* Ensure only one argument is referenced PTR_TO_BTF_ID */
- if (reg->ref_obj_id) {
- if (ref_obj_id) {
- bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id, ref_obj_id);
- return -EFAULT;
- }
- ref_regno = regno;
- ref_obj_id = reg->ref_obj_id;
- }
} else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
@@ -6347,7 +6443,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_t->name_off);
if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
reg->off, btf, ref_id,
- trusted_arg || (rel && reg->ref_obj_id))) {
+ trusted_args || (rel && reg->ref_obj_id))) {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
func_name, i,
btf_type_str(ref_t), ref_tname,
@@ -6355,21 +6451,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
- } else if (ptr_to_mem_ok) {
+ } else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
if (is_kfunc) {
bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+ bool arg_dynptr = btf_type_is_struct(ref_t) &&
+ !strcmp(ref_tname,
+ stringify_struct(bpf_dynptr_kern));
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
* When arg_mem_size is true, the pointer can be
* void *.
+ * Also permit initialized local dynamic pointers.
*/
if (!btf_type_is_scalar(ref_t) &&
!__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+ !arg_dynptr &&
(arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
bpf_log(log,
"arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
@@ -6377,6 +6478,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (arg_dynptr) {
+ if (reg->type != PTR_TO_STACK) {
+ bpf_log(log, "arg#%d pointer type %s %s not to stack\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s must be valid and initialized\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_type_expected(env, reg,
+ ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
/* Check for mem, len pair */
if (arg_mem_size) {
if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
@@ -6419,11 +6548,21 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
func_name);
return -EINVAL;
}
+
+ if (sleepable && !env->prog->aux->sleepable) {
+ bpf_log(log, "kernel function %s is sleepable but the program is not\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ if (kfunc_meta && ref_obj_id)
+ kfunc_meta->ref_obj_id = ref_obj_id;
+
/* returns argument register number > 0 in case of reference release kfunc */
return rel ? ref_regno : 0;
}
-/* Compare BTF of a function with given bpf_reg_state.
+/* Compare BTF of a function declaration with given bpf_reg_state.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - there is a type mismatch or BTF is not available.
@@ -6450,7 +6589,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false);
+
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ *
+ * NOTE: the code is duplicated from btf_check_subprog_arg_match()
+ * because btf_check_func_arg_match() is still doing both. Once that
+ * function is split in 2, we can call from here btf_check_subprog_arg_match()
+ * first, and then treat the calling part in a new code path.
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6464,9 +6646,9 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
- u32 kfunc_flags)
+ struct bpf_kfunc_arg_meta *meta)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -6580,7 +6762,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
return 0;
@@ -7235,6 +7417,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4a400cd63731..bf2fdb33fb31 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
enum cgroup_bpf_attach_type from_atype, to_atype;
@@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int total_cnt = 0;
u32 flags;
+ if (effective_query && prog_attach_flags)
+ return -EINVAL;
+
if (type == BPF_LSM_CGROUP) {
- if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
return -EINVAL;
from_atype = CGROUP_LSM_START;
@@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
for (atype = from_atype; atype <= to_atype; atype++) {
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
total_cnt += bpf_prog_array_length(effective);
@@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
}
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
@@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
@@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (++i == cnt)
break;
}
- }
- if (prog_attach_flags) {
- flags = cgrp->bpf.flags[atype];
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
- for (i = 0; i < cnt; i++)
- if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
- return -EFAULT;
- prog_attach_flags += cnt;
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
prog_ids += cnt;
@@ -1529,6 +1537,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return ret;
}
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_0(bpf_get_retval)
{
struct bpf_cg_run_ctx *ctx =
@@ -1560,32 +1599,26 @@ const struct bpf_func_proto bpf_set_retval_proto = {
};
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_retval:
- return &bpf_get_retval_proto;
- case BPF_FUNC_set_retval:
- return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -2098,11 +2131,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -2113,8 +2152,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_set_new_value_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2235,6 +2276,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2256,8 +2307,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2422,3 +2475,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..0d200a993489
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ p->start_css = &cgrp->self;
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3d9eb3ae334c..711fd293b6de 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -825,6 +825,11 @@ struct bpf_prog_pack {
unsigned long bitmap[];
};
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
static DEFINE_MUTEX(pack_mutex);
@@ -864,7 +869,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
return pack;
}
-static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
@@ -905,7 +910,7 @@ out:
return ptr;
}
-static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(struct bpf_binary_header *hdr)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
@@ -2623,6 +2628,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index f4860ac756cd..b5ba34ddd4b6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
@@ -118,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_cmap:
- kfree(cmap);
+ bpf_map_area_free(cmap);
return ERR_PTR(err);
}
@@ -623,7 +623,7 @@ static void cpu_map_free(struct bpf_map *map)
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a0e02b009487..f9a87dcc5535 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
- dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..fa64b80b8bca 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
+ void *old, *new, *tmp;
u32 noff;
int err;
@@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
}
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
@@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ u32 size = PAGE_SIZE;
+
+ bpf_arch_text_copy(d->image, &size, sizeof(size));
+ bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ d->image = NULL;
+ goto out;
+ }
bpf_image_ksym_add(d->image, &d->ksym);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6c530a5e560a..ed3f8a53603b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,6 +14,7 @@
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -67,24 +68,16 @@
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ raw_spinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -92,6 +85,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -99,7 +94,12 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -114,14 +114,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -133,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab)) {
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
- &htab->lockdep_key);
- } else {
- spin_lock_init(&htab->buckets[i].lock);
- lockdep_set_class(&htab->buckets[i].lock,
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
&htab->lockdep_key);
- }
cond_resched();
}
}
@@ -165,17 +154,14 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
hash = hash & HASHTAB_MAP_LOCK_MASK;
- migrate_disable();
+ preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
return -EBUSY;
}
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
*pflags = flags;
return 0;
@@ -186,12 +172,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
unsigned long flags)
{
hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -428,8 +411,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -491,7 +472,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
struct bpf_htab *htab;
int err, i;
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -550,6 +531,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
@@ -563,6 +567,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -570,12 +584,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -847,17 +865,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
check_and_free_fields(htab, l);
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -871,6 +881,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
@@ -879,9 +914,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -906,13 +940,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
+ if (!onallcpus) {
u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
@@ -963,19 +996,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = container_of(l, struct htab_elem, fnode);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_NOWAIT | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
@@ -986,18 +1016,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_NOWAIT | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1016,7 +1046,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -1416,6 +1446,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1426,6 +1460,7 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
@@ -1475,10 +1510,10 @@ static void htab_map_free(struct bpf_map *map)
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
@@ -1489,10 +1524,14 @@ static void htab_map_free(struct bpf_map *map)
bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1691,8 +1730,11 @@ again_nocopy:
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1f961f9982d2..a6b04faed282 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -15,6 +15,7 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
@@ -198,6 +199,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -415,40 +428,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- struct bpf_cg_run_ctx *ctx;
- void *ptr;
-
- /* get current cgroup storage from BPF run context */
- ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
- storage = ctx->prog_item->cgroup_storage[stype];
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -577,7 +557,6 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
@@ -1398,10 +1377,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
}
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
- * helper is determined dynamically by the verifier.
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
*/
-#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-
static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.func = bpf_kptr_xchg,
.gpl_only = false,
@@ -1430,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
-static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
@@ -1468,6 +1446,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_
{
int err;
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
err = bpf_dynptr_check_size(size);
if (err)
goto error;
@@ -1617,6 +1597,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1627,26 +1609,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
- case BPF_FUNC_ringbuf_reserve_dynptr:
- return &bpf_ringbuf_reserve_dynptr_proto;
- case BPF_FUNC_ringbuf_submit_dynptr:
- return &bpf_ringbuf_submit_dynptr_proto;
- case BPF_FUNC_ringbuf_discard_dynptr:
- return &bpf_ringbuf_discard_dynptr_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_loop:
- return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
- case BPF_FUNC_dynptr_from_mem:
- return &bpf_dynptr_from_mem_proto;
- case BPF_FUNC_dynptr_read:
- return &bpf_dynptr_read_proto;
- case BPF_FUNC_dynptr_write:
- return &bpf_dynptr_write_proto;
- case BPF_FUNC_dynptr_data:
- return &bpf_dynptr_data_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
default:
break;
}
@@ -1675,6 +1643,26 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_cancel_proto;
case BPF_FUNC_kptr_xchg:
return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
default:
break;
}
@@ -1711,3 +1699,21 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
}
+
+BTF_SET8_START(tracing_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_SET8_END(tracing_btf_ids)
+
+static const struct btf_kfunc_id_set tracing_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tracing_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
+}
+
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 49ef0ce040c7..098cf336fae6 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -313,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d789e3b831ad..d833496e9e42 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -558,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
@@ -609,7 +609,7 @@ static void trie_free(struct bpf_map *map)
}
out:
- kfree(trie);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..5f83be1d2018
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,635 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 1;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+
+ struct rcu_head rcu;
+ struct llist_head free_by_rcu;
+ struct llist_head waiting_for_gp;
+ atomic_t call_rcu_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node)
+{
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
+
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ unsigned long flags;
+ void *obj;
+ int i;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (i = 0; i < cnt; i++) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+ if (c->percpu_size) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+ struct llist_node *pos, *t;
+
+ llist_for_each_safe(pos, t, llnode)
+ free_one(c, pos);
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+
+ call_rcu(&c->rcu, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu list.
+ */
+ __llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ return;
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ /* There is no concurrent __llist_add(waiting_for_gp) access.
+ * It doesn't race with llist_del_all either.
+ * But there could be two concurrent llist_del_all(waiting_for_gp):
+ * from __free_rcu() and from drain_mem_cache().
+ */
+ __llist_add(llnode, &c->waiting_for_gp);
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu() to wait for normal progs to finish
+ * and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ do {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(flags);
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ if (llnode)
+ enque_to_free(c, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(c, llnode);
+ do_call_rcu(c);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
+ /* To avoid consuming memory assume that 1st run of bpf
+ * prog won't be doing more than 4 map_update_elem from
+ * irq disabled region
+ */
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (percpu)
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ else
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ /* size == 0 && percpu is an invalid combination */
+ if (WARN_ON_ONCE(percpu))
+ return -EINVAL;
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ prefill_mem_cache(c, cpu);
+ }
+ }
+ ma->caches = pcc;
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ */
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ free_one(c, llnode);
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp lists was drained, but __free_rcu might
+ * still execute. Wait for it now before we freeing percpu caches.
+ */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->cache = ma->cache;
+ ma->cache = NULL;
+ copy->caches = ma->caches;
+ ma->caches = NULL;
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ /* objcg is the same across cpus */
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..13e4efc971e6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
@@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
err_unlock:
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
@@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 00b874c8e889..b6e7f5c5b9ab 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
{
int cpu, orig_cpu;
- orig_cpu = cpu = raw_smp_processor_id();
+ orig_cpu = raw_smp_processor_id();
while (1) {
- struct pcpu_freelist_head *head;
+ for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+ struct pcpu_freelist_head *head;
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
/* cannot lock any per cpu lock, try extralist */
- if (cpu == orig_cpu &&
- pcpu_freelist_try_push_extra(s, node))
+ if (pcpu_freelist_try_push_extra(s, node))
return;
}
}
@@ -125,13 +123,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
- goto next_cpu;
+ continue;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
@@ -140,12 +137,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
return node;
}
raw_spin_unlock(&head->lock);
-next_cpu:
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* per cpu lists are all empty, try extralist */
@@ -164,13 +155,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
- goto next_cpu;
+ continue;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
@@ -180,12 +170,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
}
raw_spin_unlock(&head->lock);
}
-next_cpu:
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* cannot pop from per cpu lists, try extralist */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index a1c0794ae49d..8a5e060de63b 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -78,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
if (!qs)
return ERR_PTR(-ENOMEM);
- memset(qs, 0, sizeof(*qs));
-
bpf_map_init_from_attr(&qs->map, attr);
qs->size = size;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index ded4faeca192..9e832acf4692 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -38,10 +38,43 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -116,7 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -136,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
return NULL;
spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -164,7 +198,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
#endif
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
@@ -172,7 +206,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
if (!rb_map->rb) {
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
}
@@ -190,7 +224,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
@@ -199,7 +233,7 @@ static void ringbuf_map_free(struct bpf_map *map)
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -224,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -242,6 +276,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
@@ -251,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
return prod_pos - cons_pos;
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -264,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
@@ -278,6 +350,20 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_btf_id = &ringbuf_map_btf_ids[0],
};
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
* calculate offset from record metadata to ring buffer in pages, rounded
* down. This page offset is stored as part of record metadata and allows to
@@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
- if (len > rb->mask + 1)
+ if (len > ringbuf_total_data_sz(rb))
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
@@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
@@ -553,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ smp_mb__before_atomic();
+ atomic_set(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 27760627370d..7b373a5e861f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -598,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
if (off_desc->type == BPF_KPTR_UNREF) {
u64 *p = (u64 *)btf_id_ptr;
- WRITE_ONCE(p, 0);
+ WRITE_ONCE(*p, 0);
continue;
}
old_ptr = xchg(btf_id_ptr, 0);
@@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
@@ -1046,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY) {
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
@@ -1413,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
value_size = bpf_map_value_size(map);
-
- err = -ENOMEM;
- value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
-
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
+ }
err = bpf_map_update_value(map, f, key, value, attr->flags);
-free_value:
kvfree(value);
free_key:
kvfree(key);
@@ -1437,9 +1436,9 @@ err_put:
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1459,7 +1458,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -2094,6 +2093,17 @@ struct bpf_prog_kstats {
u64 misses;
};
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
@@ -4395,7 +4405,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4941,7 +4953,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -5073,8 +5085,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
case BPF_LINK_CREATE:
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..67e03e1833ba 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -10,8 +10,17 @@
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
+
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task, *next_task;
+ struct pid *pid;
+ u32 saved_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return NULL;
+
+retry:
+ if (!pid_alive(task)) {
+ put_task_struct(task);
+ return NULL;
+ }
+
+ next_task = next_thread(task);
+ put_task_struct(task);
+ if (!next_task)
+ return NULL;
+
+ saved_tid = *tid;
+ *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+ if (!*tid || *tid == common->pid) {
+ /* Run out of tasks of a process. The tasks of a
+ * thread_group are linked as circular linked list.
+ */
+ *tid = saved_tid;
+ return NULL;
+ }
+
+ get_task_struct(next_task);
+ common->pid_visiting = *tid;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files) {
+ task = next_task;
+ goto retry;
+ }
+
+ return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
@@ -151,21 +291,18 @@ again:
curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
info->task = NULL;
- info->tid = curr_tid;
return NULL;
}
- /* set info->task and info->tid */
+ /* set info->task */
info->task = curr_task;
- if (curr_tid == info->tid) {
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
rcu_read_lock();
@@ -186,9 +323,15 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
@@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -307,11 +453,10 @@ enum bpf_task_vma_iter_find_op {
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
- struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
* the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
}
} else {
again:
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
- info->tid = curr_tid + 1;
+ info->tid++;
goto finish;
}
- if (curr_tid != info->tid) {
- info->tid = curr_tid;
+ if (saved_tid != info->tid) {
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
@@ -430,9 +574,12 @@ again:
return curr_vma;
next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
put_task_struct(curr_task);
info->task = NULL;
- curr_tid++;
+ info->tid++;
goto again;
finish:
@@ -531,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
static struct bpf_iter_reg task_reg_info = {
.target = "task",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
@@ -540,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -551,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
static struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -560,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -571,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
static struct bpf_iter_reg task_vma_reg_info = {
.target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -580,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ff87e38af8a7..bf0906e1e2b9 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void *bpf_jit_alloc_exec_page(void)
-{
- void *image;
-
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * every time new program is attached or detached.
- */
- set_memory_x((long)image, 1);
- return image;
-}
-
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
@@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
goto out_free_im;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec_page();
+ im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!image)
goto out_uncharge;
+ set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -483,6 +468,9 @@ again:
if (err < 0)
goto out;
+ set_memory_ro((long)im->image, 1);
+ set_memory_x((long)im->image, 1);
+
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
if (tr->cur_image)
@@ -863,17 +851,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
return start;
}
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
- struct bpf_prog_stats *stats;
- unsigned int flags;
-
- stats = this_cpu_ptr(prog->stats);
- flags = u64_stats_update_begin_irqsave(&stats->syncp);
- u64_stats_inc(&stats->misses);
- u64_stats_update_end_irqrestore(&stats->syncp, flags);
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -895,8 +872,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
return bpf_prog_start_time();
@@ -930,7 +907,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
@@ -966,8 +943,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
@@ -982,11 +959,34 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock();
+}
+
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
percpu_ref_get(&tr->pcref);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3eadb14e090b..6f6d2d511c06 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
+#include <linux/poison.h>
#include "disasm.h"
@@ -370,6 +371,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
+EXPORT_SYMBOL_GPL(bpf_log);
static const char *ltrim(const char *s)
{
@@ -427,6 +429,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
+ type = base_type(type);
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
@@ -456,10 +459,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return base_type(type) == PTR_TO_SOCKET ||
- base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM ||
- base_type(type) == PTR_TO_BTF_ID;
+ type = base_type(type);
+ return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MEM || type == PTR_TO_BTF_ID;
}
static bool type_is_rdonly_mem(u32 type)
@@ -467,25 +469,11 @@ static bool type_is_rdonly_mem(u32 type)
return type & MEM_RDONLY;
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_SOCK_COMMON;
-}
-
static bool type_may_be_null(u32 type)
{
return type & PTR_MAYBE_NULL;
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -518,6 +506,26 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_STX &&
@@ -555,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
+ [PTR_TO_DYNPTR] = "dynptr_ptr",
};
if (type & PTR_MAYBE_NULL) {
@@ -773,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type)
+bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
int spi = get_spi(reg->off);
@@ -790,11 +799,24 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
return false;
}
+ return true;
+}
+
+bool is_dynptr_type_expected(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi = get_spi(reg->off);
+
/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
- return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+ dynptr_type = arg_to_dynptr_type(arg_type);
+
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -1086,6 +1108,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -1098,6 +1121,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -1739,6 +1765,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = tnum_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -2898,7 +2925,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
return __mark_chain_precision(env, regno, -1);
}
@@ -5223,6 +5250,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ atype, -1, false);
+ }
+
+ fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
@@ -5656,6 +5702,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK }
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+ }
+};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -5682,7 +5734,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
[ARG_PTR_TO_KPTR] = &kptr_types,
- [ARG_PTR_TO_DYNPTR] = &stack_ptr_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5751,13 +5803,22 @@ found:
if (meta->func_id == BPF_FUNC_kptr_xchg) {
if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
return -EACCES;
- } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id,
- strict_type_match)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
- return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, kernel_type_name(reg->btf, reg->btf_id),
+ kernel_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
}
}
@@ -6025,6 +6086,13 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
+ /* We only need to check for initialized / uninitialized helper
+ * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
+ * assumption is that if it is, that a helper function
+ * initialized the dynptr on behalf of the BPF program.
+ */
+ if (base_type(reg->type) == PTR_TO_DYNPTR)
+ break;
if (arg_type & MEM_UNINIT) {
if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
@@ -6040,21 +6108,27 @@ skip_type_check:
}
meta->uninit_dynptr_regno = regno;
- } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+ } else if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ arg + 1);
+ return -EINVAL;
+ } else if (!is_dynptr_type_expected(env, reg, arg_type)) {
const char *err_extra = "";
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
case DYNPTR_TYPE_LOCAL:
- err_extra = "local ";
+ err_extra = "local";
break;
case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf ";
+ err_extra = "ringbuf";
break;
default:
+ err_extra = "<unknown>";
break;
}
-
- verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
err_extra, arg + 1);
return -EINVAL;
}
@@ -6199,6 +6273,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_ringbuf_discard_dynptr)
goto error;
break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
+ goto error;
+ break;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -6318,6 +6396,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_RINGBUF)
goto error;
break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -6456,33 +6538,6 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
-{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
-
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
-
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
-}
-
static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
int i;
@@ -6501,43 +6556,25 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id,
- struct bpf_call_arg_meta *meta)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_btf_id_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg_is_pkt_pointer_any(reg))
__mark_reg_unknown(env, reg);
- }
-}
-
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
-{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
-
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ }));
}
enum {
@@ -6566,41 +6603,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
- }
-}
-
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
int ref_obj_id)
{
- struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
err = release_reference_state(cur_func(env), ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ __mark_reg_unknown(env, reg);
+ }));
return 0;
}
@@ -6648,7 +6668,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -6822,6 +6842,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6843,6 +6864,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6872,6 +6894,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6899,6 +6922,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
return 0;
}
@@ -6926,7 +6973,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller = state->frame[state->curframe];
if (callee->in_callback_fn) {
/* enforce R0 return value range [0, 1]. */
- struct tnum range = tnum_range(0, 1);
+ struct tnum range = callee->callback_ret_range;
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
@@ -6941,10 +6988,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
@@ -7066,13 +7120,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -7219,7 +7280,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id, &meta);
+ err = check_func_proto(fn, func_id);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
@@ -7344,6 +7405,33 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
break;
+ case BPF_FUNC_dynptr_data:
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
+
+ if (base_type(reg->type) != PTR_TO_DYNPTR)
+ /* Find the id of the dynptr we're
+ * tracking the reference of
+ */
+ meta.ref_obj_id = stack_slot_get_id(env, reg);
+ break;
+ }
+ }
+ if (i == MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+ return -EFAULT;
+ }
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_user_ringbuf_callback_state);
+ break;
}
if (err)
@@ -7360,13 +7448,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* update return register (already marked as written above) */
ret_type = fn->ret_type;
- ret_flag = type_flag(fn->ret_type);
- if (ret_type == RET_INTEGER) {
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -7385,20 +7477,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
map_value_has_spin_lock(meta.map_ptr)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
+ break;
+ case RET_PTR_TO_ALLOC_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -7430,7 +7528,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
struct btf *ret_btf;
int ret_btf_id;
@@ -7440,6 +7541,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
ret_btf = meta.kptr_off_desc->kptr.btf;
ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
} else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+ func_id_name(func_id));
+ return -EINVAL;
+ }
ret_btf = btf_vmlinux;
ret_btf_id = *fn->ret_btf_id;
}
@@ -7451,7 +7558,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- } else {
+ break;
+ }
+ default:
verbose(env, "unknown return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
@@ -7460,7 +7569,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (is_ptr_cast_function(func_id)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
@@ -7472,21 +7587,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
- } else if (func_id == BPF_FUNC_dynptr_data) {
- int dynptr_id = 0, i;
-
- /* Find the id of the dynptr we're acquiring a reference to */
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- if (dynptr_id) {
- verbose(env, "verifier internal error: multiple dynptr args in func\n");
- return -EFAULT;
- }
- dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
- }
- }
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = dynptr_id;
}
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
@@ -7558,6 +7658,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_kfunc_arg_meta meta = { 0 };
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
int err, insn_idx = *insn_idx_p;
@@ -7585,10 +7686,17 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name);
return -EACCES;
}
+ if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n");
+ return -EACCES;
+ }
+
acq = *kfunc_flags & KF_ACQUIRE;
+ meta.flags = *kfunc_flags;
+
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags);
+ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -7609,7 +7717,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
- if (acq && !btf_type_is_ptr(t)) {
+ if (acq && !btf_type_is_struct_ptr(desc_btf, t)) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -7621,17 +7729,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
&ptr_type_id);
if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(desc_btf,
- ptr_type->name_off);
- verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
- func_name, btf_type_str(ptr_type),
- ptr_type_name);
- return -EINVAL;
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
}
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
if (*kfunc_flags & KF_RET_NULL) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
@@ -9274,34 +9398,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, int new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
- }
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
-}
-
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- int new_range, i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -9366,9 +9470,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
@@ -9857,7 +9963,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -9867,22 +9973,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -9890,10 +9980,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -9902,8 +9991,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -10016,23 +10106,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
{
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- int i, j;
-
- for (i = 0; i <= vstate->curframe; i++) {
- state = vstate->frame[i];
- for (j = 0; j < MAX_BPF_REG; j++) {
- reg = &state->regs[j];
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- bpf_for_each_spilled_reg(j, state, reg) {
- if (!reg)
- continue;
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- }
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ *reg = *known_reg;
+ }));
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -12333,6 +12411,16 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env);
+ if (err)
+ return err;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -12342,10 +12430,6 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
err = check_return_code(env);
if (err)
return err;
@@ -12558,14 +12642,6 @@ err_put:
return err;
}
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
-}
-
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
switch (type) {
@@ -12580,50 +12656,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
- }
if (map_value_has_spin_lock(map)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
@@ -12670,13 +12708,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
@@ -13470,9 +13503,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
default:
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f2090d051ac..8ad2c267ff47 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6167,11 +6167,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
return ERR_CAST(css);
cgrp = css->cgroup;
- if (!cgroup_on_dfl(cgrp)) {
- cgroup_put(cgrp);
- return ERR_PTR(-EBADF);
- }
-
return cgrp;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index feb59380c896..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
+}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ca9d834d0b84..3220b0a2fb4a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1607,9 +1607,10 @@ int register_kprobe(struct kprobe *p)
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
+ bool on_func_entry;
/* Adjust probe address from symbol */
- addr = kprobe_addr(p);
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
@@ -1629,6 +1630,9 @@ int register_kprobe(struct kprobe *p)
mutex_lock(&kprobe_mutex);
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+
old_p = get_kprobe(p->addr);
if (old_p) {
/* Since this may unoptimize 'old_p', locking 'text_mutex'. */
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f7e246336218..8ce3fa0c19e2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = {
.module = THIS_MODULE,
.ops = taskstats_ops,
.n_ops = ARRAY_SIZE(taskstats_ops),
+ .resv_start_op = CGROUPSTATS_CMD_GET + 1,
.netnsok = true,
};
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1052126bdca2..e9e95c790b8e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -51,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of regs_get_kernel_argument() and
kernel_stack_pointer().
+config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ bool
+ help
+ If the architecture generates __patchable_function_entries sections
+ but does not want them included in the ftrace locations.
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68e5cdd24cef..688552df95ca 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,8 @@
#include <linux/fprobe.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/key.h>
+#include <linux/verification.h>
#include <net/bpf_sk_storage.h>
@@ -1026,11 +1028,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_X86_KERNEL_IBT
+static unsigned long get_entry_ip(unsigned long fentry_ip)
+{
+ u32 instr;
+
+ /* Being extra safe in here in case entry ip is on the page-edge. */
+ if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
+ return fentry_ip;
+ if (is_endbr(instr))
+ fentry_ip -= ENDBR_INSN_SIZE;
+ return fentry_ip;
+}
+#else
+#define get_entry_ip(fentry_ip) fentry_ip
+#endif
+
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
struct kprobe *kp = kprobe_running();
- return kp ? (uintptr_t)kp->addr : 0;
+ if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
+ return 0;
+
+ return get_entry_ip((uintptr_t)kp->addr);
}
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
@@ -1181,6 +1202,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_KEYS
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_ptr: data to verify
+ * @sig_ptr: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+ struct bpf_dynptr_kern *sig_ptr,
+ struct bpf_key *trusted_keyring)
+{
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ return verify_pkcs7_signature(data_ptr->data,
+ bpf_dynptr_get_size(data_ptr),
+ sig_ptr->data,
+ bpf_dynptr_get_size(sig_ptr),
+ trusted_keyring->key,
+ VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
+ NULL);
+}
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+__diag_pop();
+
+BTF_SET8_START(key_sig_kfunc_set)
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+BTF_SET8_END(key_sig_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &key_sig_kfunc_set,
+};
+
+static int __init bpf_key_sig_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_key_sig_kfunc_set);
+}
+
+late_initcall(bpf_key_sig_kfuncs_init);
+#endif /* CONFIG_KEYS */
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -2042,9 +2241,15 @@ static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ goto out;
+ }
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+out:
+ this_cpu_dec(*(prog->active));
}
#define UNPACK(...) __VA_ARGS__
@@ -2414,13 +2619,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
}
static void
-kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
struct pt_regs *regs)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, entry_ip, regs);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905e..447d2e2a8549 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8265,8 +8265,7 @@ static int kallsyms_callback(void *data, const char *name,
if (args->addrs[idx])
return 0;
- addr = ftrace_location(addr);
- if (!addr)
+ if (!ftrace_location(addr))
return 0;
args->addrs[idx] = addr;