aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
commit9d31d2338950293ec19d9b095fbaa9030899dcb4 (patch)
treee688040d0557c24a2eeb9f6c9c223d949f6f7ef9 /kernel
parentMerge tag 'x86-mm-2021-04-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentnet: selftest: fix build issue if INET is disabled (diff)
downloadlinux-dev-9d31d2338950293ec19d9b095fbaa9030899dcb4.tar.xz
linux-dev-9d31d2338950293ec19d9b095fbaa9030899dcb4.zip
Merge tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - bpf: - allow bpf programs calling kernel functions (initially to reuse TCP congestion control implementations) - enable task local storage for tracing programs - remove the need to store per-task state in hash maps, and allow tracing programs access to task local storage previously added for BPF_LSM - add bpf_for_each_map_elem() helper, allowing programs to walk all map elements in a more robust and easier to verify fashion - sockmap: support UDP and cross-protocol BPF_SK_SKB_VERDICT redirection - lpm: add support for batched ops in LPM trie - add BTF_KIND_FLOAT support - mostly to allow use of BTF on s390 which has floats in its headers files - improve BPF syscall documentation and extend the use of kdoc parsing scripts we already employ for bpf-helpers - libbpf, bpftool: support static linking of BPF ELF files - improve support for encapsulation of L2 packets - xdp: restructure redirect actions to avoid a runtime lookup, improving performance by 4-8% in microbenchmarks - xsk: build skb by page (aka generic zerocopy xmit) - improve performance of software AF_XDP path by 33% for devices which don't need headers in the linear skb part (e.g. virtio) - nexthop: resilient next-hop groups - improve path stability on next-hops group changes (incl. offload for mlxsw) - ipv6: segment routing: add support for IPv4 decapsulation - icmp: add support for RFC 8335 extended PROBE messages - inet: use bigger hash table for IP ID generation - tcp: deal better with delayed TX completions - make sure we don't give up on fast TCP retransmissions only because driver is slow in reporting that it completed transmitting the original - tcp: reorder tcp_congestion_ops for better cache locality - mptcp: - add sockopt support for common TCP options - add support for common TCP msg flags - include multiple address ids in RM_ADDR - add reset option support for resetting one subflow - udp: GRO L4 improvements - improve 'forward' / 'frag_list' co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic - micro-optimize dev_gro_receive() and flow dissection, avoid retpoline overhead on VLAN and TEB GRO - use less memory for sysctls, add a new sysctl type, to allow using u8 instead of "int" and "long" and shrink networking sysctls - veth: allow GRO without XDP - this allows aggregating UDP packets before handing them off to routing, bridge, OvS, etc. - allow specifing ifindex when device is moved to another namespace - netfilter: - nft_socket: add support for cgroupsv2 - nftables: add catch-all set element - special element used to define a default action in case normal lookup missed - use net_generic infra in many modules to avoid allocating per-ns memory unnecessarily - xps: improve the xps handling to avoid potential out-of-bound accesses and use-after-free when XPS change race with other re-configuration under traffic - add a config knob to turn off per-cpu netdev refcnt to catch underflows in testing Device APIs: - add WWAN subsystem to organize the WWAN interfaces better and hopefully start driving towards more unified and vendor- independent APIs - ethtool: - add interface for reading IEEE MIB stats (incl. mlx5 and bnxt support) - allow network drivers to dump arbitrary SFP EEPROM data, current offset+length API was a poor fit for modern SFP which define EEPROM in terms of pages (incl. mlx5 support) - act_police, flow_offload: add support for packet-per-second policing (incl. offload for nfp) - psample: add additional metadata attributes like transit delay for packets sampled from switch HW (and corresponding egress and policy-based sampling in the mlxsw driver) - dsa: improve support for sandwiched LAGs with bridge and DSA - netfilter: - flowtable: use direct xmit in topologies with IP forwarding, bridging, vlans etc. - nftables: counter hardware offload support - Bluetooth: - improvements for firmware download w/ Intel devices - add support for reading AOSP vendor capabilities - add support for virtio transport driver - mac80211: - allow concurrent monitor iface and ethernet rx decap - set priority and queue mapping for injected frames - phy: add support for Clause-45 PHY Loopback - pci/iov: add sysfs MSI-X vector assignment interface to distribute MSI-X resources to VFs (incl. mlx5 support) New hardware/drivers: - dsa: mv88e6xxx: add support for Marvell mv88e6393x - 11-port Ethernet switch with 8x 1-Gigabit Ethernet and 3x 10-Gigabit interfaces. - dsa: support for legacy Broadcom tags used on BCM5325, BCM5365 and BCM63xx switches - Microchip KSZ8863 and KSZ8873; 3x 10/100Mbps Ethernet switches - ath11k: support for QCN9074 a 802.11ax device - Bluetooth: Broadcom BCM4330 and BMC4334 - phy: Marvell 88X2222 transceiver support - mdio: add BCM6368 MDIO mux bus controller - r8152: support RTL8153 and RTL8156 (USB Ethernet) chips - mana: driver for Microsoft Azure Network Adapter (MANA) - Actions Semi Owl Ethernet MAC - can: driver for ETAS ES58X CAN/USB interfaces Pure driver changes: - add XDP support to: enetc, igc, stmmac - add AF_XDP support to: stmmac - virtio: - page_to_skb() use build_skb when there's sufficient tailroom (21% improvement for 1000B UDP frames) - support XDP even without dedicated Tx queues - share the Tx queues with the stack when necessary - mlx5: - flow rules: add support for mirroring with conntrack, matching on ICMP, GTP, flex filters and more - support packet sampling with flow offloads - persist uplink representor netdev across eswitch mode changes - allow coexistence of CQE compression and HW time-stamping - add ethtool extended link error state reporting - ice, iavf: support flow filters, UDP Segmentation Offload - dpaa2-switch: - move the driver out of staging - add spanning tree (STP) support - add rx copybreak support - add tc flower hardware offload on ingress traffic - ionic: - implement Rx page reuse - support HW PTP time-stamping - octeon: support TC hardware offloads - flower matching on ingress and egress ratelimitting. - stmmac: - add RX frame steering based on VLAN priority in tc flower - support frame preemption (FPE) - intel: add cross time-stamping freq difference adjustment - ocelot: - support forwarding of MRP frames in HW - support multiple bridges - support PTP Sync one-step timestamping - dsa: mv88e6xxx, dpaa2-switch: offload bridge port flags like learning, flooding etc. - ipa: add IPA v4.5, v4.9 and v4.11 support (Qualcomm SDX55, SM8350, SC7280 SoCs) - mt7601u: enable TDLS support - mt76: - add support for 802.3 rx frames (mt7915/mt7615) - mt7915 flash pre-calibration support - mt7921/mt7663 runtime power management fixes" * tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2451 commits) net: selftest: fix build issue if INET is disabled net: netrom: nr_in: Remove redundant assignment to ns net: tun: Remove redundant assignment to ret net: phy: marvell: add downshift support for M88E1240 net: dsa: ksz: Make reg_mib_cnt a u8 as it never exceeds 255 net/sched: act_ct: Remove redundant ct get and check icmp: standardize naming of RFC 8335 PROBE constants bpf, selftests: Update array map tests for per-cpu batched ops bpf: Add batched ops support for percpu array bpf: Implement formatted output helpers with bstr_printf seq_file: Add a seq_bprintf function sfc: adjust efx->xdp_tx_queue_count with the real number of initialized queues net:nfc:digital: Fix a double free in digital_tg_recv_dep_req net: fix a concurrency bug in l2tp_tunnel_register() net/smc: Remove redundant assignment to rc mpls: Remove redundant assignment to err llc2: Remove redundant assignment to rc net/tls: Remove redundant initialization of record rds: Remove redundant assignment to nr_sig dt-bindings: net: mdio-gpio: add compatible for microchip,mdio-smi0 ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c42
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_iter.c16
-rw-r--r--kernel/bpf/bpf_local_storage.c39
-rw-r--r--kernel/bpf/bpf_lsm.c8
-rw-r--r--kernel/bpf/bpf_task_storage.c100
-rw-r--r--kernel/bpf/btf.c325
-rw-r--r--kernel/bpf/core.c54
-rw-r--r--kernel/bpf/cpumap.c27
-rw-r--r--kernel/bpf/devmap.c47
-rw-r--r--kernel/bpf/disasm.c13
-rw-r--r--kernel/bpf/hashtab.c67
-rw-r--r--kernel/bpf/helpers.c335
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/bpf/local_storage.c5
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/syscall.c31
-rw-r--r--kernel/bpf/trampoline.c4
-rw-r--r--kernel/bpf/verifier.c820
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/sysctl.c65
-rw-r--r--kernel/trace/bpf_trace.c371
23 files changed, 1735 insertions, 649 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index d1249340fd6b..7f33098ca63f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
-obj-${CONFIG_BPF_LSM} += bpf_task_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1f8453343bf2..3c4105603f9d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
+static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ u32 i, key, num_elems = 0;
+ struct bpf_array *array;
+ bool is_percpu;
+ u64 ret = 0;
+ void *val;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ array = container_of(map, struct bpf_array, map);
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < map->max_entries; i++) {
+ if (is_percpu)
+ val = this_cpu_ptr(array->pptrs[i]);
+ else
+ val = array->value + array->elem_size * i;
+ num_elems++;
+ key = i;
+ ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+ (u64)(long)&key, (u64)(long)val,
+ (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ break;
+ }
+
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
@@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = {
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &array_map_btf_id,
.iter_seq_info = &iter_seq_info,
@@ -660,6 +698,10 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &percpu_array_map_btf_id,
.iter_seq_info = &iter_seq_info,
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b58b2efb9b43..2921ca39a93e 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, NULL);
}
static int inode_storage_map_btf_id;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index a0d9eade9c80..931870f9cf56 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
*/
return ret == 0 ? 0 : -EAGAIN;
}
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+ void *, callback_ctx, u64, flags)
+{
+ return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+ .func = bpf_for_each_map_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index dd5aedee99e7..b305270b7a4b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
+ unsigned long flags;
if (unlikely(!selem_linked_to_storage(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference(selem->local_storage);
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true);
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
kfree_rcu(local_storage, rcu);
@@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
if (unlikely(!selem_linked_to_map(selem)))
/* selem has already be unlinked from smap */
@@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
smap = rcu_dereference(SDATA(selem)->smap);
b = select_bucket(smap, selem);
- raw_spin_lock_bh(&b->lock);
+ raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_bh(&b->lock);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+ unsigned long flags;
- raw_spin_lock_bh(&b->lock);
+ raw_spin_lock_irqsave(&b->lock, flags);
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_bh(&b->lock);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
@@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
sdata = SDATA(selem);
if (cacheit_lockit) {
+ unsigned long flags;
+
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next bpf_local_storage_lookup().
*/
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
if (selem_linked_to_storage(selem))
rcu_assign_pointer(local_storage->cache[smap->cache_idx],
sdata);
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
return sdata;
@@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
+ unsigned long flags;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
if (unlikely(hlist_empty(&local_storage->list))) {
@@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
unlock:
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return SDATA(selem);
unlock_err:
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return ERR_PTR(err);
}
@@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+ int __percpu *busy_counter)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map_bucket *b;
@@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ __this_cpu_inc(*busy_counter);
+ }
bpf_selem_unlink(selem);
+ if (busy_counter) {
+ __this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
cond_resched_rcu();
}
rcu_read_unlock();
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0ff58259ccf8..5efb2b24012c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -67,7 +67,7 @@ BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
-const static struct bpf_func_proto bpf_bprm_opts_set_proto = {
+static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
.func = bpf_bprm_opts_set,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -88,7 +88,7 @@ static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
-const static struct bpf_func_proto bpf_ima_inode_hash_proto = {
+static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
- case BPF_FUNC_task_storage_get:
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- return &bpf_task_storage_delete_proto;
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e0da0258b732..3ce75758d394 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -15,21 +15,41 @@
#include <linux/bpf_local_storage.h>
#include <linux/filter.h>
#include <uapi/linux/btf.h>
-#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
+static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+ migrate_disable();
+ __this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
struct task_struct *task = owner;
- struct bpf_storage_blob *bsb;
- bsb = bpf_task(task);
- if (!bsb)
- return NULL;
- return &bsb->storage;
+ return &task->bpf_storage;
}
static struct bpf_local_storage_data *
@@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
{
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- struct bpf_storage_blob *bsb;
-
- bsb = bpf_task(task);
- if (!bsb)
- return NULL;
- task_storage = rcu_dereference(bsb->storage);
+ task_storage = rcu_dereference(task->bpf_storage);
if (!task_storage)
return NULL;
@@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task)
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
- struct bpf_storage_blob *bsb;
struct hlist_node *n;
-
- bsb = bpf_task(task);
- if (!bsb)
- return;
+ unsigned long flags;
rcu_read_lock();
- local_storage = rcu_dereference(bsb->storage);
+ local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage) {
rcu_read_unlock();
return;
@@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
- raw_spin_lock_bh(&local_storage->lock);
+ bpf_task_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
@@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task)
free_task_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, false);
}
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_task_storage_unlock();
rcu_read_unlock();
/* free_task_storage should always be true as long as
@@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
goto out;
}
+ bpf_task_storage_lock();
sdata = task_storage_lookup(task, map, true);
+ bpf_task_storage_unlock();
put_pid(pid);
return sdata ? sdata->data : NULL;
out:
@@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
*/
WARN_ON_ONCE(!rcu_read_lock_held());
task = pid_task(pid, PIDTYPE_PID);
- if (!task || !task_storage_ptr(task)) {
+ if (!task) {
err = -ENOENT;
goto out;
}
+ bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags);
+ bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
goto out;
}
+ bpf_task_storage_lock();
err = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
@@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
- /* explicitly check that the task_storage_ptr is not
- * NULL as task_storage_lookup returns NULL in this case and
- * bpf_local_storage_update expects the owner to have a
- * valid storage pointer.
- */
- if (!task || !task_storage_ptr(task))
+ if (!task)
+ return (unsigned long)NULL;
+
+ if (!bpf_task_storage_trylock())
return (unsigned long)NULL;
sdata = task_storage_lookup(task, map, true);
if (sdata)
- return (unsigned long)sdata->data;
+ goto unlock;
- /* This helper must only be called from places where the lifetime of the task
- * is guaranteed. Either by being refcounted or by being protected
- * by an RCU read-side critical section.
- */
- if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ /* only allocate new storage, when the task is refcounted */
+ if (refcount_read(&task->usage) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST);
- return IS_ERR(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
- }
- return (unsigned long)NULL;
+unlock:
+ bpf_task_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
}
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
task)
{
+ int ret;
+
if (!task)
return -EINVAL;
+ if (!bpf_task_storage_trylock())
+ return -EBUSY;
+
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- return task_storage_delete(task, map);
+ ret = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
+ return ret;
}
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
}
static int task_storage_map_btf_id;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b1a76fe046cb..0600ed325fa0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -173,7 +173,7 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x8f00ffff
+#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -280,9 +280,10 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
+ [BTF_KIND_FLOAT] = "FLOAT",
};
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
@@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
+ case BTF_KIND_FLOAT:
return true;
}
@@ -787,7 +789,6 @@ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
while (btf_type_is_modifier(t) &&
BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
- id = t->type;
t = btf_type_by_id(btf, t->type);
}
@@ -1704,6 +1705,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_FLOAT:
size = type->size;
goto resolved;
@@ -1849,7 +1851,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env,
return -EINVAL;
}
-/* Used for ptr, array and struct/union type members.
+/* Used for ptr, array struct/union and float type members.
* int, enum and modifier types have their specific callback functions.
*/
static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
@@ -3675,6 +3677,81 @@ static const struct btf_kind_operations datasec_ops = {
.show = btf_datasec_show,
};
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+ t->size != 16) {
+ btf_verifier_log_type(env, t, "Invalid type_size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u64 start_offset_bytes;
+ u64 end_offset_bytes;
+ u64 misalign_bits;
+ u64 align_bytes;
+ u64 align_bits;
+
+ /* Different architectures have different alignment requirements, so
+ * here we check only for the reasonable minimum. This way we ensure
+ * that types after CO-RE can pass the kernel BTF verifier.
+ */
+ align_bytes = min_t(u64, sizeof(void *), member_type->size);
+ align_bits = align_bytes * BITS_PER_BYTE;
+ div64_u64_rem(member->offset, align_bits, &misalign_bits);
+ if (misalign_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not properly aligned");
+ return -EINVAL;
+ }
+
+ start_offset_bytes = member->offset / BITS_PER_BYTE;
+ end_offset_bytes = start_offset_bytes + member_type->size;
+ if (end_offset_bytes > struct_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void btf_float_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+ .check_meta = btf_float_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_float_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_float_log,
+ .show = btf_df_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -3808,6 +3885,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
+ [BTF_KIND_FLOAT] = &float_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4298,7 +4376,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_LINK_TYPE
static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
@@ -4592,8 +4670,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
arg = off / 8;
args = (const struct btf_param *)(t + 1);
- /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
- nr_args = t ? btf_type_vlen(t) : 5;
+ /* if (t == NULL) Fall back to default BPF prog with
+ * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+ */
+ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
if (prog->aux->attach_btf_trace) {
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
@@ -4649,7 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
} else {
if (!t)
- /* Default prog with 5 args */
+ /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
return true;
t = btf_type_by_id(btf, args[arg].type);
}
@@ -5100,12 +5180,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
if (!func) {
/* BTF function prototype doesn't match the verifier types.
- * Fall back to 5 u64 args.
+ * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < 5; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
m->arg_size[i] = 8;
m->ret_size = 8;
- m->nr_args = 5;
+ m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
@@ -5281,121 +5361,190 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs,
+ bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- const struct btf_param *args;
+ const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
- u32 i, nargs, btf_id, type_size;
- const char *tname;
- bool is_global;
-
- if (!prog->aux->func_info)
- return -EINVAL;
-
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
-
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+ const struct btf_param *args;
+ u32 i, nargs, ref_id;
- t = btf_type_by_id(btf, btf_id);
+ t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
- * struct bpf_func_info
+ * struct bpf_func_info or in add_kfunc_call().
*/
- bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
- subprog);
+ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+ func_id);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
+ func_name = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", tname);
+ bpf_log(log, "Invalid BTF of func %s\n", func_name);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
- goto out;
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
}
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *reg = &regs[i + 1];
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
- t = btf_type_by_id(btf, args[i].type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ if (btf_type_is_scalar(t)) {
if (reg->type == SCALAR_VALUE)
continue;
- bpf_log(log, "R%d is not a scalar\n", i + 1);
- goto out;
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
}
- if (btf_type_is_ptr(t)) {
+
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ if (btf_is_kernel(btf)) {
+ const struct btf_type *reg_ref_t;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (!btf_type_is_struct(ref_t)) {
+ bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ func_name, i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (reg->type == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else if (reg2btf_ids[reg->type]) {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[reg->type];
+ } else {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname, regno);
+ return -EINVAL;
+ }
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+ &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf,
+ reg_ref_t->name_off);
+ if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+ reg->off, btf, ref_id)) {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname,
+ regno, btf_type_str(reg_ref_t),
+ reg_ref_tname);
+ return -EINVAL;
+ }
+ } else if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
- if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
- }
- if (check_ctx_reg(env, reg, i + 1))
- goto out;
- continue;
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
}
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (ptr_to_mem_ok) {
+ const struct btf_type *resolve_ret;
+ u32 type_size;
- if (!is_global)
- goto out;
-
- t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- ref_t = btf_resolve_size(btf, t, &type_size);
- if (IS_ERR(ref_t)) {
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
- PTR_ERR(ref_t));
- goto out;
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname,
+ PTR_ERR(resolve_ret));
+ return -EINVAL;
}
- if (check_mem_reg(env, reg, i + 1, type_size))
- goto out;
-
- continue;
+ if (check_mem_reg(env, reg, regno, type_size))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
}
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
}
+
return 0;
-out:
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
* In such cases mark the function as unreliable from BTF point of view.
*/
- prog->aux->func_info_aux[subprog].unreliable = true;
- return -EINVAL;
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs)
+{
+ return btf_check_func_arg_match(env, btf, func_id, regs, false);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5458,9 +5607,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
- tname, nargs);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
/* check that function returns int */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75244ecb2389..5e31ee9f7512 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -143,25 +143,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
if (!prog->aux->nr_linfo || !prog->jit_requested)
return 0;
- prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
- sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!prog->aux->jited_linfo)
return -ENOMEM;
return 0;
}
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
- kfree(prog->aux->jited_linfo);
- prog->aux->jited_linfo = NULL;
-}
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
- if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
- bpf_prog_free_jited_linfo(prog);
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
}
/* The jit engine is responsible to provide an array
@@ -217,12 +217,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
- bpf_prog_free_jited_linfo(prog);
- kvfree(prog->aux->linfo);
-}
-
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -1369,11 +1363,10 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
* __bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
- * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
@@ -1707,7 +1700,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
@@ -1724,7 +1717,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
BPF_R3 = r3; \
BPF_R4 = r4; \
BPF_R5 = r5; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define EVAL1(FN, X) FN(X)
@@ -1849,9 +1842,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
+ bool jit_needed = false;
+
if (fp->bpf_func)
goto finalize;
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
@@ -1866,14 +1865,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
return fp;
fp = bpf_int_jit_compile(fp);
- if (!fp->jited) {
- bpf_prog_free_jited_linfo(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
return fp;
-#endif
- } else {
- bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
@@ -2354,6 +2349,11 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5d1469de6921..5dd3e866599a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -27,7 +27,7 @@
#include <linux/capability.h>
#include <trace/events/xdp.h>
-#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
/* General idea: XDP packets getting XDP redirected to another CPU,
@@ -252,11 +252,12 @@ static int cpu_map_kthread_run(void *data)
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
+ unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- unsigned int drops = 0, sched = 0;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
int i, n, m, nframes;
+ LIST_HEAD(list);
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -297,7 +298,7 @@ static int cpu_map_kthread_run(void *data)
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
- drops += nframes;
+ kmem_alloc_drops += nframes;
}
}
@@ -305,7 +306,6 @@ static int cpu_map_kthread_run(void *data)
for (i = 0; i < nframes; i++) {
struct xdp_frame *xdpf = frames[i];
struct sk_buff *skb = skbs[i];
- int ret;
skb = __xdp_build_skb_from_frame(xdpf, skb,
xdpf->dev_rx);
@@ -314,13 +314,13 @@ static int cpu_map_kthread_run(void *data)
continue;
}
- /* Inject into network stack */
- ret = netif_receive_skb_core(skb);
- if (ret == NET_RX_DROP)
- drops++;
+ list_add_tail(&skb->list, &list);
}
+ netif_receive_skb_list(&list);
+
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
+ sched, &stats);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map)
* complete.
*/
- bpf_clear_redirect_map(map);
synchronize_rcu();
/* For cpu_map the remote CPUs can still be using the entries
@@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map)
kfree(cmap);
}
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+}
+
static int cpu_map_btf_id;
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_cpu_map",
.map_btf_id = &cpu_map_btf_id,
+ .map_redirect = cpu_map_redirect,
};
static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 85d9d1b72a33..aa516472ce46 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -330,7 +329,7 @@ bool dev_map_can_have_prog(struct bpf_map *map)
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
- int sent = 0, drops = 0, err = 0;
+ int sent = 0, err = 0;
int i;
if (unlikely(!bq->count))
@@ -344,29 +343,23 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
if (sent < 0) {
+ /* If ndo_xdp_xmit fails with an errno, no frames have
+ * been xmit'ed.
+ */
err = sent;
sent = 0;
- goto error;
}
- drops = bq->count - sent;
-out:
- bq->count = 0;
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
- bq->dev_rx = NULL;
- __list_del_clearprev(&bq->flush_node);
- return;
-error:
- /* If ndo_xdp_xmit fails with an errno, no frames have been
- * xmit'ed and it's our responsibility to them free all.
+ /* If not all frames have been transmitted, it is our
+ * responsibility to free them
*/
- for (i = 0; i < bq->count; i++) {
- struct xdp_frame *xdpf = bq->q[i];
+ for (i = sent; unlikely(i < bq->count); i++)
+ xdp_return_frame_rx_napi(bq->q[i]);
- xdp_return_frame_rx_napi(xdpf);
- drops++;
- }
- goto out;
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
+ bq->dev_rx = NULL;
+ bq->count = 0;
+ __list_del_clearprev(&bq->flush_node);
}
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -392,7 +385,7 @@ void __dev_flush(void)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *obj;
@@ -735,6 +728,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
+static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+}
+
+static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+}
+
static int dev_map_btf_id;
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -747,6 +750,7 @@ const struct bpf_map_ops dev_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_dtab",
.map_btf_id = &dev_map_btf_id,
+ .map_redirect = dev_map_redirect,
};
static int dev_map_hash_map_btf_id;
@@ -761,6 +765,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_dtab",
.map_btf_id = &dev_map_hash_map_btf_id,
+ .map_redirect = dev_hash_map_redirect,
};
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index faa54d58972c..bbfc6bb79240 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
- if (insn->src_reg != BPF_PSEUDO_CALL &&
+ if (!insn->src_reg &&
insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
func_id_str[insn->imm])
return func_id_str[insn->imm];
- if (cbs && cbs->cb_call)
- return cbs->cb_call(cbs->private_data, insn);
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
return buff;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d63912e73ad9..d7ebb12ffffc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -31,7 +31,7 @@
/*
* The bucket lock has two protection scopes:
*
- * 1) Serializing concurrent operations from BPF programs on differrent
+ * 1) Serializing concurrent operations from BPF programs on different
* CPUs
*
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
@@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
+static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ u32 roundup_key_size;
+ int i, num_elems = 0;
+ void __percpu *pptr;
+ struct bucket *b;
+ void *key, *val;
+ bool is_percpu;
+ u64 ret = 0;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = htab_is_percpu(htab);
+
+ roundup_key_size = round_up(map->key_size, 8);
+ /* disable migration so percpu value prepared here will be the
+ * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ */
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ key = elem->key;
+ if (is_percpu) {
+ /* current cpu value for percpu map */
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ val = this_cpu_ptr(pptr);
+ } else {
+ val = elem->key + roundup_key_size;
+ }
+ num_elems++;
+ ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+ (u64)(long)key, (u64)(long)val,
+ (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret) {
+ rcu_read_unlock();
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_map_btf_id,
@@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_lru_map_btf_id,
@@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_percpu),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_percpu_map_btf_id,
@@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru_percpu),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_lru_percpu_map_btf_id,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 308427fe03a3..544773970dbc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
+ struct bpf_cgroup_storage *storage = NULL;
void *ptr;
+ int i;
- storage = this_cpu_read(bpf_cgroup_storage[stype]);
+ for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+ if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ continue;
+
+ storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+ break;
+ }
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
@@ -662,6 +669,322 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
+static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE)
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ fallthrough;
+#endif
+ case 'k':
+ return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ case 'u':
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ }
+
+ return -EINVAL;
+}
+
+/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p
+ */
+#define MAX_PRINTF_BUF_LEN 512
+
+struct bpf_printf_buf {
+ char tmp_buf[MAX_PRINTF_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
+static DEFINE_PER_CPU(int, bpf_printf_buf_used);
+
+static int try_get_fmt_tmp_buf(char **tmp_buf)
+{
+ struct bpf_printf_buf *bufs;
+ int used;
+
+ preempt_disable();
+ used = this_cpu_inc_return(bpf_printf_buf_used);
+ if (WARN_ON_ONCE(used > 1)) {
+ this_cpu_dec(bpf_printf_buf_used);
+ preempt_enable();
+ return -EBUSY;
+ }
+ bufs = this_cpu_ptr(&bpf_printf_buf);
+ *tmp_buf = bufs->tmp_buf;
+
+ return 0;
+}
+
+void bpf_bprintf_cleanup(void)
+{
+ if (this_cpu_read(bpf_printf_buf_used)) {
+ this_cpu_dec(bpf_printf_buf_used);
+ preempt_enable();
+ }
+}
+
+/*
+ * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
+ *
+ * Returns a negative value if fmt is an invalid format string or 0 otherwise.
+ *
+ * This can be used in two ways:
+ * - Format string verification only: when bin_args is NULL
+ * - Arguments preparation: in addition to the above verification, it writes in
+ * bin_args a binary representation of arguments usable by bstr_printf where
+ * pointers from BPF have been sanitized.
+ *
+ * In argument preparation mode, if 0 is returned, safe temporary buffers are
+ * allocated and bpf_bprintf_cleanup should be called to free them after use.
+ */
+int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+ u32 **bin_args, u32 num_args)
+{
+ char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ size_t sizeof_cur_arg, sizeof_cur_ip;
+ int err, i, num_spec = 0;
+ u64 cur_arg;
+ char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
+
+ fmt_end = strnchr(fmt, fmt_size, 0);
+ if (!fmt_end)
+ return -EINVAL;
+ fmt_size = fmt_end - fmt;
+
+ if (bin_args) {
+ if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
+ return -EBUSY;
+
+ tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN;
+ *bin_args = (u32 *)tmp_buf;
+ }
+
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (num_spec >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The string is zero-terminated so if fmt[i] != 0, we can
+ * always access fmt[i + 1], in the worst case it will be a 0
+ */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formatting field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 'p') {
+ sizeof_cur_arg = sizeof(long);
+
+ if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
+ fmt[i + 2] == 's') {
+ fmt_ptype = fmt[i + 1];
+ i += 2;
+ goto fmt_str;
+ }
+
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
+ fmt[i + 1] == 'S') {
+ /* just kernel pointers */
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ i++;
+ goto nocopy_fmt;
+ }
+
+ if (fmt[i + 1] == 'B') {
+ if (tmp_buf) {
+ err = snprintf(tmp_buf,
+ (tmp_buf_end - tmp_buf),
+ "%pB",
+ (void *)(long)raw_args[num_spec]);
+ tmp_buf += (err + 1);
+ }
+
+ i++;
+ num_spec++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
+ (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ i += 2;
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
+ if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
+ sizeof_cur_ip);
+ if (err < 0)
+ memset(cur_ip, 0, sizeof_cur_ip);
+
+ /* hack: bstr_printf expects IP addresses to be
+ * pre-formatted as strings, ironically, the easiest way
+ * to do that is to call snprintf.
+ */
+ ip_spec[2] = fmt[i - 1];
+ ip_spec[3] = fmt[i];
+ err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
+ ip_spec, &cur_ip);
+
+ tmp_buf += err + 1;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 's') {
+ fmt_ptype = fmt[i];
+fmt_str:
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
+ fmt_ptype,
+ tmp_buf_end - tmp_buf);
+ if (err < 0) {
+ tmp_buf[0] = '\0';
+ err = 1;
+ }
+
+ tmp_buf += err;
+ num_spec++;
+
+ continue;
+ }
+
+ sizeof_cur_arg = sizeof(int);
+
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long);
+ i++;
+ }
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long long);
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
+ fmt[i] != 'x' && fmt[i] != 'X') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+nocopy_fmt:
+ if (tmp_buf) {
+ tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
+ if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (sizeof_cur_arg == 8) {
+ *(u32 *)tmp_buf = *(u32 *)&cur_arg;
+ *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
+ } else {
+ *(u32 *)tmp_buf = (u32)(long)cur_arg;
+ }
+ tmp_buf += sizeof_cur_arg;
+ }
+ num_spec++;
+ }
+
+ err = 0;
+out:
+ if (err)
+ bpf_bprintf_cleanup();
+ return err;
+}
+
+#define MAX_SNPRINTF_VARARGS 12
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+ const void *, data, u32, data_len)
+{
+ int err, num_args;
+ u32 *bin_args;
+
+ if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+ * can safely give an unbounded size.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ if (err < 0)
+ return err;
+
+ err = bstr_printf(str, str_size, fmt, bin_args);
+
+ bpf_bprintf_cleanup();
+
+ return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+ .func = bpf_snprintf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+ .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -708,6 +1031,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
default:
break;
}
@@ -748,6 +1073,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index d2de2abec35b..b4ebd60a6c16 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -816,8 +816,6 @@ static int __init bpf_init(void)
{
int ret;
- mutex_init(&bpf_preload_lock);
-
ret = sysfs_create_mount_point(fs_kobj, "bpf");
if (ret)
return ret;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
#include <linux/slab.h>
#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
#ifdef CONFIG_CGROUP_BPF
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index cec792a17e5f..1b7b8a6f34ee 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -726,6 +726,9 @@ const struct bpf_map_ops trie_map_ops = {
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
.map_btf_name = "lpm_trie",
.map_btf_id = &trie_map_btf_id,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 250503482cda..941ca06d9dfa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1694,7 +1694,9 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
- bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->jited_linfo);
+ kvfree(prog->aux->linfo);
+ kfree(prog->aux->kfunc_tab);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
@@ -2549,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_tracing_link, link);
info->tracing.attach_type = tr_link->attach_type;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &info->tracing.target_obj_id,
+ &info->tracing.target_btf_id);
return 0;
}
@@ -2643,14 +2648,25 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* target_btf_id using the link_create API.
*
* - if tgt_prog == NULL when this function was called using the old
- * raw_tracepoint_open API, and we need a target from prog->aux
- *
- * The combination of no saved target in prog->aux, and no target
- * specified on load is illegal, and we reject that here.
+ * raw_tracepoint_open API, and we need a target from prog->aux
+ *
+ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
+ * was detached and is going for re-attachment.
*/
if (!prog->aux->dst_trampoline && !tgt_prog) {
- err = -ENOENT;
- goto out_unlock;
+ /*
+ * Allow re-attach for TRACING and LSM programs. If it's
+ * currently linked, bpf_trampoline_link_prog will fail.
+ * EXT programs need to specify tgt_prog_fd, so they
+ * re-attach in separate code path.
+ */
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ btf_id = prog->aux->attach_btf_id;
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
if (!prog->aux->dst_trampoline ||
@@ -2946,6 +2962,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_MSG;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_SKB_VERDICT:
return BPF_PROG_TYPE_SK_SKB;
case BPF_LIRC_MODE2:
return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4aa8b52adf25..2d44b5aa0057 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -444,7 +444,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
tr->progs_cnt[kind]++;
err = bpf_trampoline_update(tr);
if (err) {
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
}
out:
@@ -467,7 +467,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
tr->extension_prog = NULL;
goto out;
}
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(tr);
out:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0399ac092b36..8fd552c16763 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,18 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_CALL;
}
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -248,6 +260,7 @@ struct bpf_call_arg_meta {
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
+ u32 subprogno;
};
struct btf *btf_vmlinux;
@@ -390,6 +403,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
env->prev_linfo = linfo;
}
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct tnum *range, const char *ctx,
+ const char *reg_name)
+{
+ char tn_buf[48];
+
+ verbose(env, "At %s the register %s ", ctx, reg_name);
+ if (!tnum_is_unknown(reg->var_off)) {
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ tnum_strn(tn_buf, sizeof(tn_buf), *range);
+ verbose(env, " should have been in %s\n", tn_buf);
+}
+
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
@@ -409,6 +440,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON;
}
@@ -451,7 +483,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type)
type == ARG_PTR_TO_MEM_OR_NULL ||
type == ARG_PTR_TO_CTX_OR_NULL ||
type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+ type == ARG_PTR_TO_STACK_OR_NULL;
}
/* Determine whether the function releases some resources allocated by another
@@ -541,6 +574,8 @@ static const char * const reg_type_str[] = {
[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
[PTR_TO_RDWR_BUF] = "rdwr_buf",
[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
};
static char slot_type_char[] = {
@@ -612,6 +647,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_KEY ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
verbose(env, ",ks=%d,vs=%d",
@@ -1362,9 +1398,7 @@ static bool __reg64_bound_s32(s64 a)
static bool __reg64_bound_u32(u64 a)
{
- if (a > U32_MIN && a < U32_MAX)
- return true;
- return false;
+ return a > U32_MIN && a < U32_MAX;
}
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
@@ -1375,10 +1409,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
reg->s32_min_value = (s32)reg->smin_value;
reg->s32_max_value = (s32)reg->smax_value;
}
- if (__reg64_bound_u32(reg->umin_value))
+ if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
reg->u32_min_value = (u32)reg->umin_value;
- if (__reg64_bound_u32(reg->umax_value))
reg->u32_max_value = (u32)reg->umax_value;
+ }
/* Intersecting with the old var_off might have improved our bounds
* slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
@@ -1519,39 +1553,210 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
}
ret = find_subprog(env, off);
if (ret >= 0)
- return 0;
+ return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
verbose(env, "too many subprograms\n");
return -E2BIG;
}
+ /* determine subprog starts. The end is one before the next starts */
env->subprog_info[env->subprog_cnt++].start = off;
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
+ return env->subprog_cnt - 1;
+}
+
+struct bpf_kfunc_desc {
+ struct btf_func_model func_model;
+ u32 func_id;
+ s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ /* func_id is not greater than BTF_MAX_TYPE */
+ return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+ struct bpf_kfunc_desc desc = {
+ .func_id = func_id,
+ };
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ return bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+ const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_desc_tab *tab;
+ struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_desc *desc;
+ const char *func_name;
+ unsigned long addr;
+ int err;
+
+ prog_aux = env->prog->aux;
+ tab = prog_aux->kfunc_tab;
+ if (!tab) {
+ if (!btf_vmlinux) {
+ verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required for calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!bpf_jit_supports_kfunc_call()) {
+ verbose(env, "JIT does not support calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->gpl_compatible) {
+ verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+ return -EINVAL;
+ }
+
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ prog_aux->kfunc_tab = tab;
+ }
+
+ if (find_kfunc_desc(env->prog, func_id))
+ return 0;
+
+ if (tab->nr_descs == MAX_KFUNC_DESCS) {
+ verbose(env, "too many different kernel function calls\n");
+ return -E2BIG;
+ }
+
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %u is not a function\n",
+ func_id);
+ return -EINVAL;
+ }
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ addr = kallsyms_lookup_name(func_name);
+ if (!addr) {
+ verbose(env, "cannot find address for kernel function %s\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ desc = &tab->descs[tab->nr_descs++];
+ desc->func_id = func_id;
+ desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+ err = btf_distill_func_proto(&env->log, btf_vmlinux,
+ func_proto, func_name,
+ &desc->func_model);
+ if (!err)
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id, NULL);
+ return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm > d1->imm)
+ return 1;
+ else if (d0->imm < d1->imm)
+ return -1;
return 0;
}
-static int check_subprogs(struct bpf_verifier_env *env)
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ if (!tab)
+ return;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
- int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
+ int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
- if (ret < 0)
+ if (ret)
return ret;
- /* determine subprog starts. The end is one before the next starts */
- for (i = 0; i < insn_cnt; i++) {
- if (!bpf_pseudo_call(insn + i))
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+ !bpf_pseudo_kfunc_call(insn))
continue;
+
if (!env->bpf_capable) {
- verbose(env,
- "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
- ret = add_subprog(env, i + insn[i].imm + 1);
+
+ if (bpf_pseudo_func(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ if (ret >= 0)
+ /* remember subprog */
+ insn[1].imm = ret;
+ } else if (bpf_pseudo_call(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ } else {
+ ret = add_kfunc_call(env, insn->imm);
+ }
+
if (ret < 0)
return ret;
}
@@ -1565,6 +1770,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
@@ -1873,6 +2088,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
return i;
}
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+ const struct btf_type *func;
+
+ if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+ return NULL;
+
+ func = btf_type_by_id(btf_vmlinux, insn->imm);
+ return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -1881,6 +2107,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u32 *reg_mask, u64 *stack_mask)
{
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -2295,6 +2522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_MEM_OR_NULL:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
return true;
default:
return false;
@@ -2899,6 +3128,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
reg = &cur_regs(env)[regno];
switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size);
@@ -3304,6 +3537,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -3405,7 +3641,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (!bpf_pseudo_call(insn + i))
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
@@ -3842,7 +4078,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
/* for access checks, reg->off is just part of off */
off += reg->off;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4258,6 +4506,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
if (check_map_access_type(env, regno, reg->off, access_size,
meta && meta->raw_mode ? BPF_WRITE :
@@ -4474,6 +4725,7 @@ static const struct bpf_reg_types map_key_value_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
@@ -4505,6 +4757,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
PTR_TO_RDONLY_BUF,
@@ -4517,6 +4770,7 @@ static const struct bpf_reg_types int_ptr_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
@@ -4529,6 +4783,9 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
+static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -4557,6 +4814,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4738,6 +4998,8 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_FUNC) {
+ meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
@@ -4805,6 +5067,44 @@ skip_type_check:
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
+ } else if (arg_type == ARG_PTR_TO_CONST_STR) {
+ struct bpf_map *map = reg->map_ptr;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
+
+ if (!bpf_map_is_rdonly(map)) {
+ verbose(env, "R%d does not point to a readonly map'\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a constant address'\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
}
return err;
@@ -5258,13 +5558,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
}
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx);
+
+static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
{
struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
- int i, err, subprog, target_insn;
+ int err;
bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
@@ -5273,14 +5579,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -E2BIG;
}
- target_insn = *insn_idx + insn->imm;
- subprog = find_subprog(env, target_insn + 1);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn + 1);
- return -EFAULT;
- }
-
caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
@@ -5291,7 +5589,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_arg_match(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -5335,11 +5633,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err)
return err;
- /* copy r1 - r5 args that callee can access. The copy includes parent
- * pointers, which connects us up to the liveness chain
- */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- callee->regs[i] = caller->regs[i];
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
clear_caller_saved_regs(env, caller->regs);
@@ -5347,7 +5643,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
state->curframe++;
/* and go analyze first insn of the callee */
- *insn_idx = target_insn;
+ *insn_idx = env->subprog_info[subprog].start - 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
@@ -5358,6 +5654,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee)
+{
+ /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+ * void *callback_ctx, u64 flags);
+ * callback_fn(struct bpf_map *map, void *key, void *value,
+ * void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx)
+{
+ int i;
+
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ int subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n",
+ target_insn);
+ return -EFAULT;
+ }
+
+ return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ if (bpf_map_ptr_poisoned(insn_aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5380,8 +5762,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
state->curframe--;
caller = state->frame[state->curframe];
- /* return to the caller whatever r0 had in the callee */
- caller->regs[BPF_REG_0] = *r0;
+ if (callee->in_callback_fn) {
+ /* enforce R0 return value range [0, 1]. */
+ struct tnum range = tnum_range(0, 1);
+
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
+ return -EACCES;
+ }
+ if (!tnum_in(range, r0->var_off)) {
+ verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+ return -EINVAL;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
/* Transfer references to the caller */
err = transfer_reference_state(caller, callee);
@@ -5409,6 +5805,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
if (ret_type != RET_INTEGER ||
(func_id != BPF_FUNC_get_stack &&
+ func_id != BPF_FUNC_get_task_stack &&
func_id != BPF_FUNC_probe_read_str &&
func_id != BPF_FUNC_probe_read_kernel_str &&
func_id != BPF_FUNC_probe_read_user_str))
@@ -5436,7 +5833,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_map_push_elem &&
func_id != BPF_FUNC_map_pop_elem &&
- func_id != BPF_FUNC_map_peek_elem)
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem &&
+ func_id != BPF_FUNC_redirect_map)
return 0;
if (map == NULL) {
@@ -5517,15 +5916,55 @@ static int check_reference_leak(struct bpf_verifier_env *env)
return state->acquired_refs ? -EINVAL : 0;
}
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+ struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+ struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ int err, fmt_map_off, num_args;
+ u64 fmt_addr;
+ char *fmt;
+
+ /* data must be an array of u64 */
+ if (data_len_reg->var_off.value % 8)
+ return -EINVAL;
+ num_args = data_len_reg->var_off.value / 8;
+
+ /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+ * and map_direct_value_addr is set.
+ */
+ fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+ fmt_map_off);
+ if (err) {
+ verbose(env, "verifier bug\n");
+ return -EFAULT;
+ }
+ fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+ /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
+ * can focus on validating the format specifiers.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ if (err < 0)
+ verbose(env, "Invalid format string\n");
+
+ return err;
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
bool changes_data;
- int i, err;
+ int i, err, func_id;
/* find function prototype */
+ func_id = insn->imm;
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
func_id);
@@ -5571,7 +6010,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
meta.func_id = func_id;
/* check args */
- for (i = 0; i < 5; i++) {
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
err = check_func_arg(env, i, &meta, fn);
if (err)
return err;
@@ -5621,6 +6060,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ if (func_id == BPF_FUNC_for_each_map_elem) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_map_elem_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
+ if (func_id == BPF_FUNC_snprintf) {
+ err = check_bpf_snprintf_call(env, regs);
+ if (err < 0)
+ return err;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -5776,6 +6228,98 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return 0;
}
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+ if (regno == BPF_REG_0) {
+ /* Function return value */
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = reg_size == sizeof(u64) ?
+ DEF_NOT_SUBREG : env->insn_idx + 1;
+ } else {
+ /* Function argument */
+ if (reg_size == sizeof(u64)) {
+ mark_insn_zext(env, reg);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ } else {
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+ }
+ }
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct btf_type *t, *func, *func_proto, *ptr_type;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ u32 i, nargs, func_id, ptr_type_id;
+ const struct btf_param *args;
+ int err;
+
+ func_id = insn->imm;
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+ if (!env->ops->check_kfunc_call ||
+ !env->ops->check_kfunc_call(func_id)) {
+ verbose(env, "calling kernel function %s is not allowed\n",
+ func_name);
+ return -EACCES;
+ }
+
+ /* Check the arguments */
+ err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+ if (err)
+ return err;
+
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ /* Check return type */
+ t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+ if (btf_type_is_scalar(t)) {
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+ } else if (btf_type_is_ptr(t)) {
+ ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+ &ptr_type_id);
+ if (!btf_type_is_struct(ptr_type)) {
+ ptr_type_name = btf_name_by_offset(btf_vmlinux,
+ ptr_type->name_off);
+ verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name, btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+ mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+ nargs = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nargs; i++) {
+ u32 regno = i + 1;
+
+ t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+ if (btf_type_is_ptr(t))
+ mark_btf_func_reg_size(env, regno, sizeof(void *));
+ else
+ /* scalar. ensured by btf_check_kfunc_arg_match() */
+ mark_btf_func_reg_size(env, regno, t->size);
+ }
+
+ return 0;
+}
+
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -5920,7 +6464,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
aux->alu_limit != alu_limit))
return REASON_PATHS;
- /* Corresponding fixup done in fixup_bpf_calls(). */
+ /* Corresponding fixup done in do_misc_fixups(). */
aux->alu_state = alu_state;
aux->alu_limit = alu_limit;
return 0;
@@ -8345,6 +8889,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = insn[1].imm;
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
+ return 0;
+ }
+
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
@@ -8573,17 +9135,7 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
- char tn_buf[48];
-
- verbose(env, "At program exit the register R0 ");
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
- }
- tnum_strn(tn_buf, sizeof(tn_buf), range);
- verbose(env, " should have been in %s\n", tn_buf);
+ verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
return -EINVAL;
}
@@ -8710,6 +9262,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
+static int visit_func_call_insn(int t, int insn_cnt,
+ struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret;
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+ if (ret)
+ return ret;
+
+ if (t + 1 < insn_cnt)
+ init_explored_state(env, t + 1);
+ if (visit_callee) {
+ init_explored_state(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+ env, false);
+ }
+ return ret;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -8720,6 +9293,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
struct bpf_insn *insns = env->prog->insnsi;
int ret;
+ if (bpf_pseudo_func(insns + t))
+ return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
BPF_CLASS(insns[t].code) != BPF_JMP32)
@@ -8730,18 +9306,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
- if (ret)
- return ret;
-
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
- if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
- }
- return ret;
+ return visit_func_call_insn(t, insn_cnt, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
if (BPF_SRC(insns[t].code) != BPF_K)
@@ -9354,6 +9920,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
*/
return false;
}
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
@@ -10037,6 +10604,7 @@ static int do_check(struct bpf_verifier_env *env)
if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -10184,7 +10752,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10199,11 +10768,12 @@ static int do_check(struct bpf_verifier_env *env)
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ err = check_kfunc_call(env, insn);
else
- err = check_helper_call(env, insn->imm, env->insn_idx);
+ err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
-
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
@@ -10632,6 +11202,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
goto next_insn;
}
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
@@ -10764,9 +11340,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
}
/* single env->prog->insni[off] instruction was replaced with the range
@@ -11405,6 +11985,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* subprog is encoded in insn[1].imm */
+ continue;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
/* Upon error here we cannot fall back to interpreter but
@@ -11494,6 +12080,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11534,6 +12121,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn[1].imm;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
@@ -11579,6 +12172,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
@@ -11590,7 +12188,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
- bpf_prog_free_unused_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++) {
@@ -11613,7 +12211,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
- bpf_prog_free_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return err;
}
@@ -11622,6 +12220,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
int i, depth;
#endif
int err = 0;
@@ -11635,6 +12234,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
/* When JIT fails the progs with bpf2bpf calls and tail_calls
* have to be rejected, since interpreter doesn't support them yet.
@@ -11643,6 +12246,14 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return -EINVAL;
}
for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);
@@ -11655,12 +12266,30 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-/* fixup insn->imm field of bpf_call instructions
- * and inline eligible helpers as explicit sequence of BPF instructions
- *
- * this function is called after eBPF program passed verification
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ /* insn->imm has the btf func_id. Replace it with
+ * an address (relative to __bpf_base_call).
+ */
+ desc = find_kfunc_desc(env->prog, insn->imm);
+ if (!desc) {
+ verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+ insn->imm);
+ return -EFAULT;
+ }
+
+ insn->imm = desc->imm;
+
+ return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
*/
-static int fixup_bpf_calls(struct bpf_verifier_env *env)
+static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
@@ -11675,6 +12304,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, ret, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ /* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
@@ -11715,6 +12345,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
@@ -11734,11 +12365,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn insn_buf[16];
struct bpf_insn *patch = &insn_buf[0];
bool issrc, isneg;
u32 off_reg;
@@ -11790,6 +12421,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = fixup_kfunc_call(env, insn);
+ if (ret)
+ return ret;
+ continue;
+ }
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
@@ -11882,7 +12519,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_delete_elem ||
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem)) {
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -11924,6 +12562,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
@@ -11950,11 +12591,16 @@ patch_map_ops_generic:
insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
__bpf_call_base;
continue;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CAST_CALL(ops->map_redirect) -
+ __bpf_call_base;
+ continue;
}
goto patch_call_imm;
}
+ /* Implement bpf_jiffies64 inline. */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_jiffies64) {
struct bpf_insn ld_jiffies_addr[2] = {
@@ -12010,6 +12656,8 @@ patch_call_imm:
}
}
+ sort_kfunc_descs_by_imm(env->prog);
+
return 0;
}
@@ -12120,7 +12768,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_func_arg_match(env, subprog, regs);
+ ret = btf_check_subprog_arg_match(env, subprog, regs);
if (ret == -EFAULT)
/* unlikely verifier bug. abort.
* ret == 0 and ret < 0 are sadly acceptable for
@@ -12720,6 +13368,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
+ ret = add_subprog_and_kfunc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -12770,7 +13422,7 @@ skip_full_check:
ret = convert_ctx_accesses(env);
if (ret == 0)
- ret = fixup_bpf_calls(env);
+ ret = do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
diff --git a/kernel/fork.c b/kernel/fork.c
index 0f1992d3f80b..0a5d28fe9990 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
+#include <linux/bpf.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
+ bpf_task_storage_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -2080,6 +2082,9 @@ static __latent_entropy struct task_struct *copy_process(
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ RCU_INIT_POINTER(p->bpf_storage, NULL);
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0151c8c206c4..f91d327273c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1026,6 +1026,65 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
+/**
+ * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or an error on write when the range check fails.
+ */
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp;
+ unsigned int min = 0, max = 255U, val;
+ u8 *data = table->data;
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = &min,
+ .max = &max,
+ };
+ int res;
+
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(u8))
+ return -EINVAL;
+
+ if (table->extra1) {
+ min = *(unsigned int *) table->extra1;
+ if (min > 255U)
+ return -EINVAL;
+ }
+ if (table->extra2) {
+ max = *(unsigned int *) table->extra2;
+ if (max > 255U)
+ return -EINVAL;
+ }
+
+ tmp = *table;
+
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+ val = *data;
+ res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+ if (res)
+ return res;
+ if (write)
+ *data = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
+
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
@@ -1574,6 +1633,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b0c45d923f0f..d2d7cf6cfe83 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -372,188 +372,34 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
-static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
- size_t bufsz)
-{
- void __user *user_ptr = (__force void __user *)unsafe_ptr;
-
- buf[0] = 0;
-
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)unsafe_ptr < TASK_SIZE) {
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
- fallthrough;
-#endif
- case 'k':
- strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
- break;
- case 'u':
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
-}
-
static DEFINE_RAW_SPINLOCK(trace_printk_lock);
-#define BPF_TRACE_PRINTK_SIZE 1024
+#define MAX_TRACE_PRINTK_VARARGS 3
+#define BPF_TRACE_PRINTK_SIZE 1024
-static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
+ u64, arg2, u64, arg3)
{
+ u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
+ u32 *bin_args;
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
- va_list ap;
int ret;
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
+ MAX_TRACE_PRINTK_VARARGS);
+ if (ret < 0)
+ return ret;
+
raw_spin_lock_irqsave(&trace_printk_lock, flags);
- va_start(ap, fmt);
- ret = vsnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
- /* vsnprintf() will not append null for zero-length strings */
- if (ret == 0)
- buf[0] = '\0';
+ ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+
trace_bpf_trace_printk(buf);
raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
- return ret;
-}
-
-/*
- * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
- */
-BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
- u64, arg2, u64, arg3)
-{
- int i, mod[3] = {}, fmt_cnt = 0;
- char buf[64], fmt_ptype;
- void *unsafe_ptr = NULL;
- bool str_seen = false;
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- return -EINVAL;
-
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
- return -EINVAL;
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt_cnt >= 3)
- return -EINVAL;
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- } else if (fmt[i] == 'p') {
- mod[fmt_cnt]++;
- if ((fmt[i + 1] == 'k' ||
- fmt[i + 1] == 'u') &&
- fmt[i + 2] == 's') {
- fmt_ptype = fmt[i + 1];
- i += 2;
- goto fmt_str;
- }
-
- if (fmt[i + 1] == 'B') {
- i++;
- goto fmt_next;
- }
-
- /* disallow any further format extensions */
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- goto fmt_next;
- } else if (fmt[i] == 's') {
- mod[fmt_cnt]++;
- fmt_ptype = fmt[i];
-fmt_str:
- if (str_seen)
- /* allow only one '%s' per fmt string */
- return -EINVAL;
- str_seen = true;
-
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- switch (fmt_cnt) {
- case 0:
- unsafe_ptr = (void *)(long)arg1;
- arg1 = (long)buf;
- break;
- case 1:
- unsafe_ptr = (void *)(long)arg2;
- arg2 = (long)buf;
- break;
- case 2:
- unsafe_ptr = (void *)(long)arg3;
- arg3 = (long)buf;
- break;
- }
-
- bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
- sizeof(buf));
- goto fmt_next;
- }
-
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- }
-
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x')
- return -EINVAL;
-fmt_next:
- fmt_cnt++;
- }
+ bpf_bprintf_cleanup();
-/* Horrid workaround for getting va_list handling working with different
- * argument type combinations generically for 32 and 64 bit archs.
- */
-#define __BPF_TP_EMIT() __BPF_ARG3_TP()
-#define __BPF_TP(...) \
- bpf_do_trace_printk(fmt, ##__VA_ARGS__)
-
-#define __BPF_ARG1_TP(...) \
- ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_TP(arg1, ##__VA_ARGS__) \
- : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
- : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
-
-#define __BPF_ARG2_TP(...) \
- ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
- : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
- : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
-
-#define __BPF_ARG3_TP(...) \
- ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
- : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
- : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
-
- return __BPF_TP_EMIT();
+ return ret;
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -581,184 +427,27 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
}
#define MAX_SEQ_PRINTF_VARARGS 12
-#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
-#define MAX_SEQ_PRINTF_STR_LEN 128
-
-struct bpf_seq_printf_buf {
- char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
-};
-static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
-static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
{
- int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
- int i, buf_used, copy_size, num_args;
- u64 params[MAX_SEQ_PRINTF_VARARGS];
- struct bpf_seq_printf_buf *bufs;
- const u64 *args = data;
-
- buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
- if (WARN_ON_ONCE(buf_used > 1)) {
- err = -EBUSY;
- goto out;
- }
-
- bufs = this_cpu_ptr(&bpf_seq_printf_buf);
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- goto out;
-
- if (data_len & 7)
- goto out;
-
- for (i = 0; i < fmt_size; i++) {
- if (fmt[i] == '%') {
- if (fmt[i + 1] == '%')
- i++;
- else if (!data || !data_len)
- goto out;
- }
- }
+ int err, num_args;
+ u32 *bin_args;
+ if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
num_args = data_len / 8;
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- /* only printable ascii for now. */
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
- err = -EINVAL;
- goto out;
- }
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt[i + 1] == '%') {
- i++;
- continue;
- }
-
- if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
- err = -E2BIG;
- goto out;
- }
-
- if (fmt_cnt >= num_args) {
- err = -EINVAL;
- goto out;
- }
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
-
- /* skip optional "[0 +-][num]" width formating field */
- while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
- fmt[i] == ' ')
- i++;
- if (fmt[i] >= '1' && fmt[i] <= '9') {
- i++;
- while (fmt[i] >= '0' && fmt[i] <= '9')
- i++;
- }
-
- if (fmt[i] == 's') {
- void *unsafe_ptr;
-
- /* try our best to copy */
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
-
- unsafe_ptr = (void *)(long)args[fmt_cnt];
- err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
- if (err < 0)
- bufs->buf[memcpy_cnt][0] = '\0';
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
-
- if (fmt[i] == 'p') {
- if (fmt[i + 1] == 0 ||
- fmt[i + 1] == 'K' ||
- fmt[i + 1] == 'x' ||
- fmt[i + 1] == 'B') {
- /* just kernel pointers */
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- continue;
- }
-
- /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
- if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
- err = -EINVAL;
- goto out;
- }
- if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
- err = -EINVAL;
- goto out;
- }
-
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
-
-
- copy_size = (fmt[i + 2] == '4') ? 4 : 16;
-
- err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- (void *) (long) args[fmt_cnt],
- copy_size);
- if (err < 0)
- memset(bufs->buf[memcpy_cnt], 0, copy_size);
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
- i += 2;
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
-
- if (fmt[i] == 'l') {
- i++;
- if (fmt[i] == 'l')
- i++;
- }
-
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x' &&
- fmt[i] != 'X') {
- err = -EINVAL;
- goto out;
- }
+ err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ if (err < 0)
+ return err;
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- }
+ seq_bprintf(m, fmt, bin_args);
- /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
- * all of them to seq_printf().
- */
- seq_printf(m, fmt, params[0], params[1], params[2], params[3],
- params[4], params[5], params[6], params[7], params[8],
- params[9], params[10], params[11]);
+ bpf_bprintf_cleanup();
- err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
-out:
- this_cpu_dec(bpf_seq_printf_buf_used);
- return err;
+ return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
@@ -1367,6 +1056,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_task_storage_get:
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ return &bpf_task_storage_delete_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
default:
return NULL;
}