From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having a failing refcounting operations are problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happening without any control from the users of mmap subsystem. Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This utlimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/net/ethernet') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 88fab6a82acf..06927ba5a3ae 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -46,9 +46,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, /* Grab a single ref to the map for our record. The prog destroy ndo * happens after free_used_maps(). */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) - return PTR_ERR(map); + bpf_map_inc(map); record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) { -- cgit v1.2.3-59-g8ed1b From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++----- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 9 ++----- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 ++---- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 24 +++++------------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 ++++---------- drivers/net/ethernet/qlogic/qede/qede_main.c | 8 ++---- drivers/net/virtio_net.c | 7 ++---- include/linux/bpf.h | 13 ++++------ kernel/bpf/inode.c | 5 ++-- kernel/bpf/syscall.c | 30 +++++++---------------- kernel/events/core.c | 7 ++---- 11 files changed, 40 insertions(+), 97 deletions(-) (limited to 'drivers/net/ethernet') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c07172429c70..9da4fbee3cf7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3171,13 +3171,8 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr) bnxt_init_rxbd_pages(ring, type); if (BNXT_RX_PAGE_MODE(bp) && bp->xdp_prog) { - rxr->xdp_prog = bpf_prog_add(bp->xdp_prog, 1); - if (IS_ERR(rxr->xdp_prog)) { - int rc = PTR_ERR(rxr->xdp_prog); - - rxr->xdp_prog = NULL; - return rc; - } + bpf_prog_add(bp->xdp_prog, 1); + rxr->xdp_prog = bp->xdp_prog; } prod = rxr->rx_prod; for (i = 0; i < bp->rx_ring_size; i++) { diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 40a44dcb3d9b..f28409279ea4 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1876,13 +1876,8 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) if (nic->xdp_prog) { /* Attach BPF program */ - nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1); - if (!IS_ERR(nic->xdp_prog)) { - bpf_attached = true; - } else { - ret = PTR_ERR(nic->xdp_prog); - nic->xdp_prog = NULL; - } + bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1); + bpf_attached = true; } /* Calculate Tx queues needed for XDP and network stack */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index c26c0a7cbb6b..acc56606d3a5 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1807,11 +1807,8 @@ static int setup_xdp(struct net_device *dev, struct bpf_prog *prog) if (prog && !xdp_mtu_valid(priv, dev->mtu)) return -EINVAL; - if (prog) { - prog = bpf_prog_add(prog, priv->num_channels); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (prog) + bpf_prog_add(prog, priv->num_channels); up = netif_running(dev); need_update = (!!priv->xdp_prog != !!prog); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 40ec5acf79c0..d4697beeacc2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2286,11 +2286,7 @@ int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, lockdep_is_held(&priv->mdev->state_lock)); if (xdp_prog && carry_xdp_prog) { - xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num); - if (IS_ERR(xdp_prog)) { - mlx4_en_free_resources(tmp); - return PTR_ERR(xdp_prog); - } + bpf_prog_add(xdp_prog, tmp->rx_ring_num); for (i = 0; i < tmp->rx_ring_num; i++) rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog, xdp_prog); @@ -2782,11 +2778,9 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) * program for a new one. */ if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) { - if (prog) { - prog = bpf_prog_add(prog, priv->rx_ring_num - 1); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (prog) + bpf_prog_add(prog, priv->rx_ring_num - 1); + mutex_lock(&mdev->state_lock); for (i = 0; i < priv->rx_ring_num; i++) { old_prog = rcu_dereference_protected( @@ -2807,13 +2801,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) if (!tmp) return -ENOMEM; - if (prog) { - prog = bpf_prog_add(prog, priv->rx_ring_num - 1); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto out; - } - } + if (prog) + bpf_prog_add(prog, priv->rx_ring_num - 1); mutex_lock(&mdev->state_lock); memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); @@ -2862,7 +2851,6 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) unlock_out: mutex_unlock(&mdev->state_lock); -out: kfree(tmp); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 772bfdbdeb9c..1d4a66fb466a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -408,12 +408,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->stats = &c->priv->channel_stats[c->ix].rq; INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work); - rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL; - if (IS_ERR(rq->xdp_prog)) { - err = PTR_ERR(rq->xdp_prog); - rq->xdp_prog = NULL; - goto err_rq_wq_destroy; - } + if (params->xdp_prog) + bpf_prog_inc(params->xdp_prog); + rq->xdp_prog = params->xdp_prog; rq_xdp_ix = rq->ix; if (xsk) @@ -4406,16 +4403,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) /* no need for full reset when exchanging programs */ reset = (!priv->channels.params.xdp_prog || !prog); - if (was_opened && !reset) { + if (was_opened && !reset) /* num_channels is invariant here, so we can take the * batched reference right upfront. */ - prog = bpf_prog_add(prog, priv->channels.num); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto unlock; - } - } + bpf_prog_add(prog, priv->channels.num); if (was_opened && reset) { struct mlx5e_channels new_channels = {}; diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 8d1c208f778f..1e26964fe4e9 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2107,12 +2107,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats) if (rc) goto out; - fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1); - if (IS_ERR(fp->rxq->xdp_prog)) { - rc = PTR_ERR(fp->rxq->xdp_prog); - fp->rxq->xdp_prog = NULL; - goto out; - } + bpf_prog_add(edev->xdp_prog, 1); + fp->rxq->xdp_prog = edev->xdp_prog; } if (fp->type & QEDE_FASTPATH_TX) { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5a635f028bdc..4d7d5434cc5d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2445,11 +2445,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (!prog && !old_prog) return 0; - if (prog) { - prog = bpf_prog_add(prog, vi->max_queue_pairs - 1); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (prog) + bpf_prog_add(prog, vi->max_queue_pairs - 1); /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 34a34445c009..fb606dc61a3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -485,7 +485,7 @@ struct bpf_func_info_aux { }; struct bpf_prog_aux { - atomic_t refcnt; + atomic64_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; @@ -770,9 +770,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); -struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); -struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); @@ -912,10 +912,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, - int i) +static inline void bpf_prog_add(struct bpf_prog *prog, int i) { - return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) @@ -926,9 +924,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog) { } -static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) +static inline void bpf_prog_inc(struct bpf_prog *prog) { - return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *__must_check diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2f17f24258dc..ecf42bec38c0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,7 +31,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - raw = bpf_prog_inc(raw); + bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); @@ -534,7 +534,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); - return bpf_prog_inc(prog); + bpf_prog_inc(prog); + return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 20030751b7a2..52fe4bacb330 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1339,7 +1339,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { + if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1445,16 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -/* prog's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) +void bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_sub(i, &prog->aux->refcnt); - return ERR_PTR(-EBUSY); - } - return prog; + atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); @@ -1465,13 +1458,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i) * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ - WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); + WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +void bpf_prog_inc(struct bpf_prog *prog) { - return bpf_prog_add(prog, 1); + atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); @@ -1480,12 +1473,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_prog_put(prog, false); - return ERR_PTR(-EBUSY); - } + refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); @@ -1523,7 +1511,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, goto out; } - prog = bpf_prog_inc(prog); + bpf_prog_inc(prog); out: fdput(f); return prog; @@ -1714,7 +1702,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->orig_prog = NULL; prog->jited = 0; - atomic_set(&prog->aux->refcnt, 1); + atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..73c616876597 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10477,12 +10477,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; -- cgit v1.2.3-59-g8ed1b