diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/dev.c | 55 | ||||
-rw-r--r-- | net/core/devlink.c | 45 | ||||
-rw-r--r-- | net/core/ethtool.c | 23 | ||||
-rw-r--r-- | net/core/fib_rules.c | 36 | ||||
-rw-r--r-- | net/core/filter.c | 467 | ||||
-rw-r--r-- | net/core/neighbour.c | 146 | ||||
-rw-r--r-- | net/core/net_namespace.c | 6 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 322 | ||||
-rw-r--r-- | net/core/skbuff.c | 12 | ||||
-rw-r--r-- | net/core/skmsg.c | 802 | ||||
-rw-r--r-- | net/core/sock.c | 70 | ||||
-rw-r--r-- | net/core/sock_map.c | 1002 |
13 files changed, 2607 insertions, 381 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 80175e6a2eb8..fccd31e0e7f7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o +obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/dev.c b/net/core/dev.c index 0b2d777e5b9e..022ad73d6253 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1752,6 +1752,28 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); +/** + * call_netdevice_notifiers_mtu - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @arg: additional u32 argument passed to the notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ +static int call_netdevice_notifiers_mtu(unsigned long val, + struct net_device *dev, u32 arg) +{ + struct netdev_notifier_info_ext info = { + .info.dev = dev, + .ext.mtu = arg, + }; + + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); + + return call_netdevice_notifiers_info(val, &info.info); +} + #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); @@ -1954,6 +1976,17 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) return false; } +/** + * dev_nit_active - return true if any network interface taps are in use + * + * @dev: network device to check for the presence of taps + */ +bool dev_nit_active(struct net_device *dev) +{ + return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); +} +EXPORT_SYMBOL_GPL(dev_nit_active); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -3211,7 +3244,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) + if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -4258,6 +4291,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + __be16 orig_eth_type; + struct ethhdr *eth; + bool orig_bcast; int hlen, off; u32 mac_len; @@ -4298,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_hard_start = skb->data - skb_headroom(skb); orig_data_end = xdp->data_end; orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); xdp->rxq = &rxqueue->xdp_rxq; @@ -4321,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, } + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + } + switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -7575,14 +7622,16 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, err = __dev_set_mtu(dev, new_mtu); if (!err) { - err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ __dev_set_mtu(dev, orig_mtu); - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + new_mtu); } } return err; diff --git a/net/core/devlink.c b/net/core/devlink.c index 938f68ee92f0..3a4b29a13d31 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3012,6 +3012,8 @@ devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { + int len; + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) return -EINVAL; @@ -3027,10 +3029,13 @@ devlink_param_value_get_from_info(const struct devlink_param *param, value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); break; case DEVLINK_PARAM_TYPE_STRING: - if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > - DEVLINK_PARAM_MAX_STRING_VALUE) + len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]), + nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); + if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) || + len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; - value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + strcpy(value->vstr, + nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); break; case DEVLINK_PARAM_TYPE_BOOL: value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? @@ -3117,7 +3122,10 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { - param_item->driverinit_value = value; + if (param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, value.vstr); + else + param_item->driverinit_value = value; param_item->driverinit_value_valid = true; } else { if (!param->set) @@ -3504,7 +3512,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, start_offset = *((u64 *)&cb->args[0]); err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack); if (err) goto out; @@ -4557,7 +4565,10 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, DEVLINK_PARAM_CMODE_DRIVERINIT)) return -EOPNOTSUPP; - *init_val = param_item->driverinit_value; + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(init_val->vstr, param_item->driverinit_value.vstr); + else + *init_val = param_item->driverinit_value; return 0; } @@ -4588,7 +4599,10 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, DEVLINK_PARAM_CMODE_DRIVERINIT)) return -EOPNOTSUPP; - param_item->driverinit_value = init_val; + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, init_val.vstr); + else + param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); @@ -4621,6 +4635,23 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) EXPORT_SYMBOL_GPL(devlink_param_value_changed); /** + * devlink_param_value_str_fill - Safely fill-up the string preventing + * from overflow of the preallocated buffer + * + * @dst_val: destination devlink_param_value + * @src: source buffer + */ +void devlink_param_value_str_fill(union devlink_param_value *dst_val, + const char *src) +{ + size_t len; + + len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE); + WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE); +} +EXPORT_SYMBOL_GPL(devlink_param_value_str_fill); + +/** * devlink_region_create - create a new address region * * @devlink: devlink diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3144ef2bf136..4cc603dfc9ef 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -27,6 +27,7 @@ #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> +#include <net/xdp_sock.h> /* * Some useful ethtool_ops methods that're device independent. @@ -1662,8 +1663,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; + u16 from_channel, to_channel; u32 max_rx_in_use = 0; + unsigned int i; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1671,13 +1674,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; - dev->ethtool_ops->get_channels(dev, &max); + dev->ethtool_ops->get_channels(dev, &curr); /* ensure new counts are within the maximums */ - if ((channels.rx_count > max.max_rx) || - (channels.tx_count > max.max_tx) || - (channels.combined_count > max.max_combined) || - (channels.other_count > max.max_other)) + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow @@ -1687,6 +1690,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; + /* Disabling channels, query zero-copy AF_XDP sockets */ + from_channel = channels.combined_count + + min(channels.rx_count, channels.tx_count); + to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); + for (i = from_channel; i < to_channel; i++) + if (xdp_get_umem_from_qid(dev, i)) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 0ff3953f64aa..ffbb827723a2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1063,13 +1063,47 @@ skip: return err; } +static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct fib_rule_hdr *frh; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); + return -EINVAL; + } + + frh = nlmsg_data(nlh); + if (frh->dst_len || frh->src_len || frh->tos || frh->table || + frh->res1 || frh->res2 || frh->action || frh->flags) { + NL_SET_ERR_MSG(extack, + "Invalid values in header for fib rule dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); + return -EINVAL; + } + + return 0; +} + static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; - family = rtnl_msg_family(cb->nlh); + if (cb->strict_check) { + int err = fib_valid_dumprule_req(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); diff --git a/net/core/filter.c b/net/core/filter.c index 72db8afb7cb6..1a3ac6c46873 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -38,6 +38,7 @@ #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> +#include <linux/skmsg.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> @@ -58,13 +59,17 @@ #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> +#include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> #include <net/ip_fib.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> +#include <net/net_namespace.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> @@ -2138,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, - struct bpf_map *, map, void *, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { - .func = bpf_sk_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, - struct bpf_map *, map, u32, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_sk_redirect_map(struct sk_buff *skb) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - return tcb->bpf.sk_redir; -} - -static const struct bpf_func_proto bpf_sk_redirect_map_proto = { - .func = bpf_sk_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, - struct bpf_map *, map, void *, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { - .func = bpf_msg_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) -{ - return msg->sk_redir; -} - -static const struct bpf_func_proto bpf_msg_redirect_map_proto = { - .func = bpf_msg_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; @@ -2268,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; @@ -2282,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -#define sk_msg_iter_var(var) \ - do { \ - var++; \ - if (var == MAX_SKB_FRAGS) \ - var = 0; \ - } while (0) - -BPF_CALL_4(bpf_msg_pull_data, - struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0, poffset = 0; - int bytes = end - start, bytes_sg_total; - struct scatterlist *sg = msg->sg_data; - int first_sg, last_sg, i, shift; - unsigned char *p, *to, *from; + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ - i = msg->sg_start; + i = msg->sg.start; do { - len = sg[i].length; + len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; offset += len; - sk_msg_iter_var(i); - } while (i != msg->sg_end); + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; - first_sg = i; + first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg_copy[i] && bytes_sg_total <= len) + if (!msg->sg.copy[i] && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2334,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data, * will copy the entire sg entry. */ do { - copy += sg[i].length; - sk_msg_iter_var(i); + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; - } while (i != msg->sg_end); - last_sg = i; + } while (i != msg->sg.end); + last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; @@ -2348,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data, get_order(copy)); if (unlikely(!page)) return -ENOMEM; - p = page_address(page); - i = first_sg; + raw = page_address(page); + i = first_sge; do { - from = sg_virt(&sg[i]); - len = sg[i].length; - to = p + poffset; + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; memcpy(to, from, len); poffset += len; - sg[i].length = 0; - put_page(sg_page(&sg[i])); + sge->length = 0; + put_page(sg_page(sge)); - sk_msg_iter_var(i); - } while (i != last_sg); + sk_msg_iter_var_next(i); + } while (i != last_sge); - sg[first_sg].length = copy; - sg_set_page(&sg[first_sg], page, copy, 0); + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - WARN_ON_ONCE(last_sg == first_sg); - shift = last_sg > first_sg ? - last_sg - first_sg - 1 : - MAX_SKB_FRAGS - first_sg + last_sg - 1; + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + MAX_SKB_FRAGS - first_sge + last_sge - 1; if (!shift) goto out; - i = first_sg; - sk_msg_iter_var(i); + i = first_sge; + sk_msg_iter_var_next(i); do { - int move_from; + u32 move_from; - if (i + shift >= MAX_SKB_FRAGS) - move_from = i + shift - MAX_SKB_FRAGS; + if (i + shift >= MAX_MSG_FRAGS) + move_from = i + shift - MAX_MSG_FRAGS; else move_from = i + shift; - - if (move_from == msg->sg_end) + if (move_from == msg->sg.end) break; - sg[i] = sg[move_from]; - sg[move_from].length = 0; - sg[move_from].page_link = 0; - sg[move_from].offset = 0; - - sk_msg_iter_var(i); + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); } while (1); - msg->sg_end -= shift; - if (msg->sg_end < 0) - msg->sg_end += MAX_SKB_FRAGS; + + msg->sg.end = msg->sg.end - shift > msg->sg.end ? + msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift; out: - msg->data = sg_virt(&sg[first_sg]) + start - offset; + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; - return 0; } @@ -3923,8 +3802,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; - case SO_MAX_PACING_RATE: - sk->sk_max_pacing_rate = val; + case SO_MAX_PACING_RATE: /* 32bit version */ + sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; @@ -4813,6 +4692,149 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + bool refcounted = false; + struct sock *sk = NULL; + int dif = 0; + + if (skb->dev) + dif = skb->dev->ifindex; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + u16 hnum = ntohs(tuple->ipv6.dport); + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, hnum, + dif, sdif, &refcounted); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, hnum, + dif, sdif, + &udp_table, skb); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; +#endif /* CONFIG_INET */ + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -5019,6 +5041,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5051,6 +5081,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5074,6 +5107,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5095,6 +5131,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5119,6 +5158,14 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5394,23 +5441,29 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_attach_type(off, type, - prog->expected_attach_type)) - return false; if (!__sock_filter_check_size(off, size, info)) return false; return true; } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -6122,10 +6175,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; @@ -6835,22 +6888,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct sk_msg_md, data): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data)); + offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data_end)); + offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; @@ -6859,9 +6912,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; @@ -6871,9 +6924,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); @@ -6888,9 +6941,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + @@ -6909,9 +6962,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + @@ -6925,9 +6978,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD @@ -6939,9 +6992,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; @@ -7037,7 +7090,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = { const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, - .convert_ctx_access = sock_filter_convert_ctx_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fb023df48b83..69c41cb3966d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list) } } -static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { int i; struct neigh_hash_table *nht; @@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) np = &n->next; continue; } + if (skip_perm && n->nud_state & NUD_PERMANENT) { + np = &n->next; + continue; + } rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); @@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); -int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); return 0; } + +int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, true); + return 0; +} +EXPORT_SYMBOL(neigh_carrier_down); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, false); + return 0; +} EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) @@ -2164,15 +2183,47 @@ errout: return err; } +static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ndtmsg *ndtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); + return -EINVAL; + } + + ndtm = nlmsg_data(nlh); + if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); + return -EINVAL; + } + + return 0; +} + static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + if (cb->strict_check) { + int err = neightbl_valid_dump_info(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; @@ -2185,7 +2236,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; @@ -2200,7 +2251,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; @@ -2426,11 +2477,73 @@ out: } +static int neigh_valid_dump_req(const struct nlmsghdr *nlh, + bool strict_check, + struct neigh_dump_filter *filter, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + int err, i; + + if (strict_check) { + struct ndmsg *ndm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || + ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } + if (err < 0) + return err; + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + /* all new attributes should require strict_check */ + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); + return -EINVAL; + } + filter->dev_idx = nla_get_u32(tb[i]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); + return -EINVAL; + } + filter->master_idx = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; - struct nlattr *tb[NDA_MAX + 1]; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; @@ -2445,19 +2558,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); - if (!err) { - if (tb[NDA_IFINDEX]) { - if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) - return -EINVAL; - filter.dev_idx = nla_get_u32(tb[NDA_IFINDEX]); - } - if (tb[NDA_MASTER]) { - if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) - return -EINVAL; - filter.master_idx = nla_get_u32(tb[NDA_MASTER]); - } - } + err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); + if (err < 0 && cb->strict_check) + return err; + s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 670c84b1bfc2..fefe72774aeb 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -853,6 +853,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .s_idx = cb->args[0], }; + if (cb->strict_check && + nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { + NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); + return -EINVAL; + } + spin_lock_bh(&net->nsid_lock); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); spin_unlock_bh(&net->nsid_lock); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5564eee1e980..0958c7be2c22 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 48 +#define RTNL_MAX_TYPE 49 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -1878,8 +1878,52 @@ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); +static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, + bool strict_check, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int hdrlen; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); + return -EINVAL; + } + if (ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); + return -EINVAL; + } + + return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + } + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; @@ -1892,44 +1936,54 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; - int err; - int hdrlen; + int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; - /* A hack to preserve kernel<->userspace interface. - * The correct header is ifinfomsg. It is consistent with rtnl_getlink. - * However, before Linux v3.9 the code here assumed rtgenmsg and that's - * what iproute2 < v3.9.0 used. - * We can detect the old iproute2. Even including the IFLA_EXT_MASK - * attribute, its netlink message is shorter than struct ifinfomsg. - */ - hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? - sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); + if (err < 0) { + if (cb->strict_check) + return err; + + goto walk_entries; + } + + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; - if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, - ifla_policy, NULL) >= 0) { - if (tb[IFLA_TARGET_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + /* new attributes should only be added with strict checking */ + switch (i) { + case IFLA_TARGET_NETNSID: + netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); - if (IS_ERR(tgt_net)) + if (IS_ERR(tgt_net)) { + NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(tgt_net); + } + break; + case IFLA_EXT_MASK: + ext_filter_mask = nla_get_u32(tb[i]); + break; + case IFLA_MASTER: + master_idx = nla_get_u32(tb[i]); + break; + case IFLA_LINKINFO: + kind_ops = linkinfo_to_kind_ops(tb[i]); + break; + default: + if (cb->strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); + return -EINVAL; + } } - - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - - if (tb[IFLA_MASTER]) - master_idx = nla_get_u32(tb[IFLA_MASTER]); - - if (tb[IFLA_LINKINFO]) - kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); - - if (master_idx || kind_ops) - flags |= NLM_F_DUMP_FILTERED; } + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; + +walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; @@ -1941,8 +1995,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, + nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid); @@ -3746,22 +3799,66 @@ out: } EXPORT_SYMBOL(ndo_dflt_fdb_dump); -static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + if (err < 0) + return err; + + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); + return -EINVAL; + } + *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); + return -EINVAL; + } + *br_idx = nla_get_u32(tb[NDA_MASTER]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) { - struct net_device *dev; struct nlattr *tb[IFLA_MAX+1]; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; - struct ifinfomsg *ifm = nlmsg_data(cb->nlh); - struct net *net = sock_net(skb->sk); - struct hlist_head *head; - int brport_idx = 0; - int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; - int fidx = 0; + int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. @@ -3770,20 +3867,49 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ - if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && - (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + + if (nlmsg_len(nlh) != sizeof(struct ndmsg) && + (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); + struct ifinfomsg *ifm; + + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); + *br_idx = nla_get_u32(tb[IFLA_MASTER]); } - brport_idx = ifm->ifi_index; + ifm = nlmsg_data(nlh); + *brport_idx = ifm->ifi_index; } + return 0; +} + +static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + int brport_idx = 0; + int br_idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; + + if (cb->strict_check) + err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, + cb->extack); + else + err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, + cb->extack); + if (err < 0) + return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); @@ -3968,28 +4094,72 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); +static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, + bool strict_check, u32 *filter_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err, i; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } + if (err < 0) + return err; + + /* new attributes should only be added with strict checking */ + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_EXT_MASK: + *filter_mask = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; - u32 seq = cb->nlh->nlmsg_seq; + u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; - if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { - struct nlattr *extfilt; - - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), - IFLA_EXT_MASK); - if (extfilt) { - if (nla_len(extfilt) < sizeof(filter_mask)) - return -EINVAL; - - filter_mask = nla_get_u32(extfilt); - } - } + err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, + cb->extack); + if (err < 0 && cb->strict_check) + return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -4583,6 +4753,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -4599,13 +4770,32 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; - if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) + if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; + } ifsm = nlmsg_data(cb->nlh); + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (cb->strict_check) { + if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) { + NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); + return -EINVAL; + } + } + filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; + } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0e937d3d85b5..54b961de9538 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4394,14 +4394,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb_headlen(skb)) || - unlikely((int)start + off > skb_headlen(skb) - 2)) { - net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", - start, off, skb_headlen(skb)); + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", + start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; + skb->csum_start = csum_start; skb->csum_offset = off; skb_set_transport_header(skb, start); return true; diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000000..56a99d0c9aa0 --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,802 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/skmsg.h> +#include <linux/skbuff.h> +#include <linux/scatterlist.h> + +#include <net/sock.h> +#include <net/tcp.h> + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) + return -ENOMEM; + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + u32 sge_len, sge_off; + + if (sk_msg_full(dst)) + return -ENOSPC; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + sge_off = sge->offset + off; + if (sge_len > len) + sge_len = len; + off = 0; + len -= sge_len; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + msg->sg.start = i; +} +EXPORT_SYMBOL_GPL(sk_msg_return_zero); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = &msg->sg.data[i]; + int uncharge = (bytes < sge->length) ? bytes : sge->length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +} +EXPORT_SYMBOL_GPL(sk_msg_return); + +static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + u32 len = sge->length; + + if (charge) + sk_mem_uncharge(sk, len); + if (!msg->skb) + put_page(sg_page(sge)); + memset(sge, 0, sizeof(*sge)); + return len; +} + +static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int freed = 0; + + while (msg->sg.size) { + msg->sg.size -= sge->length; + freed += sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, msg->sg.size); + sge = sk_msg_elem(msg, i); + } + if (msg->skb) + consume_skb(msg->skb); + sk_msg_init(msg); + return freed; +} + +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, false); +} +EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); + +int sk_msg_free(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free); + +static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, + u32 bytes, bool charge) +{ + struct scatterlist *sge; + u32 i = msg->sg.start; + + while (bytes) { + sge = sk_msg_elem(msg, i); + if (!sge->length) + break; + if (bytes < sge->length) { + if (charge) + sk_mem_uncharge(sk, bytes); + sge->length -= bytes; + sge->offset += bytes; + msg->sg.size -= bytes; + break; + } + + msg->sg.size -= sge->length; + bytes -= sge->length; + sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, bytes); + } + msg->sg.start = i; +} + +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free_partial); + +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, false); +} + +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) +{ + int trim = msg->sg.size - len; + u32 i = msg->sg.end; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + sk_msg_iter_var_prev(i); + msg->sg.size = len; + while (msg->sg.data[i].length && + trim >= msg->sg.data[i].length) { + trim -= msg->sg.data[i].length; + sk_msg_free_elem(sk, msg, i, true); + sk_msg_iter_var_prev(i); + if (!trim) + goto out; + } + + msg->sg.data[i].length -= trim; + sk_mem_uncharge(sk, trim); +out: + /* If we trim data before curr pointer update copybreak and current + * so that any future copy operations start at new copy location. + * However trimed data that has not yet been used in a copy op + * does not require an update. + */ + if (msg->sg.curr >= i) { + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } + sk_msg_iter_var_next(i); + msg->sg.end = i; +} +EXPORT_SYMBOL_GPL(sk_msg_trim); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); + const int to_max_pages = MAX_MSG_FRAGS; + struct page *pages[MAX_MSG_FRAGS]; + ssize_t orig, copied, use, offset; + + orig = msg->sg.size; + while (bytes > 0) { + i = 0; + maxpages = to_max_pages - num_elems; + if (maxpages == 0) { + ret = -EFAULT; + goto out; + } + + copied = iov_iter_get_pages(from, pages, bytes, maxpages, + &offset); + if (copied <= 0) { + ret = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + bytes -= copied; + msg->sg.size += copied; + + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + sg_set_page(&msg->sg.data[msg->sg.end], + pages[i], use, offset); + sg_unmark_end(&msg->sg.data[msg->sg.end]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + sk_msg_iter_next(msg, end); + num_elems++; + i++; + } + /* When zerocopy is mixed with sk_msg_*copy* operations we + * may have a copybreak set in this case clear and prefer + * zerocopy remainder when possible. + */ + msg->sg.copybreak = 0; + msg->sg.curr = msg->sg.end; + } +out: + /* Revert iov_iter updates, msg will need to use 'trim' later if it + * also needs to be cleared. + */ + if (ret) + iov_iter_revert(from, msg->sg.size - orig); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); + +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int ret = -ENOSPC, i = msg->sg.curr; + struct scatterlist *sge; + u32 copy, buf_size; + void *to; + + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + ret = copy_from_iter_nocache(to, copy, from); + else + ret = copy_from_iter(to, copy, from); + if (ret != copy) { + ret = -EFAULT; + goto out; + } + bytes -= copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +out: + msg->sg.curr = i; + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + int copied = 0, num_sge; + struct sk_msg *msg; + + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!msg)) + return -EAGAIN; + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(msg); + return -EAGAIN; + } + + sk_msg_init(msg); + num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); + if (unlikely(num_sge < 0)) { + kfree(msg); + return num_sge; + } + + sk_mem_charge(sk, skb->len); + copied = skb->len; + msg->sg.start = 0; + msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge; + msg->skb = skb; + + sk_psock_queue_msg(psock, msg); + sk->sk_data_ready(sk); + return copied; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + if (ingress) + return sk_psock_skb_ingress(psock, skb); + else + return skb_send_sock_locked(psock->sk, skb, off, len); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb; + bool ingress; + u32 len, off; + int ret; + + /* Lock sock to avoid losing sk_socket during loop. */ + lock_sock(psock->sk); + if (state->skb) { + skb = state->skb; + len = state->len; + off = state->off; + state->skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->ingress_skb))) { + len = skb->len; + off = 0; +start: + ingress = tcp_skb_bpf_ingress(skb); + do { + ret = -EIO; + if (likely(psock->sk->sk_socket)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + state->skb = skb; + state->len = len; + state->off = off; + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + kfree_skb(skb); + goto end; + } + off += ret; + len -= ret; + } while (len); + + if (!ingress) + kfree_skb(skb); + } +end: + release_sock(psock->sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock = kzalloc_node(sizeof(*psock), + GFP_ATOMIC | __GFP_NOWARN, + node); + if (!psock) + return NULL; + + psock->sk = sk; + psock->eval = __SK_NONE; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_WORK(&psock->work, sk_psock_backlog); + INIT_LIST_HEAD(&psock->ingress_msg); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + rcu_assign_sk_user_data(sk, psock); + sock_hold(sk); + + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void sk_psock_zap_ingress(struct sk_psock *psock) +{ + __skb_queue_purge(&psock->ingress_skb); + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +static void sk_psock_destroy_deferred(struct work_struct *gc) +{ + struct sk_psock *psock = container_of(gc, struct sk_psock, gc); + + /* No sk_callback_lock since already detached. */ + if (psock->parser.enabled) + strp_done(&psock->parser.strp); + + cancel_work_sync(&psock->work); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_destroy(struct rcu_head *rcu) +{ + struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); + + INIT_WORK(&psock->gc, sk_psock_destroy_deferred); + schedule_work(&psock->gc); +} +EXPORT_SYMBOL_GPL(sk_psock_destroy); + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + rcu_assign_sk_user_data(sk, NULL); + sk_psock_cork_free(psock); + sk_psock_restore_proto(sk, psock); + + write_lock_bh(&sk->sk_callback_lock); + if (psock->progs.skb_parser) + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + + call_rcu_sched(&psock->rcu, sk_psock_destroy); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + preempt_disable(); + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = BPF_PROG_RUN(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = msg->sk_redir; + if (!psock->sk_redir) { + ret = __SK_DROP; + goto out; + } + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, + struct sk_buff *skb) +{ + int ret; + + skb->sk = psock->sk; + bpf_compute_data_end_sk_skb(skb); + preempt_disable(); + ret = BPF_PROG_RUN(prog, skb); + preempt_enable(); + /* strparser clones the skb before handing it to a upper layer, + * meaning skb_orphan has been called. We NULL sk on the way out + * to ensure we don't trigger a BUG_ON() in skb/sk operations + * later and because we are not charging the memory of this skb + * to any socket yet. + */ + skb->sk = NULL; + return ret; +} + +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + bool ingress; + + switch (verdict) { + case __SK_REDIRECT: + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) + goto out_free; + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) + goto out_free; + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + break; + } + /* fall-through */ + case __SK_DROP: + /* fall-through */ + default: +out_free: + kfree_skb(skb); + } +} + +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = __SK_DROP; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_orphan(skb); + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_verdict_apply(psock, skb, ret); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_parser); + if (likely(prog)) + ret = sk_psock_bpf_run(psock, prog, skb); + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. */ +static void sk_psock_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + rcu_read_unlock(); + write_space(sk); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} diff --git a/net/core/sock.c b/net/core/sock.c index 7e8796a6a089..6fcc4bc07d19 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -998,7 +998,7 @@ set_rcvbuf: cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = val; + sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; @@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, #endif case SO_MAX_PACING_RATE: - v.val = sk->sk_max_pacing_rate; + /* 32bit version */ + v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); break; case SO_INCOMING_CPU: @@ -2238,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) @@ -2810,8 +2750,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_ll_usec = sysctl_net_busy_read; #endif - sk->sk_max_pacing_rate = ~0U; - sk->sk_pacing_rate = ~0U; + sk->sk_max_pacing_rate = ~0UL; + sk->sk_pacing_rate = ~0UL; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000000..3c0e44cb811a --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1002 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/net.h> +#include <linux/workqueue.h> +#include <linux/skmsg.h> +#include <linux/list.h> +#include <linux/jhash.h> + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + raw_spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + u64 cost; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); + + /* Make sure page count doesn't overflow. */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_stab; + } + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sks = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (stab->sks) + return &stab->map; + err = -ENOMEM; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + u32 ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int ret; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + ret = sock_map_prog_update(map, prog, attr->attach_type); + fdput(f); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + preempt_disable(); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + preempt_enable(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + struct sk_psock_link *link, *tmp; + bool strp_stop = false; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct bpf_stab *stab = container_of(map, struct bpf_stab, + map); + if (psock->parser.enabled && stab->progs.skb_parser) + strp_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop) { + write_lock_bh(&sk->sk_callback_lock); + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, + struct sock *sk) +{ + struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + bool skb_progs, sk_psock_is_new = false; + struct sk_psock *psock; + int ret; + + skb_verdict = READ_ONCE(progs->skb_verdict); + skb_parser = READ_ONCE(progs->skb_parser); + skb_progs = skb_parser && skb_verdict; + if (skb_progs) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) + return PTR_ERR(skb_verdict); + skb_parser = bpf_prog_inc_not_zero(skb_parser); + if (IS_ERR(skb_parser)) { + bpf_prog_put(skb_verdict); + return PTR_ERR(skb_parser); + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out; + } + } + + psock = sk_psock_get(sk); + if (psock) { + if (!sk_has_psock(sk)) { + ret = -EBUSY; + goto out_progs; + } + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) { + ret = -ENOMEM; + goto out_progs; + } + sk_psock_is_new = true; + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (sk_psock_is_new) { + ret = tcp_bpf_init(sk); + if (ret < 0) + goto out_drop; + } else { + tcp_bpf_reinit(sk); + } + + write_lock_bh(&sk->sk_callback_lock); + if (skb_progs && !psock->parser.enabled) { + ret = sk_psock_init_strp(sk, psock); + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + goto out_drop; + } + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + psock_set_prog(&psock->progs.skb_parser, skb_parser); + sk_psock_start_strp(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_drop: + sk_psock_put(sk, psock); +out_progs: + if (msg_parser) + bpf_prog_put(msg_parser); +out: + if (skb_progs) { + bpf_prog_put(skb_verdict); + bpf_prog_put(skb_parser); + } + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) + sock_map_unref(sk, psk); + } + raw_spin_unlock_bh(&stab->lock); + rcu_read_unlock(); + + bpf_map_area_free(stab->sks); + kfree(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk; + + raw_spin_lock_bh(&stab->lock); + sk = *psk; + if (!sk_test || sk_test == sk) + *psk = NULL; + raw_spin_unlock_bh(&stab->lock); + if (unlikely(!sk)) + return -EINVAL; + sock_map_unref(sk, psk); + return 0; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? *(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &stab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + raw_spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + u32 idx = *(u32 *)key; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_map_update_common(map, idx, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, +}; + +struct bpf_htab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[0]; +}; + +struct bpf_htab_bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bpf_htab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_htab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_htab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_htab *htab, + struct bpf_htab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem_probe, *elem = link_raw; + struct bpf_htab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. + */ + raw_spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 hash, key_size = map->key_size; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + raw_spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_htab_elem *old) +{ + struct bpf_htab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_elem *elem, *elem_new; + struct bpf_htab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &htab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. + */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_htab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + (u64) htab->elem_size * htab->map.max_entries; + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_htab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + struct hlist_node *node; + int i; + + synchronize_rcu(); + rcu_read_lock(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry_safe(elem, node, &bucket->head, node) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + } + raw_spin_unlock_bh(&bucket->lock); + } + rcu_read_unlock(); + + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_htab, map)->progs; + default: + break; + } + + return NULL; +} + +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + psock_set_prog(&progs->msg_parser, prog); + break; + case BPF_SK_SKB_STREAM_PARSER: + psock_set_prog(&progs->skb_parser, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + psock_set_prog(&progs->skb_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} |