Diffstat (limited to 'net')
663 files changed, 24705 insertions, 10522 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index a068757eabaf..7b3341cef926 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -5,6 +5,7 @@
* (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
*/
+#include <linux/if_arp.h>
#include <linux/module.h>
#include <net/6lowpan.h>
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 788076b002b3..e40aa3e3641c 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -319,8 +319,7 @@ static void vlan_transfer_features(struct net_device *dev,
{
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
- netif_set_gso_max_size(vlandev, dev->gso_max_size);
- netif_set_gso_max_segs(vlandev, dev->gso_max_segs);
+ netif_inherit_tso_max(vlandev, dev);
if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 1a705a4ef7fa..5eaf38875554 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -129,6 +129,7 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev,
u32 skb_prio, u16 vlan_prio);
int vlan_dev_set_egress_priority(const struct net_device *dev,
u32 skb_prio, u16 vlan_prio);
+void vlan_dev_free_egress_priority(const struct net_device *dev);
int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
size_t size);
@@ -139,7 +140,6 @@ int vlan_check_real_dev(struct net_device *real_dev,
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
-void vlan_dev_uninit(struct net_device *dev);
bool vlan_dev_inherit_address(struct net_device *dev,
struct net_device *real_dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 26d031a43cc1..839f2020b015 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -573,8 +573,7 @@ static int vlan_dev_init(struct net_device *dev)
NETIF_F_ALL_FCOE;
dev->features |= dev->hw_features | NETIF_F_LLTX;
- netif_set_gso_max_size(dev, real_dev->gso_max_size);
- netif_set_gso_max_segs(dev, real_dev->gso_max_segs);
+ netif_inherit_tso_max(dev, real_dev);
if (dev->features & NETIF_F_VLAN_FEATURES)
netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
@@ -622,7 +621,7 @@ static int vlan_dev_init(struct net_device *dev)
}
/* Note: this function might be called multiple times for the same device. */
-void vlan_dev_uninit(struct net_device *dev)
+void vlan_dev_free_egress_priority(const struct net_device *dev)
{
struct vlan_priority_tci_mapping *pm;
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -636,6 +635,11 @@ void vlan_dev_uninit(struct net_device *dev)
}
}
+static void vlan_dev_uninit(struct net_device *dev)
+{
+ vlan_dev_free_egress_priority(dev);
+}
+
static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
netdev_features_t features)
{
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 0db85aeb119b..214532173536 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -182,11 +182,16 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
else if (dev->mtu > max_mtu)
return -EINVAL;
+ /* Note: If this initial vlan_changelink() fails, we need
+ * to call vlan_dev_free_egress_priority() to free memory.
+ */
err = vlan_changelink(dev, tb, data, extack);
+
if (!err)
err = register_vlan_dev(dev, extack);
+
if (err)
- vlan_dev_uninit(dev);
+ vlan_dev_free_egress_priority(dev);
return err;
}
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index 08bf6c839e25..7825c129742a 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -280,7 +280,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
const struct vlan_priority_tci_mapping *mp
= vlan->egress_priority_map[i];
while (mp) {
- seq_printf(seq, "%u:%hu ",
+ seq_printf(seq, "%u:%d ",
mp->priority, ((mp->vlan_qos >> 13) & 0x7));
mp = mp->next;
}
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index 64468c49791f..deabbd376cb1 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -15,6 +15,13 @@ menuconfig NET_9P
if NET_9P
+config NET_9P_FD
+ default NET_9P
+ tristate "9P FD Transport"
+ help
+ This builds support for transports over TCP, Unix sockets and
+ filedescriptors.
+
config NET_9P_VIRTIO
depends on VIRTIO
tristate "9P Virtio Transport"
diff --git a/net/9p/Makefile b/net/9p/Makefile
index aa0a5641e5d0..1df9b344c30b 100644
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o
obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o
obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
@@ -9,9 +10,11 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
client.o \
error.o \
protocol.o \
- trans_fd.o \
trans_common.o \
+9pnet_fd-objs := \
+ trans_fd.o \
+
9pnet_virtio-objs := \
trans_virtio.o \
diff --git a/net/9p/client.c b/net/9p/client.c
index d062f1e5bfb0..8bba0d9cf975 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1038,8 +1038,13 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
if (err)
goto put_trans;
- if (clnt->msize > clnt->trans_mod->maxsize)
+ if (clnt->msize > clnt->trans_mod->maxsize) {
clnt->msize = clnt->trans_mod->maxsize;
+ pr_info("Limiting 'msize' to %d as this is the maximum "
+ "supported by transport %s\n",
+ clnt->msize, clnt->trans_mod->name
+ );
+ }
if (clnt->msize < 4096) {
p9_debug(P9_DEBUG_ERROR,
diff --git a/net/9p/mod.c b/net/9p/mod.c
index c37fc201a944..55576c1866fa 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -83,7 +83,7 @@ void v9fs_unregister_trans(struct p9_trans_module *m)
}
EXPORT_SYMBOL(v9fs_unregister_trans);
-static struct p9_trans_module *_p9_get_trans_by_name(char *s)
+static struct p9_trans_module *_p9_get_trans_by_name(const char *s)
{
struct p9_trans_module *t, *found = NULL;
@@ -106,7 +106,7 @@ static struct p9_trans_module *_p9_get_trans_by_name(char *s)
 * @s: string identifying transport
 *
 */
-struct p9_trans_module *v9fs_get_trans_by_name(char *s)
+struct p9_trans_module *v9fs_get_trans_by_name(const char *s)
{
struct p9_trans_module *found = NULL;
@@ -123,6 +123,10 @@ struct p9_trans_module *v9fs_get_trans_by_name(char *s)
}
EXPORT_SYMBOL(v9fs_get_trans_by_name);
+static const char * const v9fs_default_transports[] = {
+ "virtio", "tcp", "fd", "unix", "xen", "rdma",
+};
+
/**
 * v9fs_get_default_trans - get the default transport
 *
@@ -131,6 +135,7 @@ EXPORT_SYMBOL(v9fs_get_trans_by_name);
struct p9_trans_module *v9fs_get_default_trans(void)
{
struct p9_trans_module *t, *found = NULL;
+ int i;
spin_lock(&v9fs_trans_lock);
@@ -148,6 +153,10 @@ struct p9_trans_module *v9fs_get_default_trans(void)
}
spin_unlock(&v9fs_trans_lock);
+
+ for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++)
+ found = v9fs_get_trans_by_name(v9fs_default_transports[i]);
+
return found;
}
EXPORT_SYMBOL(v9fs_get_default_trans);
@@ -177,7 +186,6 @@ static int __init init_p9(void)
p9_error_init();
pr_info("Installing 9P2000 support\n");
- p9_trans_fd_init();
return ret;
}
@@ -191,7 +199,6 @@ static void __exit exit_p9(void)
{
pr_info("Unloading 9P2000 support\n");
- p9_trans_fd_exit();
p9_client_exit();
}
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 827c47620fc0..8f8f95e39b03 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -1090,6 +1090,7 @@ static struct p9_trans_module p9_tcp_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("tcp");
static struct p9_trans_module p9_unix_trans = {
.name = "unix",
@@ -1103,6 +1104,7 @@ static struct p9_trans_module p9_unix_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("unix");
static struct p9_trans_module p9_fd_trans = {
.name = "fd",
@@ -1116,6 +1118,7 @@ static struct p9_trans_module p9_fd_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("fd");
/**
 * p9_poll_workfn - poll worker thread
@@ -1149,7 +1152,7 @@ static void p9_poll_workfn(struct work_struct *work)
p9_debug(P9_DEBUG_TRANS, "finish\n");
}
-int p9_trans_fd_init(void)
+static int __init p9_trans_fd_init(void)
{
v9fs_register_trans(&p9_tcp_trans);
v9fs_register_trans(&p9_unix_trans);
@@ -1158,10 +1161,17 @@ int p9_trans_fd_init(void)
return 0;
}
-void p9_trans_fd_exit(void)
+static void __exit p9_trans_fd_exit(void)
{
flush_work(&p9_poll_work);
v9fs_unregister_trans(&p9_tcp_trans);
v9fs_unregister_trans(&p9_unix_trans);
v9fs_unregister_trans(&p9_fd_trans);
}
+
+module_init(p9_trans_fd_init);
+module_exit(p9_trans_fd_exit);
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Filedescriptor Transport for 9P");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index bd5a89c4960d..b24a4fb0f0a2 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -648,7 +648,7 @@ fail:
 * @args: args passed from sys_mount() for per-transport options (unused)
 *
 * This sets up a transport channel for 9p communication. Right now
- * we only match the first available channel, but eventually we couldlook up
+ * we only match the first available channel, but eventually we could look up
 * alternate channels by matching devname versus a virtio_config entry.
 * We use a simple reference count mechanism to ensure that only a single
 * mount has a channel open at a time.
@@ -721,7 +721,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
mutex_unlock(&virtio_9p_lock);
- vdev->config->reset(vdev);
+ virtio_reset_device(vdev);
vdev->config->del_vqs(vdev);
sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 2418fa0b58f3..833cd3792c51 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -279,13 +279,13 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
grant_ref_t ref;
ref = priv->rings[i].intf->ref[j];
- gnttab_end_foreign_access(ref, 0, 0);
+ gnttab_end_foreign_access(ref, NULL);
}
- free_pages((unsigned long)priv->rings[i].data.in,
- priv->rings[i].intf->ring_order -
- (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ free_pages_exact(priv->rings[i].data.in,
+ 1UL << (priv->rings[i].intf->ring_order +
+ XEN_PAGE_SHIFT));
}
- gnttab_end_foreign_access(priv->rings[i].ref, 0, 0);
+ gnttab_end_foreign_access(priv->rings[i].ref, NULL);
free_page((unsigned long)priv->rings[i].intf);
}
kfree(priv->rings);
@@ -322,8 +322,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
if (ret < 0)
goto out;
ring->ref = ret;
- bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- order - (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT),
+ GFP_KERNEL | __GFP_ZERO);
if (!bytes) {
ret = -ENOMEM;
goto out;
@@ -353,12 +353,10 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
out:
if (bytes) {
for (i--; i >= 0; i--)
- gnttab_end_foreign_access(ring->intf->ref[i], 0, 0);
- free_pages((unsigned long)bytes,
- ring->intf->ring_order -
- (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ gnttab_end_foreign_access(ring->intf->ref[i], NULL);
+ free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT));
}
- gnttab_end_foreign_access(ring->ref, 0, 0);
+ gnttab_end_foreign_access(ring->ref, NULL);
free_page((unsigned long)ring->intf);
return ret;
}
@@ -538,6 +536,7 @@ static void p9_trans_xen_exit(void)
}
module_exit(p9_trans_xen_exit);
+MODULE_ALIAS("xen:9pfs");
MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>");
MODULE_DESCRIPTION("Xen Transport for 9P");
MODULE_LICENSE("GPL");
diff --git a/net/Kconfig b/net/Kconfig
index 8a1f9d0287de..6b78f695caa6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -434,6 +434,19 @@ config NET_DEVLINK
config PAGE_POOL
bool
+config PAGE_POOL_STATS
+ default n
+ bool "Page pool stats"
+ depends on PAGE_POOL
+ help
+ Enable page pool statistics to track page allocation and recycling
+ in page pools. This option incurs additional CPU cost in allocation
+ and recycle paths and additional memory cost to store the statistics.
+ These statistics are only available if this option is enabled and if
+ the driver using the page pool supports exporting this data.
+
+ If unsure, say N.
+
config FAILOVER
tristate "Generic failover module"
help
diff --git a/net/Kconfig.debug b/net/Kconfig.debug
index 2f50611df858..e6ae11cc2fb7 100644
--- a/net/Kconfig.debug
+++ b/net/Kconfig.debug
@@ -17,3 +17,10 @@ config NET_NS_REFCNT_TRACKER
help
Enable debugging feature to track netns references.
This adds memory and cpu costs.
+
+config DEBUG_NET
+ bool "Add generic networking debug"
+ depends on DEBUG_KERNEL && NET
+ help
+ Enable extra sanity checks in networking.
+ This is mostly used by fuzzers, but is safe to select.
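The three Kconfig symbols introduced above (NET_9P_FD, PAGE_POOL_STATS, DEBUG_NET) are ordinary bool/tristate switches. As a minimal illustrative .config fragment enabling them (assuming a tree that already carries these patches; the extra lines are just the dependencies stated in the hunks above):

    CONFIG_NET_9P=m
    CONFIG_NET_9P_FD=m
    CONFIG_PAGE_POOL=y
    CONFIG_PAGE_POOL_STATS=y
    CONFIG_DEBUG_KERNEL=y
    CONFIG_DEBUG_NET=y

With NET_9P_FD built as a module, the MODULE_ALIAS_9P("tcp"), MODULE_ALIAS_9P("unix") and MODULE_ALIAS_9P("fd") lines added to trans_fd.c above let the now-separate 9pnet_fd module be autoloaded when a mount requests one of those transports by name, which is presumably why the unconditional p9_trans_fd_init()/p9_trans_fd_exit() calls are dropped from mod.c.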
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index bf5736c1d458..a06f4d4a6f47 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1753,8 +1753,7 @@ static int atalk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int err = 0;
struct sk_buff *skb;
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
lock_sock(sk);
if (!skb)
diff --git a/net/atm/common.c b/net/atm/common.c
index 1cfa9bf1d187..f7019df41c3e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -540,7 +540,7 @@ int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
!test_bit(ATM_VF_READY, &vcc->flags))
return 0;
- skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error);
+ skb = skb_recv_datagram(sk, flags, &error);
if (!skb)
return error;
@@ -553,7 +553,7 @@ int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
error = skb_copy_datagram_msg(skb, 0, msg, copied);
if (error)
return error;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (!(flags & MSG_PEEK)) {
pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc),
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 4369ffa3302a..9bf736290e48 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -108,7 +108,7 @@ out:
static inline void *vcc_walk(struct seq_file *seq, loff_t l)
{
struct vcc_state *state = seq->private;
- int family = (uintptr_t)(PDE_DATA(file_inode(seq->file)));
+ int family = (uintptr_t)(pde_data(file_inode(seq->file)));
return __vcc_walk(&state->sk, family, &state->bucket, l) ?
state : NULL;
@@ -324,7 +324,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
page = get_zeroed_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- dev = PDE_DATA(file_inode(file));
+ dev = pde_data(file_inode(file));
if (!dev->ops->proc_read)
length = -EINVAL;
else {
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 02f43f3e2c56..4c7030ed8d33 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -62,12 +62,12 @@ static void ax25_free_sock(struct sock *sk)
 */
static void ax25_cb_del(ax25_cb *ax25)
{
+ spin_lock_bh(&ax25_list_lock);
if (!hlist_unhashed(&ax25->ax25_node)) {
- spin_lock_bh(&ax25_list_lock);
hlist_del_init(&ax25->ax25_node);
- spin_unlock_bh(&ax25_list_lock);
ax25_cb_put(ax25);
}
+ spin_unlock_bh(&ax25_list_lock);
}
/*
@@ -77,21 +77,38 @@ static void ax25_kill_by_device(struct net_device *dev)
{
ax25_dev *ax25_dev;
ax25_cb *s;
+ struct sock *sk;
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
return;
+ ax25_dev->device_up = false;
spin_lock_bh(&ax25_list_lock);
again:
ax25_for_each(s, &ax25_list) {
if (s->ax25_dev == ax25_dev) {
+ sk = s->sk;
+ if (!sk) {
+ spin_unlock_bh(&ax25_list_lock);
+ ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ ax25_cb_del(s);
+ spin_lock_bh(&ax25_list_lock);
+ goto again;
+ }
+ sock_hold(sk);
spin_unlock_bh(&ax25_list_lock);
- lock_sock(s->sk);
- s->ax25_dev = NULL;
- release_sock(s->sk);
+ lock_sock(sk);
ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ if (sk->sk_socket) {
+ dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
+ ax25_cb_del(s);
+ release_sock(sk);
spin_lock_bh(&ax25_list_lock);
-
+ sock_put(sk);
/* The entry could have been deleted from the
 * list meanwhile and thus the next pointer is
 * no longer valid. Play it safe and restart
@@ -355,21 +372,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)))
return -EFAULT;
- if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL)
- return -ENODEV;
-
if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
return -EINVAL;
if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr);
+ if (!ax25_dev)
+ return -ENODEV;
+
digi.ndigi = ax25_ctl.digi_count;
for (k = 0; k < digi.ndigi; k++)
digi.calls[k] = ax25_ctl.digi_addr[k];
- if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL)
+ ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev);
+ if (!ax25) {
+ ax25_dev_put(ax25_dev);
return -ENOTCONN;
+ }
switch (ax25_ctl.cmd) {
case AX25_KILL:
@@ -436,6 +457,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
}
out_put:
+ ax25_dev_put(ax25_dev);
ax25_cb_put(ax25);
return ret;
@@ -962,21 +984,25 @@ static int ax25_release(struct socket *sock)
{
struct sock *sk = sock->sk;
ax25_cb *ax25;
+ ax25_dev *ax25_dev;
if (sk == NULL)
return 0;
sock_hold(sk);
- sock_orphan(sk);
lock_sock(sk);
+ sock_orphan(sk);
ax25 = sk_to_ax25(sk);
+ ax25_dev = ax25->ax25_dev;
if (sk->sk_type == SOCK_SEQPACKET) {
switch (ax25->state) {
case AX25_STATE_0:
- release_sock(sk);
- ax25_disconnect(ax25, 0);
- lock_sock(sk);
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ release_sock(sk);
+ ax25_disconnect(ax25, 0);
+ lock_sock(sk);
+ }
ax25_destroy_socket(ax25);
break;
@@ -1031,6 +1057,17 @@ static int ax25_release(struct socket *sock)
sk->sk_state_change(sk);
ax25_destroy_socket(ax25);
}
+ if (ax25_dev) {
+ if (!ax25_dev->device_up) {
+ del_timer_sync(&ax25->timer);
+ del_timer_sync(&ax25->t1timer);
+ del_timer_sync(&ax25->t2timer);
+ del_timer_sync(&ax25->t3timer);
+ del_timer_sync(&ax25->idletimer);
+ }
+ dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
sock->sk = NULL;
release_sock(sk);
@@ -1107,8 +1144,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
}
- if (ax25_dev != NULL)
+ if (ax25_dev) {
ax25_fillin_cb(ax25, ax25_dev);
+ dev_hold_track(ax25_dev->dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
+ }
done:
ax25_cb_add(ax25);
@@ -1622,9 +1661,12 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
+ struct sk_buff_head *sk_queue;
int copied;
int err = 0;
+ int off = 0;
+ long timeo;
lock_sock(sk);
/*
@@ -1636,11 +1678,29 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
goto out;
}
- /* Now we can treat all alike */
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
- if (skb == NULL)
- goto out;
+ /* We need support for non-blocking reads. */
+ sk_queue = &sk->sk_receive_queue;
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last);
+ /* If no packet is available, release_sock(sk) and try again.
+ */
+ if (!skb) {
+ if (err != -EAGAIN)
+ goto out;
+ release_sock(sk);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, &err,
+ &timeo, last)) {
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off,
+ &err, &last);
+ if (skb)
+ break;
+
+ if (err != -EAGAIN)
+ goto done;
+ }
+ if (!skb)
+ goto done;
+ lock_sock(sk);
+ }
if (!sk_to_ax25(sk)->pidincl)
skb_pull(skb, 1); /* Remove PID */
@@ -1687,6 +1747,7 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
out:
release_sock(sk);
+done:
return err;
}
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 256fadb94df3..95a76d571c44 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) {
res = ax25_dev;
+ ax25_dev_hold(ax25_dev);
}
spin_unlock_bh(&ax25_dev_lock);
@@ -56,10 +57,12 @@ void ax25_dev_device_up(struct net_device *dev)
return;
}
+ refcount_set(&ax25_dev->refcount, 1);
dev->ax25_ptr = ax25_dev;
ax25_dev->dev = dev;
dev_hold_track(dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
ax25_dev->forward = NULL;
+ ax25_dev->device_up = true;
ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE;
ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE;
@@ -84,6 +87,7 @@ void ax25_dev_device_up(struct net_device *dev)
ax25_dev->next = ax25_dev_list;
ax25_dev_list = ax25_dev;
spin_unlock_bh(&ax25_dev_lock);
+ ax25_dev_hold(ax25_dev);
ax25_register_dev_sysctl(ax25_dev);
}
@@ -112,27 +116,28 @@ void ax25_dev_device_down(struct net_device *dev)
if ((s = ax25_dev_list) == ax25_dev) {
ax25_dev_list = s->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev->ax25_ptr = NULL;
- dev_put_track(dev, &ax25_dev->dev_tracker);
- kfree(ax25_dev);
- return;
+ goto unlock_put;
}
while (s != NULL && s->next != NULL) {
if (s->next == ax25_dev) {
s->next = ax25_dev->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev->ax25_ptr = NULL;
- dev_put_track(dev, &ax25_dev->dev_tracker);
- kfree(ax25_dev);
- return;
+ goto unlock_put;
}
s = s->next;
}
spin_unlock_bh(&ax25_dev_lock);
dev->ax25_ptr = NULL;
+ ax25_dev_put(ax25_dev);
+ return;
+
+unlock_put:
+ spin_unlock_bh(&ax25_dev_lock);
+ ax25_dev_put(ax25_dev);
+ dev->ax25_ptr = NULL;
+ dev_put_track(dev, &ax25_dev->dev_tracker);
+ ax25_dev_put(ax25_dev);
}
@@ -144,20 +149,32 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
switch (cmd) {
case SIOCAX25ADDFWD:
- if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL)
+ fwd_dev = ax25_addr_ax25dev(&fwd->port_to);
+ if (!fwd_dev) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
- if (ax25_dev->forward != NULL)
+ }
+ if (ax25_dev->forward) {
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = fwd_dev->dev;
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
break;
case SIOCAX25DELFWD:
- if (ax25_dev->forward == NULL)
+ if (!ax25_dev->forward) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = NULL;
+ ax25_dev_put(ax25_dev);
break;
default:
+ ax25_dev_put(ax25_dev);
return -EINVAL;
}
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index d0b2e094bd55..b7c4d656a94b 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
ax25_dev *ax25_dev;
int i;
- if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
- return -EINVAL;
if (route->digi_count > AX25_MAX_DIGIS)
return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&route->port_addr);
+ if (!ax25_dev)
+ return -EINVAL;
+
write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
@@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if (route->digi_count != 0) {
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
}
}
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
ax25_rt = ax25_rt->next;
@@ -108,10 +112,10 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
- refcount_set(&ax25_rt->refcount, 1);
ax25_rt->callsign = route->dest_addr;
ax25_rt->dev = ax25_dev->dev;
ax25_rt->digipeat = NULL;
@@ -120,6 +124,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
kfree(ax25_rt);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -132,6 +137,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
ax25_rt->next = ax25_route_list;
ax25_route_list = ax25_rt;
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -160,12 +166,12 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
ax25cmp(&route->dest_addr, &s->callsign) == 0) {
if (ax25_route_list == s) {
ax25_route_list = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
} else {
for (t = ax25_route_list; t != NULL; t = t->next) {
if (t->next == s) {
t->next = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
break;
}
}
@@ -173,6 +179,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
}
}
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -215,6 +222,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
out:
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return err;
}
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 15ab812c4fe4..9ff98f46dc6b 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -261,12 +261,20 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY))
- ax25_stop_heartbeat(ax25);
- ax25_stop_t1timer(ax25);
- ax25_stop_t2timer(ax25);
- ax25_stop_t3timer(ax25);
- ax25_stop_idletimer(ax25);
+ if (reason == ENETUNREACH) {
+ del_timer_sync(&ax25->timer);
+ del_timer_sync(&ax25->t1timer);
+ del_timer_sync(&ax25->t2timer);
+ del_timer_sync(&ax25->t3timer);
+ del_timer_sync(&ax25->idletimer);
+ } else {
+ if (ax25->sk && !sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_stop_heartbeat(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+ }
ax25->state = AX25_STATE_0;
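The ax25 hunks above all implement one pattern: ax25_addr_ax25dev() now takes a reference on the ax25_dev it returns (via the added ax25_dev_hold()), so every caller must drop that reference with ax25_dev_put() on every exit path. A condensed sketch of the resulting caller shape, modeled on ax25_rt_add() above (illustrative, not a verbatim excerpt):

    ax25_dev = ax25_addr_ax25dev(&route->port_addr); /* lookup takes a hold */
    if (!ax25_dev)
        return -EINVAL;
    /* ... use ax25_dev ... */
    ax25_dev_put(ax25_dev); /* every return path drops the hold */
    return 0;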
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f94f538fa382..7f6a7c96ac92 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -13,13 +13,13 @@
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 71999e13f729..b6db999abf75 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -10,13 +10,13 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/minmax.h>
#include <linux/netdevice.h>
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 1d750f3cb2e4..033639df96d8 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -9,12 +9,12 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 2ed9496fc41f..37ce6cfb3520 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/crc16.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -64,7 +65,7 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv,
 */
static inline u32 batadv_choose_claim(const void *data, u32 size)
{
- struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
+ const struct batadv_bla_claim *claim = data;
u32 hash = 0;
hash = jhash(&claim->addr, sizeof(claim->addr), hash);
@@ -85,7 +86,7 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
const struct batadv_bla_backbone_gw *gw;
u32 hash = 0;
- gw = (struct batadv_bla_backbone_gw *)data;
+ gw = data;
hash = jhash(&gw->orig, sizeof(gw->orig), hash);
hash = jhash(&gw->vid, sizeof(gw->vid), hash);
@@ -443,7 +444,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac,
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
skb->len + ETH_HLEN);
- netif_rx_any_context(skb);
+ netif_rx(skb);
out:
batadv_hardif_put(primary_if);
}
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 2f008e329007..fefb51a5f606 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -11,6 +11,7 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -20,7 +21,6 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netlink.h>
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0899a729a23f..c120c7c6d25f 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -475,6 +475,17 @@ int batadv_frag_send_packet(struct sk_buff *skb,
goto free_skb;
}
+ /* GRO might have added fragments to the fragment list instead of
+ * frags[]. But this is not handled by skb_split and must be
+ * linearized to avoid incorrect length information after all
+ * batman-adv fragments were created and submitted to the
+ * hard-interface
+ */
+ if (skb_has_frag_list(skb) && __skb_linearize(skb)) {
+ ret = -ENOMEM;
+ goto free_skb;
+ }
+
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
frag_header.version = BATADV_COMPAT_VERSION;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index b7466136e292..d26124bc27e1 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -9,6 +9,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 8a2b78f9c4b2..b8f8da7ee3de 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -9,11 +9,11 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/gfp.h>
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/limits.h>
#include <linux/list.h>
@@ -149,25 +149,28 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
struct net *net = dev_net(net_dev);
struct net_device *parent_dev;
struct net *parent_net;
+ int iflink;
bool ret;
/* check if this is a batman-adv mesh interface */
if (batadv_softif_is_valid(net_dev))
return true;
- /* no more parents..stop recursion */
- if (dev_get_iflink(net_dev) == 0 ||
- dev_get_iflink(net_dev) == net_dev->ifindex)
+ iflink = dev_get_iflink(net_dev);
+ if (iflink == 0)
return false;
parent_net = batadv_getlink_net(net_dev, net);
+ /* iflink to itself, most likely physical device */
+ if (net == parent_net && iflink == net_dev->ifindex)
+ return false;
+
/* recurse over the parent device */
- parent_dev = __dev_get_by_index((struct net *)parent_net,
- dev_get_iflink(net_dev));
- /* if we got a NULL parent_dev there is something broken.. */
+ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink);
if (!parent_dev) {
- pr_err("Cannot find parent device\n");
+ pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n",
+ net_dev->name);
return false;
}
@@ -214,14 +217,15 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
struct net_device *real_netdev = NULL;
struct net *real_net;
struct net *net;
- int ifindex;
+ int iflink;
ASSERT_RTNL();
if (!netdev)
return NULL;
- if (netdev->ifindex == dev_get_iflink(netdev)) {
+ iflink = dev_get_iflink(netdev);
+ if (iflink == 0) {
dev_hold(netdev);
return netdev;
}
@@ -231,9 +235,16 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
goto out;
net = dev_net(hard_iface->soft_iface);
- ifindex = dev_get_iflink(netdev);
real_net = batadv_getlink_net(netdev, net);
+
+ /* iflink to itself, most likely physical device */
+ if (net == real_net && netdev->ifindex == iflink) {
+ real_netdev = netdev;
+ dev_hold(real_netdev);
+ goto out;
+ }
+
+ real_netdev = dev_get_by_index(real_net, iflink);
out:
batadv_hardif_put(hard_iface);
@@ -296,9 +307,11 @@ static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
if (!net_device)
return false;
+#if IS_ENABLED(CONFIG_CFG80211)
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
+#endif
return false;
}
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 5207cd8d6ad8..e8a449915566 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -9,6 +9,7 @@
#include <linux/atomic.h>
#include <linux/build_bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/crc32c.h>
#include <linux/device.h>
#include <linux/errno.h>
@@ -132,7 +133,6 @@ static void __exit batadv_exit(void)
rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
- flush_workqueue(batadv_event_workqueue);
destroy_workqueue(batadv_event_workqueue);
batadv_event_workqueue = NULL;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 494d1ebecac2..23f3d53f4b51 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2022.0"
+#define BATADV_SOURCE_VERSION "2022.2"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index f4004cf0ff6f..b238455913df 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -11,6 +11,7 @@
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -134,7 +135,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
{
struct inet6_dev *in6_dev = __in6_dev_get(dev);
- if (in6_dev && in6_dev->cnf.mc_forwarding)
+ if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
return BATADV_NO_FLAGS;
else
return BATADV_MCAST_WANT_NO_RTR6;
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 974d726fabb9..5f4aeeb60dc4 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -11,6 +11,7 @@
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -19,7 +20,6 @@
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index aadc653ca1d8..34903df4fe93 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -8,11 +8,11 @@
#include "main.h"
#include <linux/atomic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 477d85a3b558..0379b126865d 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -10,13 +10,13 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 2dbbe6c19609..0f5c0679b55a 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -11,6 +11,7 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -19,7 +20,6 @@
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 93730d30af54..7f3dd3c393e0 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -12,13 +12,13 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/err.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/kthread.h>
#include <linux/limits.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 4b7ad6684bc4..01d30c1e412c 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -13,6 +13,7 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/crc32c.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -21,7 +22,6 @@
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
@@ -103,10 +103,10 @@ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2)
 */
static inline u32 batadv_choose_tt(const void *data, u32 size)
{
- struct batadv_tt_common_entry *tt;
+ const struct batadv_tt_common_entry *tt;
u32 hash = 0;
- tt = (struct batadv_tt_common_entry *)data;
+ tt = data;
hash = jhash(&tt->addr, ETH_ALEN, hash);
hash = jhash(&tt->vid, sizeof(tt->vid), hash);
@@ -2766,7 +2766,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
u32 i;
tt_tot = batadv_tt_entries(tt_len);
- tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff;
+ tt_change = tvlv_buff;
if (!valid_cb)
return;
@@ -3994,7 +3994,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
if (tvlv_value_len < sizeof(*tt_data))
return;
- tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
+ tt_data = tvlv_value;
tvlv_value_len -= sizeof(*tt_data);
num_vlan = ntohs(tt_data->num_vlan);
@@ -4037,7 +4037,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
if (tvlv_value_len < sizeof(*tt_data))
return NET_RX_SUCCESS;
- tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
+ tt_data = tvlv_value;
tvlv_value_len -= sizeof(*tt_data);
tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data);
@@ -4129,7 +4129,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
goto out;
batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX);
- roaming_adv = (struct batadv_tvlv_roam_adv *)tvlv_value;
+ roaming_adv = tvlv_value;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received ROAMING_ADV from %pM (client %pM)\n",
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 0cb58eb04093..7ec2e2343884 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -7,10 +7,10 @@
#include "main.h"
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 133d7ea063fb..215af9b3b589 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -240,7 +240,7 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
if (!skb_cp)
return NET_RX_DROP;
- return netif_rx_ni(skb_cp);
+ return netif_rx(skb_cp);
}
static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
@@ -641,7 +641,6 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
return NULL;
peer->chan = chan;
- memset(&peer->peer_addr, 0, sizeof(struct in6_addr));
baswap((void *)peer->lladdr, &chan->dst);
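Several hunks above (bridge_loop_avoidance.c, bluetooth/6lowpan.c) and below (bnep/core.c) replace netif_rx_ni() and netif_rx_any_context() with plain netif_rx(), which by this point in the tree can be called from any context. The call-site change is a pure simplification, in the shape of:

    - return netif_rx_ni(skb_cp);   /* old: process-context variant */
    + return netif_rx(skb_cp);      /* new: one entry point for all contexts */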
EXPORT_SYMBOL(bt_accept_dequeue); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; struct sk_buff *skb; size_t copied; @@ -263,7 +262,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (flags & MSG_OOB) return -EOPNOTSUPP; - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) { if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; @@ -281,7 +280,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { - sock_recv_ts_and_drops(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, @@ -385,7 +384,7 @@ int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, copied += chunk; size -= chunk; - sock_recv_ts_and_drops(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); if (!(flags & MSG_PEEK)) { int skb_len = skb_headlen(skb); @@ -568,7 +567,7 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) EXPORT_SYMBOL(bt_sock_wait_state); /* This function expects the sk lock to be held when called */ -int bt_sock_wait_ready(struct sock *sk, unsigned long flags) +int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags) { DECLARE_WAITQUEUE(wait, current); unsigned long timeo; @@ -576,7 +575,7 @@ int bt_sock_wait_ready(struct sock *sk, unsigned long flags) BT_DBG("sk %p", sk); - timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + timeo = sock_sndtimeo(sk, !!(msg_flags & MSG_DONTWAIT)); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); @@ -611,7 +610,7 @@ EXPORT_SYMBOL(bt_sock_wait_ready); static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); @@ -619,7 +618,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos) static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } @@ -627,14 +626,14 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct seq_file *seq, void *v) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index c9add7753b9f..5a6a49885ab6 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -400,7 +400,7 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) dev->stats.rx_packets++; nskb->ip_summed = CHECKSUM_NONE; nskb->protocol = eth_type_trans(nskb, dev); - netif_rx_ni(nskb); + netif_rx(nskb); return 0; badframe: @@ -535,7 +535,7 @@ static int bnep_session(void *arg) up_write(&bnep_session_sem); free_netdev(dev); - 
module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 83eb84e8e688..90d130588a3e 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -323,7 +323,7 @@ static int cmtp_session(void *arg) up_write(&cmtp_session_sem); kfree(session); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c index 7e930f77ecab..7d77fb00c2bf 100644 --- a/net/bluetooth/eir.c +++ b/net/bluetooth/eir.c @@ -55,6 +55,19 @@ u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); } +u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data, + u8 data_len) +{ + eir[eir_len++] = sizeof(u8) + sizeof(uuid) + data_len; + eir[eir_len++] = EIR_SERVICE_DATA; + put_unaligned_le16(uuid, &eir[eir_len]); + eir_len += sizeof(uuid); + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) { u8 *ptr = data, *uuids_start = NULL; @@ -333,3 +346,21 @@ u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr) return scan_rsp_len; } + +void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len) +{ + while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, len))) { + u16 value = get_unaligned_le16(eir); + + if (uuid == value) { + if (len) + *len -= 2; + return &eir[2]; + } + + eir += *len; + eir_len -= *len; + } + + return NULL; +} diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h index 05e2e917fc25..62f2374078f2 100644 --- a/net/bluetooth/eir.h +++ b/net/bluetooth/eir.h @@ -14,6 +14,13 @@ u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len); u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len); +u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data, + u8 data_len); + +static inline u16 eir_precalc_len(u8 data_len) +{ + return sizeof(u8) * 2 + data_len; +} static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) @@ -36,6 +43,21 @@ static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) return eir_len; } +static inline u16 eir_skb_put_data(struct sk_buff *skb, u8 type, u8 *data, u8 data_len) +{ + u8 *eir; + u16 eir_len; + + eir_len = eir_precalc_len(data_len); + eir = skb_put(skb, eir_len); + WARN_ON(sizeof(type) + data_len > U8_MAX); + eir[0] = sizeof(type) + data_len; + eir[1] = type; + memcpy(&eir[2], data, data_len); + + return eir_len; +} + static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, size_t *data_len) { @@ -72,3 +94,5 @@ static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, return NULL; } + +void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 04ebe901e86f..ac06c9724c7f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -481,7 +481,7 @@ static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { - if (enhanced_sco_capable(conn->hdev)) + if (enhanced_sync_conn_capable(conn->hdev)) return hci_enhanced_setup_sync_conn(conn, handle); return hci_setup_sync_conn(conn, handle); @@ -669,7 +669,9 @@ static void le_conn_timeout(struct work_struct *work) if (conn->role 
== HCI_ROLE_SLAVE) { /* Disable LE Advertising */ le_disable_advertising(hdev); - hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); + hci_dev_lock(hdev); + hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); + hci_dev_unlock(hdev); return; } @@ -689,6 +691,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); + conn->handle = HCI_CONN_HANDLE_UNSET; conn->hdev = hdev; conn->type = type; conn->role = role; @@ -870,7 +873,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ -void hci_le_conn_failed(struct hci_conn *conn, u8 status) +static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; @@ -883,8 +886,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) params->conn = NULL; } - conn->state = BT_CLOSED; - /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to @@ -896,10 +897,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); - hci_connect_cfm(conn, status); - - hci_conn_del(conn); - /* Since we may have temporarily stopped the background scanning in * favor of connection establishment, we should restart it. */ @@ -911,6 +908,28 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) hci_enable_advertising(hdev); } +/* This function requires the caller holds hdev->lock */ +void hci_conn_failed(struct hci_conn *conn, u8 status) +{ + struct hci_dev *hdev = conn->hdev; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + switch (conn->type) { + case LE_LINK: + hci_le_conn_failed(conn, status); + break; + case ACL_LINK: + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + break; + } + + conn->state = BT_CLOSED; + hci_connect_cfm(conn, status); + hci_conn_del(conn); +} + static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; @@ -924,10 +943,11 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) bt_dev_err(hdev, "request failed to create LE connection: err %d", err); - if (!conn) + /* Check if connection is still pending */ + if (conn != hci_lookup_le_connect(hdev)) goto done; - hci_le_conn_failed(conn, err); + hci_conn_failed(conn, err); done: hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2b7bd3655b07..a0f99baafd35 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -571,6 +571,7 @@ int hci_dev_close(__u16 dev) goto done; } + cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); @@ -2153,7 +2154,7 @@ int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; - bitmap_from_u64(entry->flags, flags); + entry->flags = flags; list_add(&entry->list, list); @@ -2503,6 +2504,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); + INIT_LIST_HEAD(&hdev->monitored_devices); INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); @@ -2554,10 +2556,10 
@@ int hci_register_dev(struct hci_dev *hdev) */ switch (hdev->dev_type) { case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); + id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); break; case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL); + id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); break; default: return -EINVAL; @@ -2566,7 +2568,7 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - sprintf(hdev->name, "hci%d", id); + snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); @@ -2633,7 +2635,7 @@ int hci_register_dev(struct hci_dev *hdev) * callback. */ if (hdev->wakeup) - set_bit(HCI_CONN_FLAG_REMOTE_WAKEUP, hdev->conn_flags); + hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); @@ -2738,6 +2740,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree_skb(hdev->sent_cmd); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); @@ -3666,8 +3669,8 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) sco_recv_scodata(conn, skb); return; } else { - bt_dev_err(hdev, "SCO packet for unknown connection handle %d", - handle); + bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", + handle); } kfree_skb(skb); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fc30f4c03d29..af17dfb20e01 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1835,7 +1835,9 @@ static u8 hci_cc_le_clear_accept_list(struct hci_dev *hdev, void *data, if (rp->status) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); + hci_dev_unlock(hdev); return rp->status; } @@ -1855,8 +1857,10 @@ static u8 hci_cc_le_add_to_accept_list(struct hci_dev *hdev, void *data, if (!sent) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_add(&hdev->le_accept_list, &sent->bdaddr, sent->bdaddr_type); + hci_dev_unlock(hdev); return rp->status; } @@ -1876,8 +1880,10 @@ static u8 hci_cc_le_del_from_accept_list(struct hci_dev *hdev, void *data, if (!sent) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_del(&hdev->le_accept_list, &sent->bdaddr, sent->bdaddr_type); + hci_dev_unlock(hdev); return rp->status; } @@ -1949,9 +1955,11 @@ static u8 hci_cc_le_add_to_resolv_list(struct hci_dev *hdev, void *data, if (!sent) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr, sent->bdaddr_type, sent->peer_irk, sent->local_irk); + hci_dev_unlock(hdev); return rp->status; } @@ -1971,8 +1979,10 @@ static u8 hci_cc_le_del_from_resolv_list(struct hci_dev *hdev, void *data, if (!sent) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr, sent->bdaddr_type); + hci_dev_unlock(hdev); return rp->status; } @@ -1987,7 +1997,9 @@ static u8 hci_cc_le_clear_resolv_list(struct hci_dev *hdev, void *data, if (rp->status) return rp->status; + hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->le_resolv_list); + hci_dev_unlock(hdev); return rp->status; } @@ -2834,7 +2846,7 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * 
request completion callbacks used for connecting. */ if (status) @@ -2859,7 +2871,7 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * request completion callbacks used for connecting. */ if (status) @@ -3067,13 +3079,20 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { + /* In case of error status and there is no connection pending + * just unlock as there is nothing to cleanup. + */ + if (ev->status) + goto unlock; + /* Connection may not exist if auto-connected. Check the bredr * allowlist to see if this device is allowed to auto connect. * If link is an ACL type, create a connection class @@ -3106,8 +3125,25 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, } } - if (!ev->status) { + /* The HCI_Connection_Complete event is only sent once per connection. + * Processing it more than once per connection can corrupt kernel memory. + * + * As the connection handle is set here for the first time, it indicates + * whether the connection is already set up. + */ + if (conn->handle != HCI_CONN_HANDLE_UNSET) { + bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); + goto unlock; + } + + if (!status) { conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + goto done; + } if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; @@ -3148,19 +3184,14 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp), &cp); } - } else { - conn->state = BT_CLOSED; - if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); } if (conn->type == ACL_LINK) hci_sco_setup(conn, ev->status); - if (ev->status) { - hci_connect_cfm(conn, ev->status); - hci_conn_del(conn); +done: + if (status) { + hci_conn_failed(conn, status); } else if (ev->link_type == SCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_CVSD: @@ -3169,7 +3200,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, break; } - hci_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, status); } unlock: @@ -3206,10 +3237,12 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, return; } + hci_dev_lock(hdev); + if (hci_bdaddr_list_lookup(&hdev->reject_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); - return; + goto unlock; } /* Require HCI_CONNECTABLE or an accept list entry to accept the @@ -3221,13 +3254,11 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, !hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); - return; + goto unlock; } /* Connection accepted */ - hci_dev_lock(hdev); - ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); if (ie) memcpy(ie->data.dev_class, ev->dev_class, 3); @@ -3239,8 +3270,7 @@ static 
void hci_conn_request_evt(struct hci_dev *hdev, void *data, HCI_ROLE_SLAVE); if (!conn) { bt_dev_err(hdev, "no memory for new connection"); - hci_dev_unlock(hdev); - return; + goto unlock; } } @@ -3280,6 +3310,10 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, conn->state = BT_CONNECT2; hci_connect_cfm(conn, 0); } + + return; +unlock: + hci_dev_unlock(hdev); } static u8 hci_to_mgmt_reason(u8 err) @@ -4534,7 +4568,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata, if (!info) { bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x", HCI_EV_INQUIRY_RESULT_WITH_RSSI); - return; + goto unlock; } bacpy(&data.bdaddr, &info->bdaddr); @@ -4565,7 +4599,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata, if (!info) { bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x", HCI_EV_INQUIRY_RESULT_WITH_RSSI); - return; + goto unlock; } bacpy(&data.bdaddr, &info->bdaddr); @@ -4587,7 +4621,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata, bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x", HCI_EV_INQUIRY_RESULT_WITH_RSSI); } - +unlock: hci_dev_unlock(hdev); } @@ -4660,8 +4694,22 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_sync_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + switch (ev->link_type) { + case SCO_LINK: + case ESCO_LINK: + break; + default: + /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type + * for HCI_Synchronous_Connection_Complete is limited to + * either SCO or eSCO + */ + bt_dev_err(hdev, "Ignoring connect complete event for invalid link type"); + return; + } + + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); @@ -4684,24 +4732,28 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, goto unlock; } - switch (ev->status) { + /* The HCI_Synchronous_Connection_Complete event is only sent once per connection. + * Processing it more than once per connection can corrupt kernel memory. + * + * As the connection handle is set here for the first time, it indicates + * whether the connection is already set up. + */ + if (conn->handle != HCI_CONN_HANDLE_UNSET) { + bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection"); + goto unlock; + } + + switch (status) { case 0x00: - /* The synchronous connection complete event should only be - * sent once per new connection. Receiving a successful - * complete event when the connection status is already - * BT_CONNECTED means that the device is misbehaving and sent - * multiple complete event packets for the same new connection. - * - * Registering the device more than once can corrupt kernel - * memory, hence upon detecting this invalid event, we report - * an error and ignore the packet. 
- */ - if (conn->state == BT_CONNECTED) { - bt_dev_err(hdev, "Ignoring connect complete event for existing connection"); - goto unlock; + conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + conn->state = BT_CLOSED; + break; } - conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -4745,8 +4797,8 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, } } - hci_connect_cfm(conn, ev->status); - if (ev->status) + hci_connect_cfm(conn, status); + if (status) hci_conn_del(conn); unlock: @@ -5423,8 +5475,9 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (hcon) { + if (hcon && hcon->type == AMP_LINK) { hcon->state = BT_CLOSED; + hci_disconn_cfm(hcon, ev->reason); hci_conn_del(hcon); } @@ -5505,6 +5558,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn = hci_lookup_le_connect(hdev); if (!conn) { + /* In case of error status and there is no connection pending + * just unlock as there is nothing to cleanup. + */ + if (status) + goto unlock; + conn = hci_conn_add(hdev, LE_LINK, bdaddr, role); if (!conn) { bt_dev_err(hdev, "no memory for new connection"); @@ -5537,6 +5596,17 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, cancel_delayed_work(&conn->le_conn_timeout); } + /* The HCI_LE_Connection_Complete event is only sent once per connection. + * Processing it more than once per connection can corrupt kernel memory. + * + * As the connection handle is set here for the first time, it indicates + * whether the connection is already set up. + */ + if (conn->handle != HCI_CONN_HANDLE_UNSET) { + bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); + goto unlock; + } + le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa); /* Lookup the identity address from the stored connection @@ -5556,11 +5626,19 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL); - if (status) { - hci_le_conn_failed(conn, status); - goto unlock; + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle, + HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; } + /* All connection failure handling is taken care of by the + * hci_conn_failed function which is triggered by the HCI + * request completion callbacks used for connecting. + */ + if (status) + goto unlock; + if (conn->dst_type == ADDR_LE_DEV_PUBLIC) addr_type = BDADDR_LE_PUBLIC; else @@ -5670,8 +5748,6 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - adv = hci_find_adv_instance(hdev, ev->handle); - /* The Bluetooth Core 5.3 specification clearly states that this event * shall not be sent when the Host disables the advertising set. So in * case of HCI_ERROR_CANCELLED_BY_HOST, just ignore the event. 
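The hunk below applies the same fix pattern seen throughout this file: the advertising-instance lookup in hci_le_ext_adv_term_evt() moves under hci_dev_lock(), and every early return becomes a goto to a shared unlock label, so the lookup cannot race with instance removal. A minimal sketch of the resulting control flow; ext_adv_term_sketch is a hypothetical name, the per-status handling is elided, and this is not the literal kernel code:

static void ext_adv_term_sketch(struct hci_dev *hdev, u8 handle, u8 status)
{
	struct adv_info *adv;

	hci_dev_lock(hdev);

	/* The lookup now runs with hdev->lock held, so it cannot race
	 * with advertising-instance removal on another CPU.
	 */
	adv = hci_find_adv_instance(hdev, handle);
	if (status && !adv)
		goto unlock;

	/* ... per-status handling, each early exit via goto unlock ... */

unlock:
	hci_dev_unlock(hdev);
}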
@@ -5684,9 +5760,13 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, return; } + hci_dev_lock(hdev); + + adv = hci_find_adv_instance(hdev, ev->handle); + if (ev->status) { if (!adv) - return; + goto unlock; /* Remove advertising as it has been terminated */ hci_remove_adv_instance(hdev, ev->handle); @@ -5694,12 +5774,12 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) { if (adv->enabled) - return; + goto unlock; } /* We are no longer advertising, clear HCI_LE_ADV */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - return; + goto unlock; } if (adv) @@ -5714,16 +5794,19 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM || bacmp(&conn->resp_addr, BDADDR_ANY)) - return; + goto unlock; if (!ev->handle) { bacpy(&conn->resp_addr, &hdev->random_addr); - return; + goto unlock; } if (adv) bacpy(&conn->resp_addr, &adv->random_addr); } + +unlock: + hci_dev_unlock(hdev); } static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data, @@ -6798,7 +6881,7 @@ static const struct hci_ev { HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt, sizeof(struct hci_ev_num_comp_blocks)), /* [0xff = HCI_EV_VENDOR] */ - HCI_EV(HCI_EV_VENDOR, msft_vendor_evt, 0), + HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE), }; static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb, @@ -6823,8 +6906,9 @@ static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb, * decide if that is acceptable. */ if (skb->len > ev->max_len) - bt_dev_warn(hdev, "unexpected event 0x%2.2x length: %u > %u", - event, skb->len, ev->max_len); + bt_dev_warn_ratelimited(hdev, + "unexpected event 0x%2.2x length: %u > %u", + event, skb->len, ev->max_len); data = hci_ev_skb_pull(hdev, skb, event, ev->min_len); if (!data) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 42c8047a9897..38ecaf9264ee 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -261,7 +261,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, if (skb_queue_empty(&req->cmd_q)) bt_cb(skb)->hci.req_flags |= HCI_REQ_START; - bt_cb(skb)->hci.req_event = event; + hci_skb_event(skb) = event; skb_queue_tail(&req->cmd_q, skb); } @@ -482,7 +482,7 @@ static int add_to_accept_list(struct hci_request *req, /* During suspend, only wakeable devices can be in accept list */ if (hdev->suspended && - !test_bit(HCI_CONN_FLAG_REMOTE_WAKEUP, params->flags)) + !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) return 0; *num_entries += 1; @@ -2260,6 +2260,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; + hci_dev_lock(hdev); if (hci_is_adv_monitoring(hdev)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one @@ -2276,6 +2277,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) */ filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } + hci_dev_unlock(hdev); hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 33b3c0ffc339..189e3115c8c6 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1453,7 +1453,6 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, static int hci_sock_recvmsg(struct socket *sock, 
struct msghdr *msg, size_t len, int flags) { - int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; @@ -1470,7 +1469,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, if (sk->sk_state == BT_CLOSED) return 0; - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0feb68f12545..1739e8cb3291 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -276,40 +276,37 @@ EXPORT_SYMBOL(__hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); - struct hci_cmd_sync_work_entry *entry; - hci_cmd_sync_work_func_t func; - hci_cmd_sync_work_destroy_t destroy; - void *data; bt_dev_dbg(hdev, ""); - mutex_lock(&hdev->cmd_sync_work_lock); - entry = list_first_entry(&hdev->cmd_sync_work_list, - struct hci_cmd_sync_work_entry, list); - if (entry) { - list_del(&entry->list); - func = entry->func; - data = entry->data; - destroy = entry->destroy; - kfree(entry); - } else { - func = NULL; - data = NULL; - destroy = NULL; - } - mutex_unlock(&hdev->cmd_sync_work_lock); + /* Dequeue all entries and run them */ + while (1) { + struct hci_cmd_sync_work_entry *entry; - if (func) { - int err; + mutex_lock(&hdev->cmd_sync_work_lock); + entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, + struct hci_cmd_sync_work_entry, + list); + if (entry) + list_del(&entry->list); + mutex_unlock(&hdev->cmd_sync_work_lock); - hci_req_sync_lock(hdev); + if (!entry) + break; - err = func(hdev, data); + bt_dev_dbg(hdev, "entry %p", entry); - if (destroy) - destroy(hdev, data, err); + if (entry->func) { + int err; + + hci_req_sync_lock(hdev); + err = entry->func(hdev, entry->data); + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + hci_req_sync_unlock(hdev); + } - hci_req_sync_unlock(hdev); + kfree(entry); } } @@ -382,6 +379,9 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, { struct hci_cmd_sync_work_entry *entry; + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return -ENODEV; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -1637,7 +1637,7 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, * indicates that LL Privacy has been enabled and * HCI_OP_LE_SET_PRIVACY_MODE is supported. 
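+ * (params->flags is now a plain u32 bitmask rather than a bitmap, which is why the test below becomes a direct AND instead of the test_bit() helper.)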
*/ - if (!test_bit(HCI_CONN_FLAG_DEVICE_PRIVACY, params->flags)) + if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); @@ -1664,20 +1664,19 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, struct hci_cp_le_add_to_accept_list cp; int err; + /* During suspend, only wakeable devices can be in acceptlist */ + if (hdev->suspended && + !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) + return 0; + /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) return -ENOSPC; /* Accept list can not be used with RPAs */ if (!use_ll_privacy(hdev) && - hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) { + hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) return -EINVAL; - } - - /* During suspend, only wakeable devices can be in acceptlist */ - if (hdev->suspended && - !test_bit(HCI_CONN_FLAG_REMOTE_WAKEUP, params->flags)) - return 0; /* Attempt to program the device in the resolving list first to avoid * having to rollback in case it fails since the resolving list is @@ -1841,6 +1840,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; + u8 filter_policy; int err; /* Pause advertising if resolving list can be used as controllers are @@ -1927,6 +1927,8 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) err = -EINVAL; done: + filter_policy = err ? 0x00 : 0x01; + /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) @@ -1937,7 +1939,7 @@ done: hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ - return err ? 0x00 : 0x01; + return filter_policy; } /* Returns true if an le connection is in the scanning state */ @@ -2806,6 +2808,9 @@ static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; + if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + return 0; + memset(&cp, 0, sizeof(cp)); cp.flt_type = flt_type; @@ -2826,6 +2831,13 @@ static int hci_clear_event_filter_sync(struct hci_dev *hdev) if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED)) return 0; + /* In theory the state machine should not reach here unless + * a hci_set_event_filter_sync() call succeeds, but we do + * the check both for parity and as a future reminder. + */ + if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + return 0; + return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, BDADDR_ANY, 0x00); } @@ -3262,10 +3274,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ - /* If the controller supports LL Privacy feature, enable - * the corresponding event. + /* If the controller supports LL Privacy feature or LE Extended Adv, + * enable the corresponding event.
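+ * (use_enhanced_conn_complete() is assumed to check both capabilities, so the event is unmasked exactly when the controller can generate LE Enhanced Connection Complete.)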
*/ - if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter @@ -3812,6 +3824,30 @@ static int hci_init_sync(struct hci_dev *hdev) return 0; } +#define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc } + +static const struct { + unsigned long quirk; + const char *desc; +} hci_broken_table[] = { + HCI_QUIRK_BROKEN(LOCAL_COMMANDS, + "HCI Read Local Supported Commands not supported"), + HCI_QUIRK_BROKEN(STORED_LINK_KEY, + "HCI Delete Stored Link Key command is advertised, " + "but not supported."), + HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, + "HCI Read Default Erroneous Data Reporting command is " + "advertised, but not supported."), + HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, + "HCI Read Transmit Power Level command is advertised, " + "but not supported."), + HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL, + "HCI Set Event Filter command not supported."), + HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN, + "HCI Enhanced Setup Synchronous Connection command is " + "advertised, but not supported.") +}; + int hci_dev_open_sync(struct hci_dev *hdev) { int ret = 0; @@ -3873,12 +3909,19 @@ int hci_dev_open_sync(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_SETUP) || test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { bool invalid_bdaddr; + size_t i; hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); + for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { + if (test_bit(hci_broken_table[i].quirk, &hdev->quirks)) + bt_dev_warn(hdev, "%s", + hci_broken_table[i].desc); + } + /* The transport driver can set the quirk to mark the * BD_ADDR invalid before creating the HCI device or in * its setup callback. @@ -4106,9 +4149,9 @@ int hci_dev_close_sync(struct hci_dev *hdev) hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - + /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); + hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); @@ -4395,12 +4438,21 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, static int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { + int err; + switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: return hci_disconnect_sync(hdev, conn, reason); case BT_CONNECT: - return hci_connect_cancel_sync(hdev, conn); + err = hci_connect_cancel_sync(hdev, conn); + /* Cleanup hci_conn object if it cannot be cancelled as it + * likelly means the controller and host stack are out of sync. + */ + if (err) + hci_conn_failed(conn, err); + + return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); default: @@ -4422,7 +4474,7 @@ static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) return err; } - return err; + return 0; } /* This function perform power off HCI command sequence as follows: @@ -4825,11 +4877,17 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev) if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; + /* Some fake CSR controllers lock up after setting this type of + * filter, so avoid sending the request altogether. 
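+ * (The same HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL test short-circuits hci_set_event_filter_sync() and hci_clear_event_filter_sync() earlier in this file.)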
+ */ + if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + return 0; + /* Always clear event filter when starting */ hci_clear_event_filter_sync(hdev); list_for_each_entry(b, &hdev->accept_list, list) { - if (!test_bit(HCI_CONN_FLAG_REMOTE_WAKEUP, b->flags)) + if (!(b->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) continue; bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); @@ -4853,10 +4911,28 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev) return 0; } +/* This function disables scan (BR and LE) and mark it as paused */ +static int hci_pause_scan_sync(struct hci_dev *hdev) +{ + if (hdev->scanning_paused) + return 0; + + /* Disable page scan if enabled */ + if (test_bit(HCI_PSCAN, &hdev->flags)) + hci_write_scan_enable_sync(hdev, SCAN_DISABLED); + + hci_scan_disable_sync(hdev); + + hdev->scanning_paused = true; + + return 0; +} + /* This function performs the HCI suspend procedures in the follow order: * * Pause discovery (active scanning/inquiry) * Pause Directed Advertising/Advertising + * Pause Scanning (passive scanning in case discovery was not active) * Disconnect all connections * Set suspend_status to BT_SUSPEND_DISCONNECT if hdev cannot wakeup * otherwise: @@ -4882,15 +4958,11 @@ int hci_suspend_sync(struct hci_dev *hdev) /* Pause other advertisements */ hci_pause_advertising_sync(hdev); - /* Disable page scan if enabled */ - if (test_bit(HCI_PSCAN, &hdev->flags)) - hci_write_scan_enable_sync(hdev, SCAN_DISABLED); - /* Suspend monitor filters */ hci_suspend_monitor_sync(hdev); /* Prevent disconnects from causing scanning to be re-enabled */ - hdev->scanning_paused = true; + hci_pause_scan_sync(hdev); /* Soft disconnect everything (power off) */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); @@ -4961,6 +5033,22 @@ static void hci_resume_monitor_sync(struct hci_dev *hdev) } } +/* This function resume scan and reset paused flag */ +static int hci_resume_scan_sync(struct hci_dev *hdev) +{ + if (!hdev->scanning_paused) + return 0; + + hci_update_scan_sync(hdev); + + /* Reset passive scanning to normal */ + hci_update_passive_scan_sync(hdev); + + hdev->scanning_paused = false; + + return 0; +} + /* This function performs the HCI suspend procedures in the follow order: * * Restore event mask @@ -4983,10 +5071,9 @@ int hci_resume_sync(struct hci_dev *hdev) /* Clear any event filters and restore scan state */ hci_clear_event_filter_sync(hdev); - hci_update_scan_sync(hdev); - /* Reset passive scanning to normal */ - hci_update_passive_scan_sync(hdev); + /* Resume scanning */ + hci_resume_scan_sync(hdev); /* Resume monitor filters */ hci_resume_monitor_sync(hdev); @@ -5140,8 +5227,8 @@ static void set_ext_conn_params(struct hci_conn *conn, p->max_ce_len = cpu_to_le16(0x0000); } -int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, - u8 own_addr_type) +static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, + struct hci_conn *conn, u8 own_addr_type) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; @@ -5185,7 +5272,7 @@ int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + conn->conn_timeout, NULL); } int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -5270,9 +5357,18 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = 
cpu_to_le16(0x0000); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: + * + * If this event is unmasked and the HCI_LE_Connection_Complete event + * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is + * sent when a new connection has been created. + */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, - sizeof(cp), &cp, HCI_EV_LE_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + sizeof(cp), &cp, + use_enhanced_conn_complete(hdev) ? + HCI_EV_LE_ENHANCED_CONN_COMPLETE : + HCI_EV_LE_CONN_COMPLETE, + conn->conn_timeout, NULL); done: /* Re-enable advertising after the connection attempt is finished. */ diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 80848dfc01db..5940744a8cd8 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1305,7 +1305,7 @@ static int hidp_session_thread(void *arg) l2cap_unregister_user(session->conn, &session->user); hidp_session_put(session); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index e817ff0607a0..ae78490ecd3d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1436,6 +1436,7 @@ static void l2cap_ecred_connect(struct l2cap_chan *chan) l2cap_ecred_init(chan, 0); + memset(&data, 0, sizeof(data)); data.pdu.req.psm = chan->psm; data.pdu.req.mtu = cpu_to_le16(chan->imtu); data.pdu.req.mps = cpu_to_le16(chan->mps); @@ -1443,7 +1444,6 @@ static void l2cap_ecred_connect(struct l2cap_chan *chan) data.pdu.scid[0] = cpu_to_le16(chan->scid); chan->ident = l2cap_get_ident(conn); - data.pid = chan->ops->get_peer_pid(chan); data.count = 1; data.chan = chan; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 37087cf7dc5a..ae758ab1b558 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -42,7 +42,7 @@ #include "aosp.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 21 +#define MGMT_REVISION 22 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -174,6 +174,8 @@ static const u16 mgmt_events[] = { MGMT_EV_ADV_MONITOR_REMOVED, MGMT_EV_CONTROLLER_SUSPEND, MGMT_EV_CONTROLLER_RESUME, + MGMT_EV_ADV_MONITOR_DEVICE_FOUND, + MGMT_EV_ADV_MONITOR_DEVICE_LOST, }; static const u16 mgmt_untrusted_commands[] = { @@ -1218,7 +1220,13 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. 
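+ * (It may already have been removed and freed, e.g. by the cleanup that runs when the controller powers off or is unregistered, so compare against the pending list before dereferencing it.)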
*/ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + cp = cmd->param; bt_dev_dbg(hdev, "err %d", err); @@ -1242,7 +1250,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) @@ -1281,7 +1289,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_POWERED, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1290,6 +1298,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1383,6 +1394,10 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1402,7 +1417,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -1511,7 +1526,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1538,6 +1553,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd, mgmt_set_discoverable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1550,6 +1568,10 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1562,7 +1584,9 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + if (cmd) + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); } @@ -1634,7 +1658,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1654,6 +1678,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd, mgmt_set_connectable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1774,6 +1801,10 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) u8 enable = cp->val; bool changed; + /* Make sure cmd still outstanding. 
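+ * (Same guard as in mgmt_set_powered_complete() above.)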
*/ + if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + return; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -2267,7 +2298,9 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_cp_remove_uuid *cp = data; struct mgmt_pending_cmd *cmd; struct bt_uuid *match, *tmp; - u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + static const u8 bt_uuid_any[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; int err, found; bt_dev_dbg(hdev, "sock %p", sk); @@ -3321,6 +3354,9 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); + if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + return; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3493,6 +3529,9 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); + if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -3759,13 +3798,6 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_STATUS_BUSY); - goto unlock; - } - if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { @@ -3981,10 +4013,11 @@ static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev, memcpy(ev.uuid, rpa_resolution_uuid, 16); ev.flags = cpu_to_le32((enabled ? BIT(0) : 0) | BIT(1)); + // Do we need to be atomic with the conn_flags? if (enabled && privacy_mode_capable(hdev)) - set_bit(HCI_CONN_FLAG_DEVICE_PRIVACY, hdev->conn_flags); + hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY; else - clear_bit(HCI_CONN_FLAG_DEVICE_PRIVACY, hdev->conn_flags); + hdev->conn_flags &= ~HCI_CONN_FLAG_DEVICE_PRIVACY; return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev, &ev, sizeof(ev), @@ -4403,8 +4436,7 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - bitmap_to_arr32(&supported_flags, hdev->conn_flags, - __HCI_CONN_NUM_FLAGS); + supported_flags = hdev->conn_flags; memset(&rp, 0, sizeof(rp)); @@ -4415,8 +4447,7 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, if (!br_params) goto done; - bitmap_to_arr32(&current_flags, br_params->flags, - __HCI_CONN_NUM_FLAGS); + current_flags = br_params->flags; } else { params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); @@ -4424,8 +4455,7 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, if (!params) goto done; - bitmap_to_arr32(&current_flags, params->flags, - __HCI_CONN_NUM_FLAGS); + current_flags = params->flags; } bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -4470,8 +4500,8 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, &cp->addr.bdaddr, cp->addr.type, __le32_to_cpu(current_flags)); - bitmap_to_arr32(&supported_flags, hdev->conn_flags, - __HCI_CONN_NUM_FLAGS); + // We should take hci_dev_lock() early, I think..
conn_flags can change + supported_flags = hdev->conn_flags; if ((supported_flags | current_flags) != supported_flags) { bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", @@ -4487,7 +4517,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, cp->addr.type); if (br_params) { - bitmap_from_u64(br_params->flags, current_flags); + br_params->flags = current_flags; status = MGMT_STATUS_SUCCESS; } else { bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", @@ -4497,14 +4527,26 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (params) { - bitmap_from_u64(params->flags, current_flags); + /* Devices using RPAs can only be programmed in the + * acceptlist if LL Privacy has been enabled, otherwise they + * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP. + */ + if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && + !use_ll_privacy(hdev) && + hci_find_irk_by_addr(hdev, &params->addr, + params->addr_type)) { + bt_dev_warn(hdev, + "Cannot set wakeable for RPA"); + goto unlock; + } + + params->flags = current_flags; status = MGMT_STATUS_SUCCESS; /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY * has been set. */ - if (test_bit(HCI_CONN_FLAG_DEVICE_PRIVACY, - params->flags)) + if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) hci_update_passive_scan(hdev); } else { bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", @@ -4513,9 +4555,10 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } } -done: +unlock: hci_dev_unlock(hdev); +done: if (status == MGMT_STATUS_SUCCESS) device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); @@ -5036,12 +5079,6 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); - goto unlock; - } - cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0); if (!cmd) err = -ENOMEM; @@ -5261,11 +5298,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_discovery_set_state(hdev, err ?
DISCOVERY_STOPPED: DISCOVERY_FINDING); } @@ -5327,7 +5369,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, else hdev->discovery.limited = false; - cmd = mgmt_pending_new(sk, op, hdev, data, len); + cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; } @@ -5336,7 +5378,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5430,7 +5472,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_START_SERVICE_DISCOVERY, + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; } @@ -5463,7 +5505,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5495,11 +5537,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -5535,7 +5580,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - cmd = mgmt_pending_new(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; } @@ -5544,7 +5589,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd, stop_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto unlock; } @@ -7102,8 +7147,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); if (params) - bitmap_to_arr32(&current_flags, params->flags, - __HCI_CONN_NUM_FLAGS); + current_flags = params->flags; } err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); @@ -7112,8 +7156,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); - bitmap_to_arr32(&supported_flags, hdev->conn_flags, - __HCI_CONN_NUM_FLAGS); + supported_flags = hdev->conn_flags; device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); @@ -7474,6 +7517,9 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; + if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -7918,7 +7964,7 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, return false; /* Make sure that the data is correctly formatted.
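+ * Each AD element is length-prefixed: data[i] holds the element length, followed by that many bytes of type and value, so the loop steps by cur_len + 1; e.g. 02 01 06 is one complete Flags element (len 0x02, type 0x01, value 0x06).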
*/ - for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { + for (i = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; if (!cur_len) @@ -7969,11 +8015,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) static bool adv_busy(struct hci_dev *hdev) { - return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); + return pending_find(MGMT_OP_SET_LE, hdev); } static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance, @@ -8046,7 +8088,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, u32 flags; u8 status; u16 timeout, duration; - unsigned int prev_instance_cnt = hdev->adv_instance_cnt; + unsigned int prev_instance_cnt; u8 schedule_instance = 0; struct adv_info *next_instance; int err; @@ -8097,6 +8139,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } + prev_instance_cnt = hdev->adv_instance_cnt; + err = hci_add_adv_instance(hdev, cp->instance, flags, cp->adv_data_len, cp->data, cp->scan_rsp_len, @@ -8563,9 +8607,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; @@ -8601,7 +8643,6 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_get_adv_size_info *cp = data; struct mgmt_rp_get_adv_size_info rp; u32 flags, supported_flags; - int err; bt_dev_dbg(hdev, "sock %p", sk); @@ -8628,10 +8669,8 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true); rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false); - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); - - return err; + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); } static const struct hci_mgmt_handler mgmt_handlers[] = { @@ -9059,12 +9098,14 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u16 eir_len = 0; u32 flags = 0; + /* allocate buff for LE or BR/EDR adv */ if (conn->le_adv_data_len > 0) skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, - conn->le_adv_data_len); + sizeof(*ev) + conn->le_adv_data_len); else skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, - 2 + name_len + 5); + sizeof(*ev) + (name ? 
eir_precalc_len(name_len) : 0) + + eir_precalc_len(sizeof(conn->dev_class))); ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, &conn->dst); @@ -9083,18 +9124,12 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, skb_put_data(skb, conn->le_adv_data, conn->le_adv_data_len); eir_len = conn->le_adv_data_len; } else { - if (name_len > 0) { - eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, - name, name_len); - skb_put(skb, eir_len); - } + if (name) + eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); - if (memcmp(conn->dev_class, "\0\0\0", 3) != 0) { - eir_len = eir_append_data(ev->eir, eir_len, - EIR_CLASS_OF_DEV, - conn->dev_class, 3); - skb_put(skb, 5); - } + if (memcmp(conn->dev_class, "\0\0\0", sizeof(conn->dev_class))) + eir_len += eir_skb_put_data(skb, EIR_CLASS_OF_DEV, + conn->dev_class, sizeof(conn->dev_class)); } ev->eir_len = cpu_to_le16(eir_len); @@ -9589,12 +9624,120 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, return true; } +void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, + bdaddr_t *bdaddr, u8 addr_type) +{ + struct mgmt_ev_adv_monitor_device_lost ev; + + ev.monitor_handle = cpu_to_le16(handle); + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = addr_type; + + mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev), + NULL); +} + +static void mgmt_send_adv_monitor_device_found(struct hci_dev *hdev, + struct sk_buff *skb, + struct sock *skip_sk, + u16 handle) +{ + struct sk_buff *advmon_skb; + size_t advmon_skb_len; + __le16 *monitor_handle; + + if (!skb) + return; + + advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) - + sizeof(struct mgmt_ev_device_found)) + skb->len; + advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND, + advmon_skb_len); + if (!advmon_skb) + return; + + /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except + * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and + * store monitor_handle of the matched monitor. + */ + monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle)); + *monitor_handle = cpu_to_le16(handle); + skb_put_data(advmon_skb, skb->data, skb->len); + + mgmt_event_skb(advmon_skb, skip_sk); +} + +static void mgmt_adv_monitor_device_found(struct hci_dev *hdev, + bdaddr_t *bdaddr, bool report_device, + struct sk_buff *skb, + struct sock *skip_sk) +{ + struct monitored_device *dev, *tmp; + bool matched = false; + bool notified = false; + + /* We have received the Advertisement Report because: + * 1. the kernel has initiated active discovery + * 2. if not, we have pend_le_reports > 0 in which case we are doing + * passive scanning + * 3. if none of the above is true, we have one or more active + * Advertisement Monitor + * + * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND + * and report ONLY one advertisement per device for the matched Monitor + * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event. + * + * For case 3, since we are not active scanning and all advertisements + * received are due to a matched Advertisement Monitor, report all + * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event. 
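+ * (report_device is true for cases 1 and 2 and false for case 3.)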
+ */ + if (report_device && !hdev->advmon_pend_notify) { + mgmt_event_skb(skb, skip_sk); + return; + } + + hdev->advmon_pend_notify = false; + + list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { + if (!bacmp(&dev->bdaddr, bdaddr)) { + matched = true; + + if (!dev->notified) { + mgmt_send_adv_monitor_device_found(hdev, skb, + skip_sk, + dev->handle); + notified = true; + dev->notified = true; + } + } + + if (!dev->notified) + hdev->advmon_pend_notify = true; + } + + if (!report_device && + ((matched && !notified) || !msft_monitor_supported(hdev))) { + /* Handle 0 indicates that we are not active scanning and this + * is a subsequent advertisement report for an already matched + * Advertisement Monitor or the controller offloading support + * is not available. + */ + mgmt_send_adv_monitor_device_found(hdev, skb, skip_sk, 0); + } + + if (report_device) + mgmt_event_skb(skb, skip_sk); + else + kfree_skb(skb); +} + void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) { struct sk_buff *skb; struct mgmt_ev_device_found *ev; + bool report_device = hci_discovery_active(hdev); /* Don't send events for a non-kernel initiated discovery. With * LE one exception is if we have pend_le_reports > 0 in which @@ -9603,11 +9746,10 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, if (!hci_discovery_active(hdev)) { if (link_type == ACL_LINK) return; - if (link_type == LE_LINK && - list_empty(&hdev->pend_le_reports) && - !hci_is_adv_monitoring(hdev)) { + if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports)) + report_device = true; + else if (!hci_is_adv_monitoring(hdev)) return; - } } if (hdev->discovery.result_filtering) { @@ -9672,7 +9814,7 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); - mgmt_event_skb(skb, NULL); + mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL); } void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, @@ -9680,28 +9822,21 @@ void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, { struct sk_buff *skb; struct mgmt_ev_device_found *ev; - u16 eir_len; - u32 flags; + u16 eir_len = 0; + u32 flags = 0; - if (name_len) - skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, 2 + name_len); - else - skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, 0); + skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, + sizeof(*ev) + (name ? 
eir_precalc_len(name_len) : 0)); ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); ev->addr.type = link_to_bdaddr(link_type, addr_type); ev->rssi = rssi; - if (name) { - eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, name, - name_len); - flags = 0; - skb_put(skb, eir_len); - } else { - eir_len = 0; + if (name) + eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); + else flags = MGMT_DEV_FOUND_NAME_REQUEST_FAILED; - } ev->eir_len = cpu_to_le16(eir_len); ev->flags = cpu_to_le32(flags); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index edee60bbc7b4..b69cfed62088 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -77,11 +77,12 @@ int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, { struct hci_dev *hdev; struct mgmt_hdr *hdr; - int len = skb->len; + int len; if (!skb) return -EINVAL; + len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ @@ -296,7 +297,7 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, if (!cmd) return NULL; - list_add(&cmd->list, &hdev->mgmt_pending); + list_add_tail(&cmd->list, &hdev->mgmt_pending); return cmd; } diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 6a943634b31a..f43994523b1f 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -80,6 +80,14 @@ struct msft_rp_le_set_advertisement_filter_enable { __u8 sub_opcode; } __packed; +#define MSFT_EV_LE_MONITOR_DEVICE 0x02 +struct msft_ev_le_monitor_device { + __u8 addr_type; + bdaddr_t bdaddr; + __u8 monitor_handle; + __u8 monitor_state; +} __packed; + struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; @@ -204,6 +212,37 @@ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data return NULL; } +/* This function requires the caller holds hdev->lock */ +static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, + bdaddr_t *bdaddr, __u8 addr_type, + bool notify) +{ + struct monitored_device *dev, *tmp; + int count = 0; + + list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { + /* mgmt_handle == 0 indicates remove all devices, whereas, + * bdaddr == NULL indicates remove all devices matching the + * mgmt_handle. + */ + if ((!mgmt_handle || dev->handle == mgmt_handle) && + (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && + addr_type == dev->addr_type))) { + if (notify && dev->notified) { + mgmt_adv_monitor_device_lost(hdev, dev->handle, + &dev->bdaddr, + dev->addr_type); + } + + list_del(&dev->list); + kfree(dev); + count++; + } + } + + return count; +} + static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) @@ -291,9 +330,14 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. */ - if (monitor && !msft->suspending) + if (monitor && !msft->suspending) { hci_free_adv_monitor(hdev, monitor); + /* Clear any monitored devices by this Adv Monitor */ + msft_monitor_device_del(hdev, handle_data->mgmt_handle, + NULL, 0, false); + } + list_del(&handle_data->list); kfree(handle_data); } @@ -479,6 +523,16 @@ int msft_resume_sync(struct hci_dev *hdev) if (!msft || !msft_monitor_supported(hdev)) return 0; + hci_dev_lock(hdev); + + /* Clear already tracked devices on resume. Once the monitors are + * reregistered, devices in range will be found again after resume. 
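+ * (notify is passed as true below, so MGMT_EV_ADV_MONITOR_DEVICE_LOST is emitted for every device that had previously been reported as found.)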
+ */ + hdev->advmon_pend_notify = false; + msft_monitor_device_del(hdev, 0, NULL, 0, true); + + hci_dev_unlock(hdev); + msft->resuming = true; while (1) { @@ -557,6 +611,14 @@ void msft_do_close(struct hci_dev *hdev) list_del(&handle_data->list); kfree(handle_data); } + + hci_dev_lock(hdev); + + /* Clear any devices that are being monitored and notify device lost */ + hdev->advmon_pend_notify = false; + msft_monitor_device_del(hdev, 0, NULL, 0, true); + + hci_dev_unlock(hdev); } void msft_register(struct hci_dev *hdev) @@ -590,10 +652,101 @@ void msft_unregister(struct hci_dev *hdev) kfree(msft); } +/* This function requires the caller holds hdev->lock */ +static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, + __u8 addr_type, __u16 mgmt_handle) +{ + struct monitored_device *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + bt_dev_err(hdev, "MSFT vendor event %u: no memory", + MSFT_EV_LE_MONITOR_DEVICE); + return; + } + + bacpy(&dev->bdaddr, bdaddr); + dev->addr_type = addr_type; + dev->handle = mgmt_handle; + dev->notified = false; + + INIT_LIST_HEAD(&dev->list); + list_add(&dev->list, &hdev->monitored_devices); + hdev->advmon_pend_notify = true; +} + +/* This function requires the caller holds hdev->lock */ +static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr, + __u8 addr_type, __u16 mgmt_handle) +{ + if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type, + true)) { + bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list", + MSFT_EV_LE_MONITOR_DEVICE, bdaddr); + } +} + +static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, + u8 ev, size_t len) +{ + void *data; + + data = skb_pull_data(skb, len); + if (!data) + bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev); + + return data; +} + +/* This function requires the caller holds hdev->lock */ +static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct msft_ev_le_monitor_device *ev; + struct msft_monitor_advertisement_handle_data *handle_data; + u8 addr_type; + + ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); + if (!ev) + return; + + bt_dev_dbg(hdev, + "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR", + MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle, + ev->monitor_state, &ev->bdaddr); + + handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); + if (!handle_data) + return; + + switch (ev->addr_type) { + case ADDR_LE_DEV_PUBLIC: + addr_type = BDADDR_LE_PUBLIC; + break; + + case ADDR_LE_DEV_RANDOM: + addr_type = BDADDR_LE_RANDOM; + break; + + default: + bt_dev_err(hdev, + "MSFT vendor event 0x%02x: unknown addr type 0x%02x", + MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type); + return; + } + + if (ev->monitor_state) + msft_device_found(hdev, &ev->bdaddr, addr_type, + handle_data->mgmt_handle); + else + msft_device_lost(hdev, &ev->bdaddr, addr_type, + handle_data->mgmt_handle); +} + void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct msft_data *msft = hdev->msft_data; - u8 event; + u8 *evt_prefix; + u8 *evt; if (!msft) return; @@ -602,13 +755,12 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) * matches, and otherwise just return. 
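+ * (msft_skb_pull() below also logs a malformed-event error whenever the skb is shorter than the expected prefix.)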
*/ if (msft->evt_prefix_len > 0) { - if (skb->len < msft->evt_prefix_len) + evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len); + if (!evt_prefix) return; - if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len)) + if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len)) return; - - skb_pull(skb, msft->evt_prefix_len); } /* Every event starts at least with an event code and the rest of @@ -617,10 +769,23 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) if (skb->len < 1) return; - event = *skb->data; - skb_pull(skb, 1); + evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt)); + if (!evt) + return; + + hci_dev_lock(hdev); - bt_dev_dbg(hdev, "MSFT vendor event %u", event); + switch (*evt) { + case MSFT_EV_LE_MONITOR_DEVICE: + msft_monitor_device_evt(hdev, skb); + break; + + default: + bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt); + break; + } + + hci_dev_unlock(hdev); } __u64 msft_get_features(struct hci_dev *hdev) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 8eabf41b2993..1111da4e2f2b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -574,19 +574,24 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen addr->sa_family != AF_BLUETOOTH) return -EINVAL; - if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) - return -EBADFD; + lock_sock(sk); + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } - if (sk->sk_type != SOCK_SEQPACKET) - return -EINVAL; + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EINVAL; + goto done; + } hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) - return -EHOSTUNREACH; + if (!hdev) { + err = -EHOSTUNREACH; + goto done; + } hci_dev_lock(hdev); - lock_sock(sk); - /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); @@ -885,7 +890,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, err = -EBADFD; break; } - if (enhanced_sco_capable(hdev) && + if (enhanced_sync_conn_capable(hdev) && voice.setting == BT_VOICE_TRANSPARENT) sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; hci_dev_put(hdev); diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index fbc896323bec..e78dadfc5829 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -72,13 +72,16 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +extern const struct bpf_link_ops bpf_struct_ops_link_lops; + int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; int prog_ret; @@ -92,8 +95,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) { + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) { err = -ENOMEM; goto out; } @@ -105,8 +108,17 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } set_vm_flush_reset_perms(image); + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out; + } + /* prog doesn't take the 
ownership of the reference from caller */ + bpf_prog_inc(prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); + op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], image, image + PAGE_SIZE); if (err < 0) @@ -124,7 +136,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, out: kfree(args); bpf_jit_free_exec(image); - kfree(tprogs); + if (link) + bpf_link_put(&link->link); + kfree(tlinks); return err; } @@ -145,7 +159,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id) + u32 *next_btf_id, + enum bpf_type_flag *flag) { const struct btf_type *state; s32 type_id; @@ -162,7 +177,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } - err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); if (err < 0) return err; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 46dd95755967..56f059b3c242 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -5,6 +5,7 @@ #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> +#include <linux/init.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> @@ -14,6 +15,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> +#include <net/page_pool.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> @@ -52,10 +54,11 @@ static void bpf_test_timer_leave(struct bpf_test_timer *t) rcu_read_unlock(); } -static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) +static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, + u32 repeat, int *err, u32 *duration) __must_hold(rcu) { - t->i++; + t->i += iterations; if (t->i >= repeat) { /* We're done. */ t->time_spent += ktime_get_ns() - t->time_start; @@ -87,6 +90,285 @@ reset: return false; } +/* We put this struct at the head of each page with a context and frame + * initialised when the page is allocated, so we don't have to do this on each + * repetition of the test run. 
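+ * (orig_ctx keeps a pristine copy of the initialised context so that reset_ctx() can cheaply undo any bounds changes the program made on a previous repetition.)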
+ */ +struct xdp_page_head { + struct xdp_buff orig_ctx; + struct xdp_buff ctx; + struct xdp_frame frm; + u8 data[]; +}; + +struct xdp_test_data { + struct xdp_buff *orig_ctx; + struct xdp_rxq_info rxq; + struct net_device *dev; + struct page_pool *pp; + struct xdp_frame **frames; + struct sk_buff **skbs; + struct xdp_mem_info mem; + u32 batch_size; + u32 frame_cnt; +}; + +#define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) +#define TEST_XDP_MAX_BATCH 256 + +static void xdp_test_run_init_page(struct page *page, void *arg) +{ + struct xdp_page_head *head = phys_to_virt(page_to_phys(page)); + struct xdp_buff *new_ctx, *orig_ctx; + u32 headroom = XDP_PACKET_HEADROOM; + struct xdp_test_data *xdp = arg; + size_t frm_len, meta_len; + struct xdp_frame *frm; + void *data; + + orig_ctx = xdp->orig_ctx; + frm_len = orig_ctx->data_end - orig_ctx->data_meta; + meta_len = orig_ctx->data - orig_ctx->data_meta; + headroom -= meta_len; + + new_ctx = &head->ctx; + frm = &head->frm; + data = &head->data; + memcpy(data + headroom, orig_ctx->data_meta, frm_len); + + xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); + xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); + new_ctx->data = new_ctx->data_meta + meta_len; + + xdp_update_frame_from_buff(new_ctx, frm); + frm->mem = new_ctx->rxq->mem; + + memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); +} + +static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) +{ + struct page_pool *pp; + int err = -ENOMEM; + struct page_pool_params pp_params = { + .order = 0, + .flags = 0, + .pool_size = xdp->batch_size, + .nid = NUMA_NO_NODE, + .init_callback = xdp_test_run_init_page, + .init_arg = xdp, + }; + + xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); + if (!xdp->frames) + return -ENOMEM; + + xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); + if (!xdp->skbs) + goto err_skbs; + + pp = page_pool_create(&pp_params); + if (IS_ERR(pp)) { + err = PTR_ERR(pp); + goto err_pp; + } + + /* will copy 'mem.id' into pp->xdp_mem_id */ + err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); + if (err) + goto err_mmodel; + + xdp->pp = pp; + + /* We create a 'fake' RXQ referencing the original dev, but with an + * xdp_mem_info pointing to our page_pool + */ + xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); + xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; + xdp->rxq.mem.id = pp->xdp_mem_id; + xdp->dev = orig_ctx->rxq->dev; + xdp->orig_ctx = orig_ctx; + + return 0; + +err_mmodel: + page_pool_destroy(pp); +err_pp: + kvfree(xdp->skbs); +err_skbs: + kvfree(xdp->frames); + return err; +} + +static void xdp_test_run_teardown(struct xdp_test_data *xdp) +{ + xdp_unreg_mem_model(&xdp->mem); + page_pool_destroy(xdp->pp); + kfree(xdp->frames); + kfree(xdp->skbs); +} + +static bool ctx_was_changed(struct xdp_page_head *head) +{ + return head->orig_ctx.data != head->ctx.data || + head->orig_ctx.data_meta != head->ctx.data_meta || + head->orig_ctx.data_end != head->ctx.data_end; +} + +static void reset_ctx(struct xdp_page_head *head) +{ + if (likely(!ctx_was_changed(head))) + return; + + head->ctx.data = head->orig_ctx.data; + head->ctx.data_meta = head->orig_ctx.data_meta; + head->ctx.data_end = head->orig_ctx.data_end; + xdp_update_frame_from_buff(&head->ctx, &head->frm); +} + +static int xdp_recv_frames(struct xdp_frame **frames, int nframes, + struct sk_buff **skbs, + struct net_device *dev) +{ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n; + LIST_HEAD(list); + + n = 
kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, (void **)skbs); + if (unlikely(n == 0)) { + for (i = 0; i < nframes; i++) + xdp_return_frame(frames[i]); + return -ENOMEM; + } + + for (i = 0; i < nframes; i++) { + struct xdp_frame *xdpf = frames[i]; + struct sk_buff *skb = skbs[i]; + + skb = __xdp_build_skb_from_frame(xdpf, skb, dev); + if (!skb) { + xdp_return_frame(xdpf); + continue; + } + + list_add_tail(&skb->list, &list); + } + netif_receive_skb_list(&list); + + return 0; +} + +static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, + u32 repeat) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + int err = 0, act, ret, i, nframes = 0, batch_sz; + struct xdp_frame **frames = xdp->frames; + struct xdp_page_head *head; + struct xdp_frame *frm; + bool redirect = false; + struct xdp_buff *ctx; + struct page *page; + + batch_sz = min_t(u32, repeat, xdp->batch_size); + + local_bh_disable(); + xdp_set_return_frame_no_direct(); + + for (i = 0; i < batch_sz; i++) { + page = page_pool_dev_alloc_pages(xdp->pp); + if (!page) { + err = -ENOMEM; + goto out; + } + + head = phys_to_virt(page_to_phys(page)); + reset_ctx(head); + ctx = &head->ctx; + frm = &head->frm; + xdp->frame_cnt++; + + act = bpf_prog_run_xdp(prog, ctx); + + /* if program changed pkt bounds we need to update the xdp_frame */ + if (unlikely(ctx_was_changed(head))) { + ret = xdp_update_frame_from_buff(ctx, frm); + if (ret) { + xdp_return_buff(ctx); + continue; + } + } + + switch (act) { + case XDP_TX: + /* we can't do a real XDP_TX since we're not in the + * driver, so turn it into a REDIRECT back to the same + * index + */ + ri->tgt_index = xdp->dev->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + fallthrough; + case XDP_REDIRECT: + redirect = true; + ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); + if (ret) + xdp_return_buff(ctx); + break; + case XDP_PASS: + frames[nframes++] = frm; + break; + default: + bpf_warn_invalid_xdp_action(NULL, prog, act); + fallthrough; + case XDP_DROP: + xdp_return_buff(ctx); + break; + } + } + +out: + if (redirect) + xdp_do_flush(); + if (nframes) { + ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); + if (ret) + err = ret; + } + + xdp_clear_return_frame_no_direct(); + local_bh_enable(); + return err; +} + +static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, + u32 repeat, u32 batch_size, u32 *time) + +{ + struct xdp_test_data xdp = { .batch_size = batch_size }; + struct bpf_test_timer t = { .mode = NO_MIGRATE }; + int ret; + + if (!repeat) + repeat = 1; + + ret = xdp_test_run_setup(&xdp, ctx); + if (ret) + return ret; + + bpf_test_timer_enter(&t); + do { + xdp.frame_cnt = 0; + ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); + if (unlikely(ret < 0)) + break; + } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); + bpf_test_timer_leave(&t); + + xdp_test_run_teardown(&xdp); + return ret; +} + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { @@ -118,7 +400,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); - } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -130,7 +412,8 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, static int bpf_test_finish(const 
union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, - u32 size, u32 retval, u32 duration) + struct skb_shared_info *sinfo, u32 size, + u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; @@ -145,8 +428,42 @@ static int bpf_test_finish(const union bpf_attr *kattr, err = -ENOSPC; } - if (data_out && copy_to_user(data_out, data, copy_size)) - goto out; + if (data_out) { + int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; + + if (len < 0) { + err = -ENOSPC; + goto out; + } + + if (copy_to_user(data_out, data, len)) + goto out; + + if (sinfo) { + int i, offset = len; + u32 data_len; + + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + if (offset >= copy_size) { + err = -ENOSPC; + break; + } + + data_len = min_t(u32, copy_size - offset, + skb_frag_size(frag)); + + if (copy_to_user(data_out + offset, + skb_frag_address(frag), + data_len)) + goto out; + + offset += data_len; + } + } + } + if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) @@ -165,12 +482,14 @@ out: * future. */ __diag_push(); -__diag_ignore(GCC, 8, "-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); int noinline bpf_fentry_test1(int a) { return a + 1; } +EXPORT_SYMBOL_GPL(bpf_fentry_test1); +ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO); int noinline bpf_fentry_test2(int a, u64 b) { @@ -232,28 +551,197 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) return sk; } +struct prog_test_member1 { + int a; +}; + +struct prog_test_member { + struct prog_test_member1 m; + int c; +}; + +struct prog_test_ref_kfunc { + int a; + int b; + struct prog_test_member memb; + struct prog_test_ref_kfunc *next; + refcount_t cnt; +}; + +static struct prog_test_ref_kfunc prog_test_struct = { + .a = 42, + .b = 108, + .next = &prog_test_struct, + .cnt = REFCOUNT_INIT(1), +}; + +noinline struct prog_test_ref_kfunc * +bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) +{ + refcount_inc(&prog_test_struct.cnt); + return &prog_test_struct; +} + +noinline struct prog_test_member * +bpf_kfunc_call_memb_acquire(void) +{ + WARN_ON_ONCE(1); + return NULL; +} + +noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) +{ + if (!p) + return; + + refcount_dec(&p->cnt); +} + +noinline void bpf_kfunc_call_memb_release(struct prog_test_member *p) +{ +} + +noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) +{ + WARN_ON_ONCE(1); +} + +noinline struct prog_test_ref_kfunc * +bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b) +{ + struct prog_test_ref_kfunc *p = READ_ONCE(*pp); + + if (!p) + return NULL; + refcount_inc(&p->cnt); + return p; +} + +struct prog_test_pass1 { + int x0; + struct { + int x1; + struct { + int x2; + struct { + int x3; + }; + }; + }; +}; + +struct prog_test_pass2 { + int len; + short arr1[4]; + struct { + char arr2[4]; + unsigned long arr3[8]; + } x; +}; + +struct prog_test_fail1 { + void *p; + int x; +}; + +struct prog_test_fail2 { + int x8; + struct prog_test_pass1 x; +}; + +struct prog_test_fail3 { + int len; + char arr1[2]; + char arr2[]; +}; + +noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) +{ +} + +noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) +{ +} + +noinline void 
bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) +{ +} + +noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p) +{ +} + +noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p) +{ +} + +noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p) +{ +} + +noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz) +{ +} + +noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len) +{ +} + +noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) +{ +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); -BTF_SET_START(test_sk_kfunc_ids) +BTF_SET_START(test_sk_check_kfunc_ids) BTF_ID(func, bpf_kfunc_call_test1) BTF_ID(func, bpf_kfunc_call_test2) BTF_ID(func, bpf_kfunc_call_test3) -BTF_SET_END(test_sk_kfunc_ids) - -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner) -{ - if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id)) - return true; - return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner); -} - -static void *bpf_test_init(const union bpf_attr *kattr, u32 size, - u32 headroom, u32 tailroom) +BTF_ID(func, bpf_kfunc_call_test_acquire) +BTF_ID(func, bpf_kfunc_call_memb_acquire) +BTF_ID(func, bpf_kfunc_call_test_release) +BTF_ID(func, bpf_kfunc_call_memb_release) +BTF_ID(func, bpf_kfunc_call_memb1_release) +BTF_ID(func, bpf_kfunc_call_test_kptr_get) +BTF_ID(func, bpf_kfunc_call_test_pass_ctx) +BTF_ID(func, bpf_kfunc_call_test_pass1) +BTF_ID(func, bpf_kfunc_call_test_pass2) +BTF_ID(func, bpf_kfunc_call_test_fail1) +BTF_ID(func, bpf_kfunc_call_test_fail2) +BTF_ID(func, bpf_kfunc_call_test_fail3) +BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1) +BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1) +BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2) +BTF_SET_END(test_sk_check_kfunc_ids) + +BTF_SET_START(test_sk_acquire_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test_acquire) +BTF_ID(func, bpf_kfunc_call_memb_acquire) +BTF_ID(func, bpf_kfunc_call_test_kptr_get) +BTF_SET_END(test_sk_acquire_kfunc_ids) + +BTF_SET_START(test_sk_release_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test_release) +BTF_ID(func, bpf_kfunc_call_memb_release) +BTF_ID(func, bpf_kfunc_call_memb1_release) +BTF_SET_END(test_sk_release_kfunc_ids) + +BTF_SET_START(test_sk_ret_null_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test_acquire) +BTF_ID(func, bpf_kfunc_call_memb_acquire) +BTF_ID(func, bpf_kfunc_call_test_kptr_get) +BTF_SET_END(test_sk_ret_null_kfunc_ids) + +BTF_SET_START(test_sk_kptr_acquire_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test_kptr_get) +BTF_SET_END(test_sk_kptr_acquire_kfunc_ids) + +static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, + u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); - u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) @@ -283,7 +771,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int b = 2, err = -EFAULT; u32 retval = 0; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch (prog->expected_attach_type) { @@ -347,7 +835,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || - kattr->test.repeat) + kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || @@ 
-524,7 +1012,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || - __skb->wire_len > GSO_MAX_SIZE) + __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } @@ -578,10 +1066,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; - data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, + data = bpf_test_init(kattr, kattr->test.data_size_in, + size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); @@ -683,7 +1172,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); - ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, + duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); @@ -757,22 +1247,38 @@ static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - u32 headroom = XDP_PACKET_HEADROOM; + u32 batch_size = kattr->test.batch_size; + u32 retval = 0, duration, max_data_sz; u32 size = kattr->test.data_size_in; + u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; + struct skb_shared_info *sinfo; struct xdp_buff xdp = {}; - u32 retval, duration; + int i, ret = -EINVAL; struct xdp_md *ctx; - u32 max_data_sz; void *data; - int ret = -EINVAL; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; + if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) + return -EINVAL; + + if (do_live) { + if (!batch_size) + batch_size = NAPI_POLL_WEIGHT; + else if (batch_size > TEST_XDP_MAX_BATCH) + return -E2BIG; + + headroom += sizeof(struct xdp_page_head); + } else if (batch_size) { + return -EINVAL; + } + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -781,33 +1287,81 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end != size || ctx->data > ctx->data_end || - unlikely(xdp_metalen_invalid(ctx->data))) + unlikely(xdp_metalen_invalid(ctx->data)) || + (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; } - /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) { + /* disallow live data mode for jumbo frames */ + if (do_live) + goto free_ctx; + size = max_data_sz; + } - data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); + data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - xdp_init_buff(&xdp, headroom + 
max_data_sz + tailroom, - &rxqueue->xdp_rxq); + rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; + xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); + sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; + if (unlikely(kattr->test.data_size_in > size)) { + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + + while (size < kattr->test.data_size_in) { + struct page *page; + skb_frag_t *frag; + u32 data_len; + + if (sinfo->nr_frags == MAX_SKB_FRAGS) { + ret = -ENOMEM; + goto out; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + frag = &sinfo->frags[sinfo->nr_frags++]; + __skb_frag_set_page(frag, page); + + data_len = min_t(u32, kattr->test.data_size_in - size, + PAGE_SIZE); + skb_frag_size_set(frag, data_len); + + if (copy_from_user(page_address(page), data_in + size, + data_len)) { + ret = -EFAULT; + goto out; + } + sinfo->xdp_frags_size += data_len; + size += data_len; + } + xdp_buff_set_frags_flag(&xdp); + } + if (repeat > 1) bpf_prog_change_xdp(NULL, prog); - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + + if (do_live) + ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); + else + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. @@ -816,12 +1370,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ret) goto out; - if (xdp.data_meta != data + headroom || - xdp.data_end != xdp.data_meta + size) - size = xdp.data_end - xdp.data_meta; - - ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval, - duration); + size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, + retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct xdp_md)); @@ -830,6 +1381,8 @@ out: if (repeat > 1) bpf_prog_change_xdp(prog, NULL); free_data: + for (i = 0; i < sinfo->nr_frags; i++) + __free_page(skb_frag_page(&sinfo->frags[i])); kfree(data); free_ctx: kfree(ctx); @@ -870,13 +1423,13 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; - data = bpf_test_init(kattr, size, 0, 0); + data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -905,14 +1458,14 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); - } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; - ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), - retval, duration); + ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, + sizeof(flow_keys), retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); @@ -937,7 +1490,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat if (prog->type != 
BPF_PROG_TYPE_SK_LOOKUP) return -EINVAL; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || @@ -960,7 +1513,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; - if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } @@ -968,7 +1521,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; - ctx.sport = (__force __be16)user_ctx->remote_port; + ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: @@ -1000,7 +1553,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat do { ctx.selected_sk = NULL; retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); - } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) @@ -1016,7 +1569,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } - ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); @@ -1039,7 +1592,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || - kattr->test.repeat || kattr->test.flags) + kattr->test.repeat || kattr->test.flags || + kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || @@ -1067,3 +1621,39 @@ out: kfree(ctx); return err; } + +static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &test_sk_check_kfunc_ids, + .acquire_set = &test_sk_acquire_kfunc_ids, + .release_set = &test_sk_release_kfunc_ids, + .ret_null_set = &test_sk_ret_null_kfunc_ids, + .kptr_acquire_set = &test_sk_kptr_acquire_kfunc_ids +}; + +BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) +BTF_ID(struct, prog_test_ref_kfunc) +BTF_ID(func, bpf_kfunc_call_test_release) +BTF_ID(struct, prog_test_member) +BTF_ID(func, bpf_kfunc_call_memb_release) + +static int __init bpf_prog_test_run_init(void) +{ + const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = { + { + .btf_id = bpf_prog_test_dtor_kfunc_ids[0], + .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1] + }, + { + .btf_id = bpf_prog_test_dtor_kfunc_ids[2], + .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3], + }, + }; + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, + ARRAY_SIZE(bpf_prog_test_dtor_kfunc), + THIS_MODULE); +} +late_initcall(bpf_prog_test_run_init); diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 51a941b56ec3..422ec6e7ccff 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -70,7 +70,7 @@ static int bpfilter_process_sockopt(struct sock *sk, int optname, 
.addr = (uintptr_t)optval.user, .len = optlen, }; - if (uaccess_kernel() || sockptr_is_kernel(optval)) { + if (sockptr_is_kernel(optval)) { pr_err("kernel access not supported\n"); return -EFAULT; } diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 7fb9a021873b..24bd1c0a9a5a 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o br_multicast_eht.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o br_mst.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1fac72cc617f..96e91d69a9a8 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -265,6 +265,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; + case BR_BOOLOPT_MST_ENABLE: + err = br_mst_set_enabled(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -281,6 +284,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); + case BR_BOOLOPT_MST_ENABLE: + return br_opt_get(br, BROPT_MST_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -342,23 +347,26 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit(struct net *net) +static void __net_exit br_net_exit_batch(struct list_head *net_list) { struct net_device *dev; + struct net *net; LIST_HEAD(list); rtnl_lock(); - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, &list); + + list_for_each_entry(net, net_list, exit_list) + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, &list); unregister_netdevice_many(&list); - rtnl_unlock(); + rtnl_unlock(); } static struct pernet_operations br_net_ops = { - .exit = br_net_exit, + .exit_batch = br_net_exit_batch, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 3db1def4437b..e5e48c6e35d7 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -84,7 +84,7 @@ static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_HOST; - netif_rx_ni(skb); + netif_rx(skb); } } @@ -364,7 +364,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; - netif_rx_ni(reply); + netif_rx(reply); } } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8d6bab244c4a..58a4f70e01e3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -465,6 +465,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fix_features = br_fix_features, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, + .ndo_fdb_del_bulk = br_fdb_delete_bulk, .ndo_fdb_dump = br_fdb_dump, .ndo_fdb_get = br_fdb_get, .ndo_bridge_getlink = br_getlink, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 6ccda68bd473..e7f4fccb6adb 100644 --- a/net/bridge/br_fdb.c 
+++ b/net/bridge/br_fdb.c @@ -558,18 +558,161 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } -/* Completely flush all dynamic entries in forwarding database.*/ -void br_fdb_flush(struct net_bridge *br) +static bool __fdb_flush_matches(const struct net_bridge *br, + const struct net_bridge_fdb_entry *f, + const struct net_bridge_fdb_flush_desc *desc) +{ + const struct net_bridge_port *dst = READ_ONCE(f->dst); + int port_ifidx = dst ? dst->dev->ifindex : br->dev->ifindex; + + if (desc->vlan_id && desc->vlan_id != f->key.vlan_id) + return false; + if (desc->port_ifindex && desc->port_ifindex != port_ifidx) + return false; + if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) + return false; + + return true; +} + +/* Flush forwarding database entries matching the description */ +void br_fdb_flush(struct net_bridge *br, + const struct net_bridge_fdb_flush_desc *desc) { struct net_bridge_fdb_entry *f; - struct hlist_node *tmp; - spin_lock_bh(&br->hash_lock); - hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { - if (!test_bit(BR_FDB_STATIC, &f->flags)) + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + if (!__fdb_flush_matches(br, f, desc)) + continue; + + spin_lock_bh(&br->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); + spin_unlock_bh(&br->hash_lock); } - spin_unlock_bh(&br->hash_lock); + rcu_read_unlock(); +} + +static unsigned long __ndm_state_to_fdb_flags(u16 ndm_state) +{ + unsigned long flags = 0; + + if (ndm_state & NUD_PERMANENT) + __set_bit(BR_FDB_LOCAL, &flags); + if (ndm_state & NUD_NOARP) + __set_bit(BR_FDB_STATIC, &flags); + + return flags; +} + +static unsigned long __ndm_flags_to_fdb_flags(u8 ndm_flags) +{ + unsigned long flags = 0; + + if (ndm_flags & NTF_USE) + __set_bit(BR_FDB_ADDED_BY_USER, &flags); + if (ndm_flags & NTF_EXT_LEARNED) + __set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &flags); + if (ndm_flags & NTF_OFFLOADED) + __set_bit(BR_FDB_OFFLOADED, &flags); + if (ndm_flags & NTF_STICKY) + __set_bit(BR_FDB_STICKY, &flags); + + return flags; +} + +static int __fdb_flush_validate_ifindex(const struct net_bridge *br, + int ifindex, + struct netlink_ext_ack *extack) +{ + const struct net_device *dev; + + dev = __dev_get_by_index(dev_net(br->dev), ifindex); + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unknown flush device ifindex"); + return -ENODEV; + } + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Flush device is not a bridge or bridge port"); + return -EINVAL; + } + if (netif_is_bridge_master(dev) && dev != br->dev) { + NL_SET_ERR_MSG_MOD(extack, + "Flush bridge device does not match target bridge device"); + return -EINVAL; + } + if (netif_is_bridge_port(dev)) { + struct net_bridge_port *p = br_port_get_rtnl(dev); + + if (p->br != br) { + NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); + return -EINVAL; + } + } + + return 0; +} + +int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, u16 vid, + struct netlink_ext_ack *extack) +{ + u8 ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS; + struct net_bridge_fdb_flush_desc desc = { .vlan_id = vid }; + struct net_bridge_port *p = NULL; + struct net_bridge *br; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge port"); + return -EINVAL; + } + br = p->br; + } + + if 
(ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) { + NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); + return -EINVAL; + } + if (ndm->ndm_state & ~FDB_FLUSH_ALLOWED_NDM_STATES) { + NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); + return -EINVAL; + } + + desc.flags |= __ndm_state_to_fdb_flags(ndm->ndm_state); + desc.flags |= __ndm_flags_to_fdb_flags(ndm_flags); + if (tb[NDA_NDM_STATE_MASK]) { + u16 ndm_state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); + + desc.flags_mask |= __ndm_state_to_fdb_flags(ndm_state_mask); + } + if (tb[NDA_NDM_FLAGS_MASK]) { + u8 ndm_flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); + + desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask); + } + if (tb[NDA_IFINDEX]) { + int err, ifidx = nla_get_s32(tb[NDA_IFINDEX]); + + err = __fdb_flush_validate_ifindex(br, ifidx, extack); + if (err) + return err; + desc.port_ifindex = ifidx; + } else if (p) { + /* flush was invoked with port device and NTF_MASTER */ + desc.port_ifindex = p->dev->ifindex; + } + + br_debug(br, "flushing port ifindex: %d vlan id: %u flags: 0x%lx flags mask: 0x%lx\n", + desc.port_ifindex, desc.vlan_id, desc.flags, desc.flags_mask); + + br_fdb_flush(br, &desc); + + return 0; } /* Flush all entries referring to a specific port. @@ -1110,7 +1253,8 @@ static int __br_fdb_delete(struct net_bridge *br, /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index ec646656dbf1..02bb620d3b8d 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->tstamp = 0; + skb_clear_tstamp(skb); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a52ad81596b7..47fcbade7389 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -517,16 +517,16 @@ void br_mtu_auto_adjust(struct net_bridge *br) static void br_set_gso_limits(struct net_bridge *br) { - unsigned int gso_max_size = GSO_MAX_SIZE; - u16 gso_max_segs = GSO_MAX_SEGS; + unsigned int tso_max_size = TSO_MAX_SIZE; const struct net_bridge_port *p; + u16 tso_max_segs = TSO_MAX_SEGS; list_for_each_entry(p, &br->port_list, list) { - gso_max_size = min(gso_max_size, p->dev->gso_max_size); - gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); + tso_max_size = min(tso_max_size, p->dev->tso_max_size); + tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); } - netif_set_gso_max_size(br->dev, gso_max_size); - netif_set_gso_max_segs(br->dev, gso_max_segs); + netif_set_tso_max_size(br->dev, tso_max_size); + netif_set_tso_max_segs(br->dev, tso_max_segs); } /* @@ -615,6 +615,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); + dev_put_track(dev, &p->dev_tracker); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } @@ -724,10 +725,10 @@ err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); + dev_put_track(dev, &p->dev_tracker); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: - dev_put(dev); return err; } diff --git 
a/net/bridge/br_input.c b/net/bridge/br_input.c index b50382f957c1..68b3e850bcb9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -39,6 +39,13 @@ static int br_pass_frame_up(struct sk_buff *skb) dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); + + /* Reset the offload_fwd_mark because there could be a stacked + * bridge above, and it should not think this bridge is doing + * that bridge's work forwarding out its ports. + */ + br_switchdev_frame_unmark(skb); + /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. @@ -78,20 +85,38 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb u16 vid = 0; u8 state; - if (!p || p->state == BR_STATE_DISABLED) + if (!p) goto drop; + br = p->br; + + if (br_mst_is_enabled(br)) { + state = BR_STATE_FORWARDING; + } else { + if (p->state == BR_STATE_DISABLED) + goto drop; + + state = p->state; + } + brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; - state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; + if (p->flags & BR_PORT_LOCKED) { + struct net_bridge_fdb_entry *fdb_src = + br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); + + if (!fdb_src || READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + goto drop; + } + nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ - br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); @@ -361,9 +386,13 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; forward: + if (br_mst_is_enabled(p->br)) + goto defer_stp_filtering; + switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: +defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 4556d913955b..fdcc641fc89a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -251,14 +251,16 @@ static int __mdb_fill_info(struct sk_buff *skb, __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; - if (mp->addr.proto == htons(ETH_P_IP)) + if (mp->addr.proto == htons(ETH_P_IP)) { e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) - else if (mp->addr.proto == htons(ETH_P_IPV6)) + } else if (mp->addr.proto == htons(ETH_P_IPV6)) { e.addr.u.ip6 = mp->addr.dst.ip6; #endif - else + } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); + e.state = MDB_PG_FLAGS_PERMANENT; + } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); @@ -873,8 +875,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EINVAL; /* host join errors which can happen before creating the group */ - if (!port) { - /* don't allow any flags for host-joined groups */ + if (!port && !br_group_is_l2(&group)) { + /* don't allow any flags for host-joined IP groups */ if (entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); return -EINVAL; } diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c new file mode 100644 index 000000000000..ee680adcee17 --- /dev/null +++ b/net/bridge/br_mst.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Bridge Multiple Spanning Tree Support + * + * Authors: + * Tobias Waldekranz <tobias@waldekranz.com> + */ + +#include
<linux/kernel.h> +#include <net/switchdev.h> + +#include "br_private.h" + +DEFINE_STATIC_KEY_FALSE(br_mst_used); + +bool br_mst_enabled(const struct net_device *dev) +{ + if (!netif_is_bridge_master(dev)) + return false; + + return br_opt_get(netdev_priv(dev), BROPT_MST_ENABLED); +} +EXPORT_SYMBOL_GPL(br_mst_enabled); + +int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) +{ + const struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; + const struct net_bridge *br; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(dev)) + return -EINVAL; + + br = netdev_priv(dev); + if (!br_opt_get(br, BROPT_MST_ENABLED)) + return -EINVAL; + + vg = br_vlan_group(br); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->msti == msti) + __set_bit(v->vid, vids); + } + + return 0; +} +EXPORT_SYMBOL_GPL(br_mst_get_info); + +int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state) +{ + const struct net_bridge_port *p = NULL; + const struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; + + ASSERT_RTNL(); + + p = br_port_get_check_rtnl(dev); + if (!p || !br_opt_get(p->br, BROPT_MST_ENABLED)) + return -EINVAL; + + vg = nbp_vlan_group(p); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->brvlan->msti == msti) { + *state = v->state; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(br_mst_get_state); + +static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, + u8 state) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + + if (v->state == state) + return; + + br_vlan_set_state(v, state); + + if (v->vid == vg->pvid) + br_vlan_set_pvid_state(vg, state); +} + +int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, + struct netlink_ext_ack *extack) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_MST_STATE, + .orig_dev = p->dev, + .u.mst_state = { + .msti = msti, + .state = state, + }, + }; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + int err; + + vg = nbp_vlan_group(p); + if (!vg) + return 0; + + /* MSTI 0 (CST) state changes are notified via the regular + * SWITCHDEV_ATTR_ID_PORT_STP_STATE. + */ + if (msti) { + err = switchdev_port_attr_set(p->dev, &attr, extack); + if (err && err != -EOPNOTSUPP) + return err; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->brvlan->msti != msti) + continue; + + br_mst_vlan_set_state(p, v, state); + } + + return 0; +} + +static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(pv->port); + struct net_bridge_vlan *v; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + /* If this port already has a defined state in this + * MSTI (through some other VLAN membership), inherit + * it. + */ + if (v != pv && v->brvlan->msti == msti) { + br_mst_vlan_set_state(pv->port, pv, v->state); + return; + } + } + + /* Otherwise, start out in a new MSTI with all ports disabled. 
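+ *
+ * As a hypothetical example: if VLAN 10 on this port is mapped to
+ * MSTI 2 and forwarding, mapping VLAN 20 to MSTI 2 as well makes
+ * VLAN 20 inherit the forwarding state in the loop above. Mapping
+ * VLAN 20 to a so far unused MSTI 3 instead falls through to here,
+ * so it starts out disabled until a state is installed for that MSTI
+ * (e.g. via the IFLA_BRIDGE_MST attribute handled by br_mst_process()).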
*/ + return br_mst_vlan_set_state(pv->port, pv, BR_STATE_DISABLED); +} + +int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_VLAN_MSTI, + .orig_dev = mv->br->dev, + .u.vlan_msti = { + .vid = mv->vid, + .msti = msti, + }, + }; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *pv; + struct net_bridge_port *p; + int err; + + if (mv->msti == msti) + return 0; + + err = switchdev_port_attr_set(mv->br->dev, &attr, NULL); + if (err && err != -EOPNOTSUPP) + return err; + + mv->msti = msti; + + list_for_each_entry(p, &mv->br->port_list, list) { + vg = nbp_vlan_group(p); + + pv = br_vlan_find(vg, mv->vid); + if (pv) + br_mst_vlan_sync_state(pv, msti); + } + + return 0; +} + +void br_mst_vlan_init_state(struct net_bridge_vlan *v) +{ + /* VLANs always start out in MSTI 0 (CST) */ + v->msti = 0; + + if (br_vlan_is_master(v)) + v->state = BR_STATE_FORWARDING; + else + v->state = v->port->state; +} + +int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_BRIDGE_MST, + .orig_dev = br->dev, + .u.mst = on, + }; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; + int err; + + list_for_each_entry(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + + if (!vg->num_vlans) + continue; + + NL_SET_ERR_MSG(extack, + "MST mode can't be changed while VLANs exist"); + return -EBUSY; + } + + if (br_opt_get(br, BROPT_MST_ENABLED) == on) + return 0; + + err = switchdev_port_attr_set(br->dev, &attr, extack); + if (err && err != -EOPNOTSUPP) + return err; + + if (on) + static_branch_enable(&br_mst_used); + else + static_branch_disable(&br_mst_used); + + br_opt_toggle(br, BROPT_MST_ENABLED, on); + return 0; +} + +size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) +{ + DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; + const struct net_bridge_vlan *v; + size_t sz; + + /* IFLA_BRIDGE_MST */ + sz = nla_total_size(0); + + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (test_bit(v->brvlan->msti, seen)) + continue; + + /* IFLA_BRIDGE_MST_ENTRY */ + sz += nla_total_size(0) + + /* IFLA_BRIDGE_MST_ENTRY_MSTI */ + nla_total_size(sizeof(u16)) + + /* IFLA_BRIDGE_MST_ENTRY_STATE */ + nla_total_size(sizeof(u8)); + + __set_bit(v->brvlan->msti, seen); + } + + return sz; +} + +int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg) +{ + DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; + const struct net_bridge_vlan *v; + struct nlattr *nest; + int err = 0; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (test_bit(v->brvlan->msti, seen)) + continue; + + nest = nla_nest_start_noflag(skb, IFLA_BRIDGE_MST_ENTRY); + if (!nest || + nla_put_u16(skb, IFLA_BRIDGE_MST_ENTRY_MSTI, v->brvlan->msti) || + nla_put_u8(skb, IFLA_BRIDGE_MST_ENTRY_STATE, v->state)) { + err = -EMSGSIZE; + break; + } + nla_nest_end(skb, nest); + + __set_bit(v->brvlan->msti, seen); + } + + return err; +} + +static const struct nla_policy br_mst_nl_policy[IFLA_BRIDGE_MST_ENTRY_MAX + 1] = { + [IFLA_BRIDGE_MST_ENTRY_MSTI] = NLA_POLICY_RANGE(NLA_U16, + 1, /* 0 reserved for CST */ + VLAN_N_VID - 1), + [IFLA_BRIDGE_MST_ENTRY_STATE] = NLA_POLICY_RANGE(NLA_U8, + BR_STATE_DISABLED, + BR_STATE_BLOCKING), +}; + +static int br_mst_process_one(struct net_bridge_port *p, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MST_ENTRY_MAX + 1]; + u16 msti; + u8 state; + int err; + + err = nla_parse_nested(tb, 
IFLA_BRIDGE_MST_ENTRY_MAX, attr, + br_mst_nl_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MST_ENTRY_MSTI]) { + NL_SET_ERR_MSG_MOD(extack, "MSTI not specified"); + return -EINVAL; + } + + if (!tb[IFLA_BRIDGE_MST_ENTRY_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "State not specified"); + return -EINVAL; + } + + msti = nla_get_u16(tb[IFLA_BRIDGE_MST_ENTRY_MSTI]); + state = nla_get_u8(tb[IFLA_BRIDGE_MST_ENTRY_STATE]); + + return br_mst_set_state(p, msti, state, extack); +} + +int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *attr; + int err, msts = 0; + int rem; + + if (!br_opt_get(p->br, BROPT_MST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify MST state when MST is disabled"); + return -EBUSY; + } + + nla_for_each_nested(attr, mst_attr, rem) { + switch (nla_type(attr)) { + case IFLA_BRIDGE_MST_ENTRY: + err = br_mst_process_one(p, attr, extack); + break; + default: + continue; + } + + msts++; + if (err) + break; + } + + if (!msts) { + NL_SET_ERR_MSG_MOD(extack, "Found no MST entries to process"); + err = -EINVAL; + } + + return err; +} diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index de2409889489..db4f2641d1cd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -82,6 +82,9 @@ static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); static void __br_multicast_stop(struct net_bridge_mcast *brmctx); +static int br_mc_disabled_update(struct net_device *dev, bool value, + struct netlink_ext_ack *extack); + static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, struct net_bridge_port_group_sg_key *sg_p) @@ -1156,6 +1159,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 4fd882686b04..ff4779036649 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1012,9 +1012,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); - for (i = 0; i < e->num_hook_entries && - ops[i]->priority <= NF_BR_PRI_BRNF; i++) - ; + for (i = 0; i < e->num_hook_entries; i++) { + /* These hooks have already been called */ + if (ops[i]->priority < NF_BR_PRI_BRNF) + continue; + + /* These hooks have not been called yet, run them. */ + if (ops[i]->priority > NF_BR_PRI_BRNF) + break; + + /* take a closer look at NF_BR_PRI_BRNF. */ + if (ops[i]->hook == br_nf_pre_routing) { + /* This hook diverted the skb to this function, + * hooks after this have not been run yet. 
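+ *
+ * The index that falls out of this loop is handed back to the
+ * netfilter hook machinery below, so evaluation resumes at the
+ * first entry that has not been called yet instead of re-running
+ * earlier hooks.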
+ */ + i++; + break; + } + } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff83d84230d..bb01776d2d88 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -119,6 +119,9 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); + if (p && vg && (filter_mask & RTEXT_FILTER_MST)) + vinfo_sz += br_mst_info_size(vg); + if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) return vinfo_sz; @@ -184,6 +187,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -269,7 +273,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || - nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -483,7 +488,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, RTEXT_FILTER_BRVLAN_COMPRESSED | RTEXT_FILTER_MRP | RTEXT_FILTER_CFM_CONFIG | - RTEXT_FILTER_CFM_STATUS)) { + RTEXT_FILTER_CFM_STATUS | + RTEXT_FILTER_MST)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; @@ -562,7 +568,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, nla_nest_end(skb, cfm_nest); } + if ((filter_mask & RTEXT_FILTER_MST) && + br_opt_get(br, BROPT_MST_ENABLED) && port) { + const struct net_bridge_vlan_group *vg = nbp_vlan_group(port); + struct nlattr *mst_nest; + int err; + + if (!vg || !vg->num_vlans) + goto done; + + mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST); + if (!mst_nest) + goto nla_put_failure; + + err = br_mst_fill_info(skb, vg); + if (err) + goto nla_put_failure; + + nla_nest_end(skb, mst_nest); + } + done: + if (af) nla_nest_end(skb, af); nlmsg_end(skb, nlh); @@ -801,6 +828,23 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MST: + if (!p) { + NL_SET_ERR_MSG(extack, + "MST states can only be set on bridge ports"); + return -EINVAL; + } + + if (cmd != RTM_SETLINK) { + NL_SET_ERR_MSG(extack, + "MST states can only be set through RTM_SETLINK"); + return -EINVAL; + } + + err = br_mst_process(p, attr, extack); + if (err) + return err; + break; } } @@ -827,6 +871,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -893,6 +938,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); 
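/* A port with BR_PORT_LOCKED set only accepts frames whose source
 * address already has an FDB entry pointing at that port; see the
 * check in br_handle_frame_finish().
 */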
br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); changed_mask = old_flags ^ p->flags; @@ -1280,8 +1326,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br_recalculate_fwd_mask(br); } - if (data[IFLA_BR_FDB_FLUSH]) - br_fdb_flush(br); + if (data[IFLA_BR_FDB_FLUSH]) { + struct net_bridge_fdb_flush_desc desc = { + .flags_mask = BR_FDB_STATIC + }; + + br_fdb_flush(br, &desc); + } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (data[IFLA_BR_MCAST_ROUTER]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2661dda1a92b..06e5f6faa431 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -178,6 +178,7 @@ enum { * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast * context + * @msti: if MASTER flag set, this holds the VLANs MST instance * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * @@ -210,6 +211,8 @@ struct net_bridge_vlan { struct net_bridge_mcast_port port_mcast_ctx; }; + u16 msti; + struct list_head vlist; struct rcu_head rcu; @@ -271,6 +274,13 @@ struct net_bridge_fdb_entry { struct rcu_head rcu; }; +struct net_bridge_fdb_flush_desc { + unsigned long flags; + unsigned long flags_mask; + int port_ifindex; + u16 vlan_id; +}; + #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) @@ -445,6 +455,7 @@ enum net_bridge_opts { BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, + BROPT_MST_ENABLED, }; struct net_bridge { @@ -751,11 +762,17 @@ static inline void br_netpoll_disable(struct net_bridge_port *p) #endif /* br_fdb.c */ +#define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) +#define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) +#define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \ + NTF_STICKY | NTF_OFFLOADED) + int br_fdb_init(void); void br_fdb_fini(void); int br_fdb_hash_init(struct net_bridge *br); void br_fdb_hash_fini(struct net_bridge *br); -void br_fdb_flush(struct net_bridge *br); +void br_fdb_flush(struct net_bridge *br, + const struct net_bridge_fdb_flush_desc *desc); void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid); @@ -776,7 +793,11 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, u16 vid); + struct net_device *dev, const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack); +int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, u16 vid, + struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack); @@ -1765,6 +1786,63 @@ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) } #endif +/* br_mst.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +DECLARE_STATIC_KEY_FALSE(br_mst_used); +static inline bool br_mst_is_enabled(struct net_bridge *br) +{ + return static_branch_unlikely(&br_mst_used) && + br_opt_get(br, BROPT_MST_ENABLED); +} + +int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, + struct netlink_ext_ack *extack); +int 
br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti); +void br_mst_vlan_init_state(struct net_bridge_vlan *v); +int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); +size_t br_mst_info_size(const struct net_bridge_vlan_group *vg); +int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg); +int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, + struct netlink_ext_ack *extack); +#else +static inline bool br_mst_is_enabled(struct net_bridge *br) +{ + return false; +} + +static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti, + u8 state, struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) +{ + return 0; +} + +static inline int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg) +{ + return -EOPNOTSUPP; +} + +static inline int br_mst_process(struct net_bridge_port *p, + const struct nlattr *mst_attr, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} +#endif + struct nf_br_ops { int (*br_dev_xmit_hook)(struct sk_buff *skb); }; @@ -1985,7 +2063,7 @@ void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, - struct netlink_ext_ack *extack); + bool changed, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); void br_switchdev_init(struct net_bridge *br); @@ -2052,8 +2130,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; } -static inline int br_switchdev_port_vlan_add(struct net_device *dev, - u16 vid, u16 flags, +static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, + u16 flags, bool changed, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1d80f34a139c..7d27b2e6038f 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -43,6 +43,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) return; p->state = state; + if (br_opt_get(p->br, BROPT_MST_ENABLED)) { + err = br_mst_set_state(p, 0, state, NULL); + if (err) + br_warn(p->br, "error setting MST state on port %u(%s)\n", + p->port_no, netdev_name(p->dev)); + } err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index f8fbaaa7c501..8f3d76c751dd 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -72,7 +72,8 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, /* Flags that can be offloaded to hardware */ #define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | \ - BR_MCAST_FLOOD | BR_BCAST_FLOOD) + BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \ + BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST) int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, @@ -160,13 +161,14 @@ br_switchdev_fdb_notify(struct net_bridge *br, } int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool changed, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = 
SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid = vid, + .changed = changed, }; return switchdev_port_obj_add(dev, &v.obj, extack); @@ -330,6 +332,48 @@ br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx, return err; } +static int br_switchdev_vlan_attr_replay(struct net_device *br_dev, + const void *ctx, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_attr_info attr_info = { + .info = { + .dev = br_dev, + .extack = extack, + .ctx = ctx, + }, + }; + struct net_bridge *br = netdev_priv(br_dev); + struct net_bridge_vlan_group *vg; + struct switchdev_attr attr; + struct net_bridge_vlan *v; + int err; + + attr_info.attr = &attr; + attr.orig_dev = br_dev; + + vg = br_vlan_group(br); + if (!vg) + return 0; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->msti) { + attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI; + attr.u.vlan_msti.vid = v->vid; + attr.u.vlan_msti.msti = v->msti; + + err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET, + &attr_info); + err = notifier_to_errno(err); + if (err) + return err; + } + } + + return 0; +} + static int br_switchdev_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, @@ -351,19 +395,50 @@ br_switchdev_vlan_replay_one(struct notifier_block *nb, return notifier_to_errno(err); } +static int br_switchdev_vlan_replay_group(struct notifier_block *nb, + struct net_device *dev, + struct net_bridge_vlan_group *vg, + const void *ctx, unsigned long action, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v; + int err = 0; + u16 pvid; + + if (!vg) + return 0; + + pvid = br_get_pvid(vg); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct switchdev_obj_port_vlan vlan = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = br_vlan_flags(v, pvid), + .vid = v->vid, + }; + + if (!br_vlan_should_use(v)) + continue; + + err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, + action, extack); + if (err) + return err; + } + + return 0; +} + static int br_switchdev_vlan_replay(struct net_device *br_dev, - struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *v; + struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_port *p; - struct net_bridge *br; unsigned long action; - int err = 0; - u16 pvid; + int err; ASSERT_RTNL(); @@ -373,49 +448,33 @@ static int br_switchdev_vlan_replay(struct net_device *br_dev, if (!netif_is_bridge_master(br_dev)) return -EINVAL; - if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) - return -EINVAL; - - if (netif_is_bridge_master(dev)) { - br = netdev_priv(dev); - vg = br_vlan_group(br); - p = NULL; - } else { - p = br_port_get_rtnl(dev); - if (WARN_ON(!p)) - return -EINVAL; - vg = nbp_vlan_group(p); - br = p->br; - } - - if (!vg) - return 0; - if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; - pvid = br_get_pvid(vg); + err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br), + ctx, action, extack); + if (err) + return err; - list_for_each_entry(v, &vg->vlan_list, vlist) { - struct switchdev_obj_port_vlan vlan = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = br_vlan_flags(v, pvid), - .vid = v->vid, - }; + list_for_each_entry(p, &br->port_list, list) { + struct net_device *dev = p->dev; - if (!br_vlan_should_use(v)) - continue; + err = br_switchdev_vlan_replay_group(nb, dev, + nbp_vlan_group(p), + 
ctx, action, extack); + if (err) + return err; + } - err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, - action, extack); + if (adding) { + err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack); if (err) return err; } - return err; + return 0; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -681,8 +740,7 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, struct net_device *dev = p->dev; int err; - err = br_switchdev_vlan_replay(br_dev, dev, ctx, true, blocking_nb, - extack); + err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; @@ -706,11 +764,11 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; - br_switchdev_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); - br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); + br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); } /* Let the bridge know that this port is offloaded, so that it can assign a diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 3f7ca88c2aa3..612e367fff20 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -344,7 +344,11 @@ static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br_fdb_flush(br); + struct net_bridge_fdb_flush_desc desc = { + .flags_mask = BR_FDB_STATIC + }; + + br_fdb_flush(br, &desc); return 0; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 84ba456a78cc..0f5e75ccac79 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -34,53 +34,70 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, +static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) - return false; + return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; - - return true; } -static bool __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) - return false; + return; smp_wmb(); vg->pvid = 0; - - return true; } -/* return true if anything changed, false otherwise */ -static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) +/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. + * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and + * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. 
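The new comment above spells out a dry-run/commit contract: __vlan_flags_would_change() reports whether the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would change @v at all, and __vlan_flags_commit() actually applies them. A minimal sketch of the same idiom with hypothetical names, not the bridge code itself (the real helper's change test is more involved because PVID state lives in the vlan group, not in v->flags):

#include <linux/bits.h>
#include <linux/types.h>

#define EX_FLAG_A	BIT(0)	/* stand-ins for PVID/UNTAGGED */
#define EX_FLAG_B	BIT(1)
#define EX_FLAG_MASK	(EX_FLAG_A | EX_FLAG_B)

/* Report whether @flags would change @*cur; write only if @commit. */
static bool ex_flags_update(u16 *cur, u16 flags, bool commit)
{
	bool change = !!((*cur ^ flags) & EX_FLAG_MASK);

	if (commit)
		*cur = (*cur & ~EX_FLAG_MASK) | (flags & EX_FLAG_MASK);
	return change;
}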
+ */ +static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, + bool commit) { struct net_bridge_vlan_group *vg; - u16 old_flags = v->flags; - bool ret; + bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); + /* check if anything would be changed on commit */ + change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || + ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); + + if (!commit) + goto out; + if (flags & BRIDGE_VLAN_INFO_PVID) - ret = __vlan_add_pvid(vg, v); + __vlan_add_pvid(vg, v); else - ret = __vlan_delete_pvid(vg, v->vid); + __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; - return ret || !!(old_flags ^ v->flags); +out: + return change; +} + +static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) +{ + return __vlan_flags_update(v, flags, false); +} + +static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) +{ + __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, @@ -92,7 +109,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; @@ -209,6 +226,24 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) kfree(v); } +static void br_vlan_init_state(struct net_bridge_vlan *v) +{ + struct net_bridge *br; + + if (br_vlan_is_master(v)) + br = v->br; + else + br = v->port->br; + + if (br_opt_get(br, BROPT_MST_ENABLED)) { + br_mst_vlan_init_state(v); + return; + } + + v->state = BR_STATE_FORWARDING; + v->msti = 0; +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. 
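Further down, __vlan_add() calls the new br_vlan_init_state() under its existing "set the state before publishing" comment: state and MSTI are settled before rhashtable_lookup_insert_fast() makes the entry reachable by lockless readers. The shape of that ordering, sketched against a hypothetical entry type:

#include <linux/rhashtable.h>
#include <linux/stddef.h>

struct ex_entry {
	u16 vid;
	u8 state;
	struct rhash_head node;
};

static const struct rhashtable_params ex_params = {
	.key_len = sizeof(u16),
	.key_offset = offsetof(struct ex_entry, vid),
	.head_offset = offsetof(struct ex_entry, node),
};

static int ex_publish(struct rhashtable *ht, struct ex_entry *e)
{
	e->state = 0;	/* fully initialize first (sketch value)... */

	/* ...only then publish @e to lockless lookups */
	return rhashtable_lookup_insert_fast(ht, &e->node, ex_params);
}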
There are four possible calls to this function in terms of the * vlan entry type: @@ -284,9 +319,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { - err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); - if (err && err != -EOPNOTSUPP) - goto out; + if (br_vlan_should_use(v)) { + err = br_switchdev_port_vlan_add(dev, v->vid, flags, + false, extack); + if (err && err != -EOPNOTSUPP) + goto out; + } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } @@ -302,7 +340,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } /* set the state before publishing */ - v->state = BR_STATE_FORWARDING; + br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); @@ -310,7 +348,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, goto out_fdb_insert; __vlan_add_list(v); - __vlan_add_flags(v, flags); + __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) @@ -404,6 +442,7 @@ static void __vlan_flush(const struct net_bridge *br, { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; + int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { @@ -417,7 +456,13 @@ static void __vlan_flush(const struct net_bridge *br, } v_end = vlan->vid; - __vlan_del(vlan); + err = __vlan_del(vlan); + if (err) { + br_err(br, + "port %u(%s) failed to delete vlan %d: %pe\n", + (unsigned int) p->port_no, p->dev->name, + vlan->vid, ERR_PTR(err)); + } } /* notify about the last/whole vlan range */ @@ -560,10 +605,10 @@ static bool __allowed_ingress(const struct net_bridge *br, !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); - return br_vlan_state_allowed(*state, true); - } else { - return true; + if (!br_vlan_state_allowed(*state, true)) + goto drop; } + return true; } } v = br_vlan_find(vg, *vid); @@ -670,18 +715,29 @@ static int br_vlan_add_existing(struct net_bridge *br, u16 flags, bool *changed, struct netlink_ext_ack *extack) { + bool would_change = __vlan_flags_would_change(vlan, flags); + bool becomes_brentry = false; int err; - err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack); - if (err && err != -EOPNOTSUPP) - return err; - if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ - if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) { - err = -EINVAL; - goto err_flags; - } + if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) + return -EINVAL; + + becomes_brentry = true; + } + + /* Master VLANs that aren't brentries weren't notified before, + * time to notify them now. 
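The hunk continuing below gates the switchdev notification on that dry run: br_switchdev_port_vlan_add() is called only when the VLAN becomes a bridge entry or its flags would actually change, the commit itself is unconditional, and *changed reflects the dry-run result. The calling pattern, reusing the ex_flags_update() sketch from earlier (ex_offload_notify() is a hypothetical stand-in for the offload call):

static int ex_change(struct net_device *dev, u16 *cur, u16 flags,
		     bool *changed)
{
	bool would_change = ex_flags_update(cur, flags, false);
	int err;

	if (would_change) {
		err = ex_offload_notify(dev, flags);	/* hypothetical hook */
		if (err && err != -EOPNOTSUPP)
			return err;
	}

	ex_flags_update(cur, flags, true);		/* commit */
	*changed = would_change;
	return 0;
}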
+ */ + if (becomes_brentry || would_change) { + err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, + would_change, extack); + if (err && err != -EOPNOTSUPP) + return err; + } + + if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { @@ -696,13 +752,13 @@ static int br_vlan_add_existing(struct net_bridge *br, br_multicast_toggle_one_vlan(vlan, true); } - if (__vlan_add_flags(vlan, flags)) + __vlan_flags_commit(vlan, flags); + if (would_change) *changed = true; return 0; err_fdb_insert: -err_flags: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } @@ -1247,11 +1303,18 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { - /* Pass the flags to the hardware bridge */ - ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack); - if (ret && ret != -EOPNOTSUPP) - return ret; - *changed = __vlan_add_flags(vlan, flags); + bool would_change = __vlan_flags_would_change(vlan, flags); + + if (would_change) { + /* Pass the flags to the hardware bridge */ + ret = br_switchdev_port_vlan_add(port->dev, vid, flags, + true, extack); + if (ret && ret != -EOPNOTSUPP) + return ret; + } + + __vlan_flags_commit(vlan, flags); + *changed = would_change; return 0; } @@ -2020,7 +2083,8 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); - if (err && err != -EMSGSIZE) + /* if the dump completed without an error we return 0 here */ + if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a6382973b3e7..a2724d03278c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -99,6 +99,11 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, return -EBUSY; } + if (br_opt_get(br, BROPT_MST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled"); + return -EBUSY; + } + if (v->state == state) return 0; @@ -291,6 +296,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end) { return v_curr->vid - r_end->vid == 1 && + v_curr->msti == r_end->msti && ((v_curr->priv_flags ^ r_end->priv_flags) & BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, @@ -379,6 +385,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, #endif #endif + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti)) + goto out_err; + nla_nest_end(skb, nest); return true; @@ -410,6 +419,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ #endif + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -559,6 +569,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, } #endif #endif + if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) { + u16 msti; + + msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]); + err = br_mst_vlan_set_msti(v, msti); + if (err) + return err; + *changed = true; + } return 0; } @@ -578,6 +597,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 
1] = { [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1), }; int br_vlan_rtm_process_global_options(struct net_device *dev, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index fdbed3158555..73242962be5d 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -32,6 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + bool mono_delivery_time = skb->mono_delivery_time; unsigned int hlen, ll_rs, mtu; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; @@ -81,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -112,7 +113,7 @@ slow_path: goto blackhole; } - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, data, skb2); if (err) goto blackhole; @@ -380,7 +381,7 @@ static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) protoff = skb_network_offset(skb) + ip_hdrlen(skb); break; case htons(ETH_P_IPV6): { - unsigned char pnum = ipv6_hdr(skb)->nexthdr; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; __be16 frag_off; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index c1ef9cc89b78..8c3eaba87ad2 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -87,6 +87,7 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, return nft_meta_get_init(ctx, expr, tb); } + priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } @@ -98,6 +99,7 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = { .eval = nft_meta_bridge_get_eval, .init = nft_meta_bridge_get_init, .dump = nft_meta_get_dump, + .reduce = nft_meta_get_reduce, }; static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, @@ -112,8 +114,7 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops) continue; - track->regs[i].selector = NULL; - track->regs[i].bitwise = NULL; + __nft_reg_track_cancel(track, i); } return false; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index eba0efe64d05..71b54fed7263 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -49,7 +49,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -65,7 +65,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v4_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; @@ -81,7 +81,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, { struct sk_buff 
*nskb; - nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -98,7 +98,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v6_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; @@ -185,6 +185,7 @@ static const struct nft_expr_ops nft_reject_bridge_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_bridge_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_bridge_type __read_mostly = { diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 440139706130..52dd0b6835bc 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -268,7 +268,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev, err = caifd->layer.up->receive(caifd->layer.up, pkt); - /* For -EILSEQ the packet is not freed so so it now */ + /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 2b8892d502f7..251e666ba9a2 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -282,7 +282,7 @@ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, if (flags & MSG_OOB) goto read_error; - skb = skb_recv_datagram(sk, flags, 0 , &ret); + skb = skb_recv_datagram(sk, flags, &ret); if (!skb) goto read_error; copylen = skb->len; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 414dc5671c45..4d63ef13a1fd 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -99,7 +99,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) else skb->ip_summed = CHECKSUM_NONE; - netif_rx_any_context(skb); + netif_rx(skb); /* Update statistics. 
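The chnl_net.c change just above, like the af_can.c one below, is part of the treewide conversion to plain netif_rx(): with netif_rx() now safe to call from any context, the netif_rx_any_context() and netif_rx_ni() variants convert 1:1. A sketch of a driver RX path after the conversion (hypothetical device, not from this diff):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void ex_deliver(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);

	/* netif_rx() handles irq vs. process context internally now */
	if (netif_rx(skb) == NET_RX_DROP)
		dev->stats.rx_dropped++;
	else
		dev->stats.rx_packets++;
}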
*/ priv->netdev->stats.rx_packets++; diff --git a/net/can/af_can.c b/net/can/af_can.c index cce2af10eb3e..1fb49d51b25d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -284,7 +284,7 @@ int can_send(struct sk_buff *skb, int loop) } if (newskb) - netif_rx_ni(newskb); + netif_rx(newskb); /* update statistics */ pkg_stats->tx_frames++; diff --git a/net/can/bcm.c b/net/can/bcm.c index bc88d901a1c0..e60161bec850 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -100,6 +100,7 @@ static inline u64 get_u64(const struct canfd_frame *cp, int offset) struct bcm_op { struct list_head list; + struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; @@ -193,7 +194,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; - struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); + struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; @@ -718,10 +719,9 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, return NULL; } -static void bcm_remove_op(struct bcm_op *op) +static void bcm_free_op_rcu(struct rcu_head *rcu_head) { - hrtimer_cancel(&op->timer); - hrtimer_cancel(&op->thrtimer); + struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -732,6 +732,14 @@ static void bcm_remove_op(struct bcm_op *op) kfree(op); } +static void bcm_remove_op(struct bcm_op *op) +{ + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); + + call_rcu(&op->rcu, bcm_free_op_rcu); +} + static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { @@ -757,6 +765,9 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { + /* disable automatic timer on frame reception */ + op->flags |= RX_NO_AUTOTIMER; + /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save @@ -785,7 +796,6 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, bcm_rx_handler, op); list_del(&op->list); - synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } @@ -1632,12 +1642,9 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; - int noblock; int err; - noblock = flags & MSG_DONTWAIT; - flags &= ~MSG_DONTWAIT; - skb = skb_recv_datagram(sk, flags, noblock, &error); + skb = skb_recv_datagram(sk, flags, &error); if (!skb) return error; @@ -1650,7 +1657,7 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return err; } - sock_recv_ts_and_drops(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(BCM_MIN_NAMELEN); diff --git a/net/can/gw.c b/net/can/gw.c index d8861e862f15..1ea4cc527db3 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -577,6 +577,13 @@ static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } +static void cgw_job_free_rcu(struct rcu_head *rcu_head) +{ + struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); + + kmem_cache_free(cgw_cache, gwj); +} + static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { @@ -596,8 +603,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev 
== dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } @@ -1155,8 +1161,7 @@ static void cgw_remove_all_jobs(struct net *net) hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); } } @@ -1224,8 +1229,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } @@ -1239,16 +1243,19 @@ static int __net_init cangw_pernet_init(struct net *net) return 0; } -static void __net_exit cangw_pernet_exit(struct net *net) +static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { + struct net *net; + rtnl_lock(); - cgw_remove_all_jobs(net); + list_for_each_entry(net, net_list, exit_list) + cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, - .exit = cangw_pernet_exit, + .exit_batch = cangw_pernet_exit_batch, }; static __init int cgw_module_init(void) diff --git a/net/can/isotp.c b/net/can/isotp.c index 02cbcb2ecf0d..43a27d19cdac 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -14,7 +14,6 @@ * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') - * - take care of the tx-queue-len as traffic shaping is still on the TODO list * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. @@ -56,6 +55,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> +#include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> @@ -86,9 +86,9 @@ MODULE_ALIAS("can-proto-6"); /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now increase the static buffer - * to something about 8 kbyte to be able to test this new functionality. + * to something about 64 kbyte to be able to test this new functionality. 
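The can/bcm and can/gw hunks above both replace the blocking synchronize_rcu()-then-free sequence with call_rcu(): the object grows a struct rcu_head, removal just unlinks it (bcm also cancels its hrtimers first), and the free runs after a grace period without stalling the writer. A minimal sketch of the conversion for a hypothetical object:

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct ex_job {
	struct hlist_node list;
	struct rcu_head rcu;	/* added for the deferred free */
};

static void ex_job_free_rcu(struct rcu_head *rcu_head)
{
	struct ex_job *job = container_of(rcu_head, struct ex_job, rcu);

	kfree(job);
}

static void ex_job_remove(struct ex_job *job)
{
	hlist_del_rcu(&job->list);		/* unlink from readers... */
	call_rcu(&job->rcu, ex_job_free_rcu);	/* ...free after a grace period */
}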
*/ -#define MAX_MSG_LENGTH 8200 +#define MAX_MSG_LENGTH 66000 /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ @@ -104,6 +104,7 @@ MODULE_ALIAS("can-proto-6"); #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) +#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ @@ -140,11 +141,14 @@ struct isotp_sock { struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; + u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; + u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; + spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); @@ -156,6 +160,23 @@ static inline struct isotp_sock *isotp_sk(const struct sock *sk) return (struct isotp_sock *)sk; } +static u32 isotp_bc_flags(struct isotp_sock *so) +{ + return so->opt.flags & ISOTP_ALL_BC_FLAGS; +} + +static bool isotp_register_rxid(struct isotp_sock *so) +{ + /* no broadcast modes => register rx_id for FC frame reception */ + return (isotp_bc_flags(so) == 0); +} + +static bool isotp_register_txecho(struct isotp_sock *so) +{ + /* all modes but SF_BROADCAST register for tx echo skbs */ + return (isotp_bc_flags(so) != CAN_ISOTP_SF_BROADCAST); +} + static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, @@ -358,7 +379,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ - so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, @@ -615,11 +636,17 @@ static void isotp_rcv(struct sk_buff *skb, void *data) n_pci_type = cf->data[ae] & 0xF0; + /* Make sure the state changes and data structures stay consistent at + * CAN frame reception time. This locking is not needed in real world + * use cases but the inconsistency can be triggered with syzkaller. + */ + spin_lock(&so->rx_lock); + if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) - return; + goto out_unlock; } switch (n_pci_type) { @@ -668,6 +695,9 @@ static void isotp_rcv(struct sk_buff *skb, void *data) isotp_rcv_cf(sk, cf, ae, skb); break; } + +out_unlock: + spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, @@ -701,6 +731,63 @@ static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, cf->data[0] = so->opt.ext_address; } +static void isotp_send_cframe(struct isotp_sock *so) +{ + struct sock *sk = &so->sk; + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int can_send_ret; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + return; + + skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); + if (!skb) { + dev_put(dev); + return; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put_zero(skb, so->ll.mtu); + + /* create consecutive frame */ + isotp_fill_dataframe(cf, so, ae, 0); + + /* place consecutive frame N_PCI in appropriate index */ + cf->data[ae] = N_PCI_CF | so->tx.sn++; + so->tx.sn %= 16; + so->tx.bs++; + + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + can_skb_set_owner(skb, sk); + + /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ + if (so->cfecho) + pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); + + /* set consecutive frame echo tag */ + so->cfecho = *(u32 *)cf->data; + + /* send frame with local echo enabled */ + can_send_ret = can_send(skb, 1); + if (can_send_ret) { + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); + if (can_send_ret == -ENOBUFS) + pr_notice_once("can-isotp: tx queue is full\n"); + } + dev_put(dev); +} + static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { @@ -734,7 +821,47 @@ static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; - so->tx.state = ISOTP_WAIT_FIRST_FC; +} + +static void isotp_rcv_echo(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct isotp_sock *so = isotp_sk(sk); + struct canfd_frame *cf = (struct canfd_frame *)skb->data; + + /* only handle my own local echo skb's */ + if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) + return; + + /* cancel local echo timeout */ + hrtimer_cancel(&so->txtimer); + + /* local echo skb with consecutive frame has been consumed */ + so->cfecho = 0; + + if (so->tx.idx >= so->tx.len) { + /* we are done */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + return; + } + + if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { + /* stop and wait for FC with timeout */ + so->tx.state = ISOTP_WAIT_FC; + hrtimer_start(&so->txtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + return; + } + + /* no gap between data frames needed => use burst mode */ + if (!so->tx_gap) { + isotp_send_cframe(so); + return; + } + + /* start timer to send next consecutive frame with correct delay */ + hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) @@ -742,14 +869,28 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - struct sk_buff *skb; - struct net_device *dev; - struct canfd_frame *cf; enum hrtimer_restart restart = HRTIMER_NORESTART; - int can_send_ret; - int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; switch (so->tx.state) { + case ISOTP_SENDING: + + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (!so->cfecho) { + /* start timeout for unlikely lost echo skb */ + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), + ktime_set(2, 0))); + restart = HRTIMER_RESTART; + + /* push out the next consecutive frame */ + isotp_send_cframe(so); + break; + } + + /* cfecho has not been cleared in isotp_rcv_echo() */ + pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); + fallthrough; + case ISOTP_WAIT_FC: case ISOTP_WAIT_FIRST_FC: @@ -765,78 +906,6 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) wake_up_interruptible(&so->wait); break; - case ISOTP_SENDING: - - /* push out the next segmented pdu */ - dev = dev_get_by_index(sock_net(sk), so->ifindex); - if (!dev) - break; - -isotp_tx_burst: - skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), - GFP_ATOMIC); - if (!skb) { - dev_put(dev); - break; - } - - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; - - cf = (struct canfd_frame *)skb->data; - skb_put_zero(skb, so->ll.mtu); - - /* create consecutive frame */ - isotp_fill_dataframe(cf, so, ae, 0); - - /* place consecutive frame N_PCI in appropriate index */ - cf->data[ae] = N_PCI_CF | so->tx.sn++; - so->tx.sn %= 16; - so->tx.bs++; - - cf->flags = so->ll.tx_flags; - - skb->dev = dev; - can_skb_set_owner(skb, sk); - - can_send_ret = can_send(skb, 1); - if (can_send_ret) { - pr_notice_once("can-isotp: %s: can_send_ret %pe\n", - __func__, ERR_PTR(can_send_ret)); - if (can_send_ret == -ENOBUFS) - pr_notice_once("can-isotp: tx queue is full, increasing txqueuelen may prevent this error\n"); - } - if (so->tx.idx >= so->tx.len) { - /* we are done */ - so->tx.state = ISOTP_IDLE; - dev_put(dev); - wake_up_interruptible(&so->wait); - break; - } - - if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { - /* stop and wait for FC */ - so->tx.state = ISOTP_WAIT_FC; - dev_put(dev); - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(1, 0))); - restart = HRTIMER_RESTART; - break; - } - - /* no gap between data frames needed => use burst mode */ - if (!so->tx_gap) - goto isotp_tx_burst; - - /* start timer to send next data frame with correct delay */ - dev_put(dev); - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), so->tx_gap)); - restart = HRTIMER_RESTART; - break; - default: WARN_ON_ONCE(1); } @@ -854,6 +923,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -876,34 +946,34 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!size || size > MAX_MSG_LENGTH) { err = -EINVAL; - goto err_out; + goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? 
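The size check that follows enforces the single-frame limit for SF_BROADCAST mode: the usable payload is tx.ll_dl minus the one-byte single-frame N_PCI (SF_PCI_SZ4), minus one byte for an extended address (ae) and one for the SF_DL escape used when TX_DL > 8 (off). Worked as a sketch:

/* Sketch: max ISO-TP single-frame payload for link-layer data length
 * @ll_dl (8 for Classical CAN, up to 64 for CAN FD).
 */
static unsigned int ex_max_sf_payload(unsigned int ll_dl, bool ext_addr)
{
	unsigned int ae = ext_addr ? 1 : 0;	/* extended address byte */
	unsigned int off = (ll_dl > 8) ? 1 : 0;	/* SF_DL ESC byte */

	return ll_dl - 1 /* SF_PCI_SZ4 */ - ae - off;
}

/* e.g. Classical CAN without extended addressing: 8 - 1 - 0 - 0 = 7;
 * CAN FD with TX_DL 64: 64 - 1 - 0 - 1 = 62 bytes.
 */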
*/ - if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && + if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; - goto err_out; + goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) - goto err_out; + goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; - goto err_out; + goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); - goto err_out; + goto err_out_drop; } can_skb_reserve(skb); @@ -947,12 +1017,43 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* don't enable wait queue for a single frame transmission */ wait_tx_done = 0; } else { - /* send first frame and wait for FC */ + /* send first frame */ isotp_create_fframe(cf, so, ae); - /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { + /* set timer for FC-less operation (STmin = 0) */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_set(0, so->force_tx_stmin); + else + so->tx_gap = ktime_set(0, so->frame_txtime); + + /* disable wait for FCs due to activated block size */ + so->txfc.bs = 0; + + /* cfecho should have been zero'ed by init */ + if (so->cfecho) + pr_notice_once("can-isotp: no fc cfecho %08X\n", + so->cfecho); + + /* set consecutive frame echo tag */ + so->cfecho = *(u32 *)cf->data; + + /* switch directly to ISOTP_SENDING state */ + so->tx.state = ISOTP_SENDING; + + /* start timeout for unlikely lost echo skb */ + hrtimer_sec = 2; + } else { + /* standard flow control check */ + so->tx.state = ISOTP_WAIT_FIRST_FC; + + /* start timeout for FC */ + hrtimer_sec = 1; + } + + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -965,7 +1066,15 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); - goto err_out; + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + + /* reset consecutive frame echo tag */ + so->cfecho = 0; + + goto err_out_drop; } if (wait_tx_done) { @@ -978,6 +1087,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return size; +err_out_drop: + /* drop this PDU and unlock a potential wait queue */ + old_state = ISOTP_IDLE; err_out: so->tx.state = old_state; if (so->tx.state == ISOTP_IDLE) @@ -991,26 +1103,27 @@ static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, { struct sock *sk = sock->sk; struct sk_buff *skb; - int err = 0; - int noblock; + struct isotp_sock *so = isotp_sk(sk); + int ret = 0; + + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) + return -EINVAL; - noblock = flags & MSG_DONTWAIT; - flags &= ~MSG_DONTWAIT; + if (!so->bound) + return -EADDRNOTAVAIL; - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = skb_recv_datagram(sk, flags, &ret); if (!skb) - return err; + return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; - err = memcpy_to_msg(msg, skb->data, size); - if (err < 0) { - skb_free_datagram(sk, skb); - return err; - } + ret = memcpy_to_msg(msg, skb->data, size); + if (ret < 0) + goto out_err; sock_recv_timestamp(msg, sk, skb); @@ -1020,9 +1133,13 @@ static int 
isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } + /* set length of return value */ + ret = (flags & MSG_TRUNC) ? skb->len : size; + +out_err: skb_free_datagram(sk, skb); - return size; + return ret; } static int isotp_release(struct socket *sock) @@ -1052,15 +1169,20 @@ static int isotp_release(struct socket *sock) lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { + if (so->bound && isotp_register_txecho(so)) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { - can_rx_unregister(net, dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (isotp_register_rxid(so)) + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + + can_rx_unregister(net, dev, so->txid, + SINGLE_MASK(so->txid), + isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } @@ -1090,42 +1212,51 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) struct net *net = sock_net(sk); int ifindex; struct net_device *dev; + canid_t tx_id = addr->can_addr.tp.tx_id; + canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; - int do_rx_reg = 1; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; - if (addr->can_addr.tp.tx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG)) - return -EADDRNOTAVAIL; + /* sanitize tx CAN identifier */ + if (tx_id & CAN_EFF_FLAG) + tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); + else + tx_id &= CAN_SFF_MASK; + + /* give feedback on wrong CAN-ID value */ + if (tx_id != addr->can_addr.tp.tx_id) + return -EINVAL; + + /* sanitize rx CAN identifier (if needed) */ + if (isotp_register_rxid(so)) { + if (rx_id & CAN_EFF_FLAG) + rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); + else + rx_id &= CAN_SFF_MASK; + + /* give feedback on wrong CAN-ID value */ + if (rx_id != addr->can_addr.tp.rx_id) + return -EINVAL; + } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); - /* do not register frame reception for functional addressing */ - if (so->opt.flags & CAN_ISOTP_SF_BROADCAST) - do_rx_reg = 0; - - /* do not validate rx address for functional addressing */ - if (do_rx_reg) { - if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) { - err = -EADDRNOTAVAIL; - goto out; - } - - if (addr->can_addr.tp.rx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG)) { - err = -EADDRNOTAVAIL; - goto out; - } + if (so->bound) { + err = -EINVAL; + goto out; } - if (so->bound && addr->can_ifindex == so->ifindex && - addr->can_addr.tp.rx_id == so->rxid && - addr->can_addr.tp.tx_id == so->txid) + /* ensure different CAN IDs when the rx_id is to be registered */ + if (isotp_register_rxid(so) && rx_id == tx_id) { + err = -EADDRNOTAVAIL; goto out; + } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { @@ -1147,30 +1278,25 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; - if (do_rx_reg) - can_rx_register(net, dev, addr->can_addr.tp.rx_id, - SINGLE_MASK(addr->can_addr.tp.rx_id), + if (isotp_register_rxid(so)) + can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); - dev_put(dev); + if (isotp_register_txecho(so)) { + /* no consecutive frame echo skb in flight */ + so->cfecho = 0; - if (so->bound && do_rx_reg) { - /* unregister old filter */ - if (so->ifindex) { - dev = dev_get_by_index(net, so->ifindex); - if (dev) { - can_rx_unregister(net, dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); - dev_put(dev); - } - } + /* 
register for echo skb's */ + can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), + isotp_rcv_echo, sk, "isotpe", sk); } + dev_put(dev); + /* switch to new settings */ so->ifindex = ifindex; - so->rxid = addr->can_addr.tp.rx_id; - so->txid = addr->can_addr.tp.tx_id; + so->rxid = rx_id; + so->txid = tx_id; so->bound = 1; out: @@ -1224,6 +1350,23 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + + /* these broadcast flags are not allowed together */ + if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { + /* CAN_ISOTP_SF_BROADCAST is prioritized */ + so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; + + /* give user feedback on wrong config attempt */ + ret = -EINVAL; + } + + /* check for frame_txtime changes (0 => no changes) */ + if (so->opt.frame_txtime) { + if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) + so->frame_txtime = 0; + else + so->frame_txtime = so->opt.frame_txtime; + } break; case CAN_ISOTP_RECV_FC: @@ -1367,10 +1510,16 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) - can_rx_unregister(dev_net(dev), dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (so->bound && isotp_register_txecho(so)) { + if (isotp_register_rxid(so)) + can_rx_unregister(dev_net(dev), dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + + can_rx_unregister(dev_net(dev), dev, so->txid, + SINGLE_MASK(so->txid), + isotp_rcv_echo, sk); + } so->ifindex = 0; so->bound = 0; @@ -1425,6 +1574,7 @@ static int isotp_init(struct sock *sk) so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; @@ -1444,6 +1594,7 @@ static int isotp_init(struct sock *sk) so->txtimer.function = isotp_tx_timer_handler; init_waitqueue_head(&so->wait); + spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 6dff4510687a..f5ecfdcf57b2 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -802,7 +802,7 @@ static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); - skb = skb_recv_datagram(sk, flags, 0, &ret); + skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; @@ -841,7 +841,7 @@ static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, paddr->can_addr.j1939.pgn = skcb->addr.pgn; } - sock_recv_ts_and_drops(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = 
atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/can/proc.c b/net/can/proc.c index b3099f0a3cb8..bbce97825f13 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -305,7 +305,7 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)PDE_DATA(m->file->f_inode); + int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; diff --git a/net/can/raw.c b/net/can/raw.c index 7105fa4824e4..d1bd9cc51ebe 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -772,6 +772,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; int ifindex; @@ -817,11 +818,18 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - skb_setup_tx_timestamp(skb, sk->sk_tsflags); + sockcm_init(&sockc, sk); + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto free_skb; + } skb->dev = dev; - skb->sk = sk; skb->priority = sk->sk_priority; + skb->tstamp = sockc.transmit_time; + + skb_setup_tx_timestamp(skb, sockc.tsflags); err = can_send(skb, ro->loopback); @@ -846,16 +854,12 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct sock *sk = sock->sk; struct sk_buff *skb; int err = 0; - int noblock; - - noblock = flags & MSG_DONTWAIT; - flags &= ~MSG_DONTWAIT; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE); - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; @@ -870,7 +874,7 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return err; } - sock_recv_ts_and_drops(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(RAW_MIN_NAMELEN); diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 5622763ad402..7e51f128045d 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -7,7 +7,7 @@ #include <linux/ceph/buffer.h> #include <linux/ceph/decode.h> -#include <linux/ceph/libceph.h> /* for ceph_kvmalloc */ +#include <linux/ceph/libceph.h> /* for kvmalloc */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = ceph_kvmalloc(len, gfp); + b->vec.iov_base = kvmalloc(len, gfp); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..4c6441536d55 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,41 +190,14 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); -/* - * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are - * compatible with (a superset of) GFP_KERNEL. This is because while the - * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. - * - * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. 
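The wrapper removed below existed because kvmalloc()'s vmalloc fallback used to require flags compatible with GFP_KERNEL, so NOFS/NOIO callers had to go through the memalloc scope API; vmalloc has since learned GFP_NOFS/GFP_NOIO, which is why callers elsewhere in this diff (e.g. alloc_conn_buf()) now call kvmalloc(len, GFP_NOIO) directly. For reference, the scope idiom the wrapper encapsulated:

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Sketch: force NOFS semantics around an allocation whose internals
 * (page-table pages etc.) always use GFP_KERNEL.
 */
static void *ex_kvmalloc_nofs(size_t size)
{
	unsigned int nofs_flag = memalloc_nofs_save();
	void *p = kvmalloc(size, GFP_KERNEL);

	memalloc_nofs_restore(nofs_flag);
	return p;
}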
- */ -void *ceph_kvmalloc(size_t size, gfp_t flags) -{ - void *p; - - if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { - p = kvmalloc(size, flags); - } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { - unsigned int nofs_flag = memalloc_nofs_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - } else { - unsigned int noio_flag = memalloc_noio_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_noio_restore(noio_flag); - } - - return p; -} - -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; char tmp[3]; int err = -EINVAL; int d; - dout("parse_fsid '%s'\n", str); + dout("%s '%s'\n", __func__, str); tmp[2] = 0; while (*str && i < 16) { if (ispunct(*str)) { @@ -244,9 +217,10 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid %pU\n", err, fsid); + dout("%s ret %d got fsid %pU\n", __func__, err, fsid); return err; } +EXPORT_SYMBOL(ceph_parse_fsid); /* * ceph options @@ -272,6 +246,7 @@ enum { Opt_cephx_sign_messages, Opt_tcp_nodelay, Opt_abort_on_full, + Opt_rxbounce, }; enum { @@ -321,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_flag ("rxbounce", Opt_rxbounce), fsparam_enum ("ms_mode", Opt_ms_mode, ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), @@ -422,14 +398,14 @@ out: } int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l) + struct fc_log *l, char delim) { struct p_log log = {.prefix = "libceph", .log = l}; int ret; - /* ip1[:port1][,ip2[:port2]...] */ + /* ip1[:port1][<delim>ip2[:port2]...] 
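ceph_parse_ips() and ceph_parse_mon_ips() (above) now take the list delimiter from the caller instead of hardcoding ','; every caller updated in this hunk keeps passing ','. A hedged usage sketch for a caller whose input is '/'-separated; the delimiter choice here is purely illustrative:

#include <linux/string.h>
#include <linux/ceph/libceph.h>

static int ex_set_mons(struct ceph_options *opt, struct fc_log *l,
		       const char *buf)
{
	/* e.g. "1.2.3.4:6789/5.6.7.8:6789" */
	return ceph_parse_mon_ips(buf, strlen(buf), opt, l, '/');
}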
*/ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, - &opt->num_mon); + &opt->num_mon, delim); if (ret) { error_plog(&log, "Failed to parse monitor IPs: %d", ret); return ret; @@ -455,8 +431,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_ip: err = ceph_parse_ips(param->string, param->string + param->size, - &opt->my_addr, - 1, NULL); + &opt->my_addr, 1, NULL, ','); if (err) { error_plog(&log, "Failed to parse ip: %d", err); return err; @@ -465,7 +440,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, break; case Opt_fsid: - err = parse_fsid(param->string, &opt->fsid); + err = ceph_parse_fsid(param->string, &opt->fsid); if (err) { error_plog(&log, "Failed to parse fsid: %d", err); return err; @@ -611,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_abort_on_full: opt->flags |= CEPH_OPT_ABORT_ON_FULL; break; + case Opt_rxbounce: + opt->flags |= CEPH_OPT_RXBOUNCE; + break; default: BUG(); @@ -687,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, seq_puts(m, "notcp_nodelay,"); if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) seq_puts(m, "abort_on_full,"); + if (opt->flags & CEPH_OPT_RXBOUNCE) + seq_puts(m, "rxbounce,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 7057f8db4f99..1daf95e17d67 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -906,7 +906,6 @@ int crush_do_rule(const struct crush_map *map, int recurse_to_leaf; int wsize = 0; int osize; - int *tmp; const struct crush_rule *rule; __u32 step; int i, j; @@ -1073,9 +1072,7 @@ int crush_do_rule(const struct crush_map *map, memcpy(o, c, osize*sizeof(*o)); /* swap o and w arrays */ - tmp = o; - o = w; - w = tmp; + swap(o, w); wsize = osize; break; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 92d89b331645..051d22c0e4ad 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -147,7 +147,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* - * Should be used for buffers allocated with ceph_kvmalloc(). + * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). 
* diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..d3bb656308b4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + if (con->bounce_page) { + __free_page(con->bounce_page); + con->bounce_page = NULL; + } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); @@ -1267,30 +1271,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen, */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count) + int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { + char cur_delim = delim; const char *ipend; int port; - char delim = ','; if (*p == '[') { - delim = ']'; + cur_delim = ']'; p++; } - ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, + &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; - if (delim == ']') { + if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; @@ -1326,11 +1331,11 @@ int ceph_parse_ips(const char *c, const char *end, addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; - dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); + dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; - if (*p != ',') + if (*p != delim) goto bad; p++; } @@ -1920,7 +1925,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, /* front */ if (front_len) { - m->front.iov_base = ceph_kvmalloc(front_len, flags); + m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 2cb5ffdf071a..6b014eca3a13 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -992,8 +992,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; @@ -1001,9 +1000,6 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - if (!msg->num_data_items) - return -EIO; - if (do_datacrc) crc = con->in_data_crc; while (cursor->total_resid) { @@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con) return 1; /* must return > 0 to indicate success */ } +static int read_partial_msg_data_bounce(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + struct page *page; + size_t off, len; + u32 crc; + int ret; + + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &off, &len, NULL); + ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); + if (ret <= 0) { + con->in_data_crc = crc; + return ret; + } + + crc = crc32c(crc, 
page_address(con->bounce_page), ret); + memcpy_to_page(page, off, page_address(con->bounce_page), ret); + + ceph_msg_data_advance(cursor, ret); + } + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + /* * read (part of) a message. */ @@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con) /* (page) data */ if (data_len) { - ret = read_partial_msg_data(con); + if (!m->num_data_items) + return -EIO; + + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + ret = read_partial_msg_data_bounce(con); + else + ret = read_partial_msg_data(con); if (ret <= 0) return ret; } diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index cc40ce4e02fb..c6e5bfc717d5 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -57,8 +57,9 @@ #define IN_S_HANDLE_CONTROL_REMAINDER 3 #define IN_S_PREPARE_READ_DATA 4 #define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_HANDLE_EPILOGUE 6 -#define IN_S_FINISH_SKIP 7 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_HANDLE_EPILOGUE 7 +#define IN_S_FINISH_SKIP 8 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -308,7 +309,7 @@ static void *alloc_conn_buf(struct ceph_connection *con, int len) if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) return NULL; - buf = ceph_kvmalloc(len, GFP_NOIO); + buf = kvmalloc(len, GFP_NOIO); if (!buf) return NULL; @@ -1032,22 +1033,41 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } -static int decrypt_message(struct ceph_connection *con) +static int decrypt_tail(struct ceph_connection *con) { + struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + int tail_len; int ret; + tail_len = tail_onwire_len(con->in_msg, true); + ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages, + con->v2.in_enc_page_cnt, 0, tail_len, + GFP_NOIO); + if (ret) + goto out; + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), con->v2.in_buf, true); if (ret) goto out; - ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, - tail_onwire_len(con->in_msg, true)); + dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con, + con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents); + ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len); + if (ret) + goto out; + + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; out: sg_free_table(&sgt); + sg_free_table(&enc_sgt); return ret; } @@ -1733,54 +1753,153 @@ static int prepare_read_control_remainder(struct ceph_connection *con) return 0; } -static void prepare_read_data(struct ceph_connection *con) +static int prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = -1; + con->in_data_crc = -1; ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, data_len(con->in_msg)); get_bvec_at(&con->v2.in_cursor, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } set_in_bvec(con, &bv); con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; + return 0; } static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - if 
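decrypt_tail() above now decrypts from the ciphertext page vector (enc_sgt) into the message buffers (sgt) instead of in place. Handing a page vector to the crypto layer means wrapping it in a scatter-gather table first; a minimal sketch of that pairing (assumes the pages cover at least @len bytes starting at offset 0):

#include <linux/scatterlist.h>

static int pages_to_sgt(struct sg_table *sgt, struct page **pages,
			unsigned int npages, unsigned long len)
{
	return sg_alloc_table_from_pages(sgt, pages, npages, 0, len, GFP_NOIO);
}

/* Tear-down frees only the scatterlist; the pages stay owned by the
 * caller (in the hunks above, released via ceph_release_page_vector()). */
static void put_sgt(struct sg_table *sgt)
{
	sg_free_table(sgt);
}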
(!con_secure(con)) + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + + get_bvec_at(&con->v2.in_cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { con->in_data_crc = ceph_crc32c_page(con->in_data_crc, con->v2.in_bvec.bv_page, con->v2.in_bvec.bv_offset, con->v2.in_bvec.bv_len); + } ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { get_bvec_at(&con->v2.in_cursor, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } set_in_bvec(con, &bv); WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); return; } /* - * We've read all data. Prepare to read data padding (if any) - * and epilogue. + * We've read all data. Prepare to read epilogue. */ reset_in_kvecs(con); - if (con_secure(con)) { - if (need_padding(data_len(con->in_msg))) - add_in_kvec(con, DATA_PAD(con->v2.in_buf), - padding_len(data_len(con->in_msg))); - add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static int prepare_read_tail_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + if (!front_len(msg) && !middle_len(msg)) { + WARN_ON(!data_len(msg)); + return prepare_read_data(con); + } + + reset_in_kvecs(con); + if (front_len(msg)) { + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + WARN_ON(msg->front.iov_len != front_len(msg)); + } + if (middle_len(msg)) { + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + WARN_ON(msg->middle->vec.iov_len != middle_len(msg)); + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; } + return 0; +} + +static void prepare_read_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i, + con->v2.in_enc_resid); + WARN_ON(!con->v2.in_enc_resid); + + bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE); + + set_in_bvec(con, &bv); + con->v2.in_enc_i++; + con->v2.in_enc_resid -= bv.bv_len; + + if (con->v2.in_enc_resid) { + con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE; + return; + } + + /* + * We are set to read the last piece of ciphertext (ending + * with epilogue) + auth tag. 
+ */ + WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_read_tail_secure(struct ceph_connection *con) +{ + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + + tail_len = tail_onwire_len(con->in_msg, true); + WARN_ON(!tail_len); + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) + return PTR_ERR(enc_pages); + + WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = enc_pages; + con->v2.in_enc_page_cnt = enc_page_cnt; + con->v2.in_enc_resid = tail_len; + con->v2.in_enc_i = 0; + + prepare_read_enc_page(con); + return 0; +} + static void __finish_skip(struct ceph_connection *con) { con->in_seq++; @@ -2589,47 +2708,26 @@ static int __handle_control(struct ceph_connection *con, void *p) } msg = con->in_msg; /* set in process_message_header() */ - if (!front_len(msg) && !middle_len(msg)) { - if (!data_len(msg)) - return process_message(con); - - prepare_read_data(con); - return 0; - } - - reset_in_kvecs(con); if (front_len(msg)) { WARN_ON(front_len(msg) > msg->front_alloc_len); - add_in_kvec(con, msg->front.iov_base, front_len(msg)); msg->front.iov_len = front_len(msg); - - if (con_secure(con) && need_padding(front_len(msg))) - add_in_kvec(con, FRONT_PAD(con->v2.in_buf), - padding_len(front_len(msg))); } else { msg->front.iov_len = 0; } if (middle_len(msg)) { WARN_ON(middle_len(msg) > msg->middle->alloc_len); - add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); msg->middle->vec.iov_len = middle_len(msg); - - if (con_secure(con) && need_padding(middle_len(msg))) - add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), - padding_len(middle_len(msg))); } else if (msg->middle) { msg->middle->vec.iov_len = 0; } - if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; - } else { - add_in_kvec(con, con->v2.in_buf, - con_secure(con) ? 
CEPH_EPILOGUE_SECURE_LEN : - CEPH_EPILOGUE_PLAIN_LEN); - con->v2.in_state = IN_S_HANDLE_EPILOGUE; - } - return 0; + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return process_message(con); + + if (con_secure(con)) + return prepare_read_tail_secure(con); + + return prepare_read_tail_plain(con); } static int handle_preamble(struct ceph_connection *con) @@ -2717,7 +2815,7 @@ static int handle_epilogue(struct ceph_connection *con) int ret; if (con_secure(con)) { - ret = decrypt_message(con); + ret = decrypt_tail(con); if (ret) { if (ret == -EBADMSG) con->error_msg = "integrity error, bad epilogue auth tag"; @@ -2785,13 +2883,16 @@ static int populate_in_iter(struct ceph_connection *con) ret = handle_control_remainder(con); break; case IN_S_PREPARE_READ_DATA: - prepare_read_data(con); - ret = 0; + ret = prepare_read_data(con); break; case IN_S_PREPARE_READ_DATA_CONT: prepare_read_data_cont(con); ret = 0; break; + case IN_S_PREPARE_READ_ENC_PAGE: + prepare_read_enc_page(con); + ret = 0; + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3326,20 +3427,16 @@ void ceph_con_v2_revoke(struct ceph_connection *con) static void revoke_at_prepare_read_data(struct ceph_connection *con) { - int remaining; /* data + [data padding] + epilogue */ + int remaining; int resid; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); - if (con_secure(con)) - remaining = padded_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; - + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p resid %d remaining %d\n", __func__, con, resid, remaining); con->v2.in_iter.count -= resid; @@ -3350,8 +3447,9 @@ static void revoke_at_prepare_read_data(struct ceph_connection *con) static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) { int recved, resid; /* current piece of data */ - int remaining; /* [data padding] + epilogue */ + int remaining; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); @@ -3363,12 +3461,7 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) ceph_msg_data_advance(&con->v2.in_cursor, recved); WARN_ON(resid > con->v2.in_cursor.total_resid); - if (con_secure(con)) - remaining = padding_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = CEPH_EPILOGUE_PLAIN_LEN; - + remaining = CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p total_resid %zu remaining %d\n", __func__, con, con->v2.in_cursor.total_resid, remaining); con->v2.in_iter.count -= resid; @@ -3376,11 +3469,26 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) +{ + int resid; /* current enc page (not necessarily data) */ + + WARN_ON(!con_secure(con)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + + dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid, + con->v2.in_enc_resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + con->v2.in_enc_resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; - 
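For reference, prepare_read_tail_secure() (above) sizes the ciphertext vector with calc_pages_for() and prepare_read_enc_page() then consumes it one page-sized bio_vec at a time via in_enc_i/in_enc_resid. The arithmetic, restated as a hedged sketch rather than the kernel's exact helpers:

/* Pages spanned by @len bytes starting at offset @off within the first
 * page -- the same round-up calc_pages_for() performs. */
static inline unsigned int pages_spanned(unsigned int off, unsigned int len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - (off >> PAGE_SHIFT);
}

/* Walking the vector in PAGE_SIZE steps, as the read state machine does. */
static void walk_enc_pages(struct page **pages, int tail_len)
{
	int i = 0, resid = tail_len;

	while (resid) {
		int step = min(resid, (int)PAGE_SIZE);

		/* receive @step bytes into pages[i] at offset 0 */
		i++;
		resid -= step;
	}
}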
WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); @@ -3399,6 +3507,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_DATA_CONT: revoke_at_prepare_read_data_cont(con); break; + case IN_S_PREPARE_READ_ENC_PAGE: + revoke_at_prepare_read_enc_page(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; @@ -3432,6 +3543,13 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) clear_out_sign_kvecs(con); free_conn_bufs(con); + if (con->v2.in_enc_pages) { + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; + } if (con->v2.out_enc_pages) { WARN_ON(!con->v2.out_enc_page_cnt); ceph_release_page_vector(con->v2.out_enc_pages, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1c5815530e0d..9d82bb42e958 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -537,43 +537,6 @@ static void request_init(struct ceph_osd_request *req) target_init(&req->r_t); } -/* - * This is ugly, but it allows us to reuse linger registration and ping - * requests, keeping the structure of the code around send_linger{_ping}() - * reasonable. Setting up a min_nr=2 mempool for each linger request - * and dealing with copying ops (this blasts req only, watch op remains - * intact) isn't any better. - */ -static void request_reinit(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - bool mempool = req->r_mempool; - unsigned int num_ops = req->r_num_ops; - u64 snapid = req->r_snapid; - struct ceph_snap_context *snapc = req->r_snapc; - bool linger = req->r_linger; - struct ceph_msg *request_msg = req->r_request; - struct ceph_msg *reply_msg = req->r_reply; - - dout("%s req %p\n", __func__, req); - WARN_ON(kref_read(&req->r_kref) != 1); - request_release_checks(req); - - WARN_ON(kref_read(&request_msg->kref) != 1); - WARN_ON(kref_read(&reply_msg->kref) != 1); - target_destroy(&req->r_t); - - request_init(req); - req->r_osdc = osdc; - req->r_mempool = mempool; - req->r_num_ops = num_ops; - req->r_snapid = snapid; - req->r_snapc = snapc; - req->r_linger = linger; - req->r_request = request_msg; - req->r_reply = reply_msg; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -918,14 +881,30 @@ EXPORT_SYMBOL(osd_req_op_xattr_init); * @watch_opcode: CEPH_OSD_WATCH_OP_* */ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, - u64 cookie, u8 watch_opcode) + u8 watch_opcode, u64 cookie, u32 gen) { struct ceph_osd_req_op *op; op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; - op->watch.gen = 0; + op->watch.gen = gen; +} + +/* + * prot_ver, timeout and notify payload (may be empty) should already be + * encoded in @request_pl + */ +static void osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, struct ceph_pagelist *request_pl) +{ + struct ceph_osd_req_op *op; + + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; + + ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl); + op->indata_len = request_pl->length; } /* @@ -2385,7 +2364,11 @@ again: if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { - pr_warn_ratelimited("FULL or reached pool quota\n"); + if 
(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) + pr_warn_ratelimited("cluster is full (osdmap FULL)\n"); + else + pr_warn_ratelimited("pool %lld is full or reached quota\n", + req->r_t.base_oloc.pool); req->r_t.paused = true; maybe_request_map(osdc); } @@ -2727,10 +2710,13 @@ static void linger_release(struct kref *kref) WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); - if (lreq->reg_req) - ceph_osdc_put_request(lreq->reg_req); - if (lreq->ping_req) - ceph_osdc_put_request(lreq->ping_req); + if (lreq->request_pl) + ceph_pagelist_release(lreq->request_pl); + if (lreq->notify_id_pages) + ceph_release_page_vector(lreq->notify_id_pages, 1); + + ceph_osdc_put_request(lreq->reg_req); + ceph_osdc_put_request(lreq->ping_req); target_destroy(&lreq->t); kfree(lreq); } @@ -2999,6 +2985,12 @@ static void linger_commit_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); linger_reg_commit_complete(lreq, req->r_result); @@ -3022,6 +3014,7 @@ static void linger_commit_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3044,6 +3037,12 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->last_error); if (req->r_result < 0) { @@ -3053,46 +3052,64 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } static void send_linger(struct ceph_osd_linger_request *lreq) { - struct ceph_osd_request *req = lreq->reg_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req; + int ret; - verify_osdc_wrlocked(req->r_osdc); + verify_osdc_wrlocked(osdc); + mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->reg_req) { + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + ceph_osdc_put_request(lreq->reg_req); + } + + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); - request_reinit(req); target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; - mutex_lock(&lreq->lock); if (lreq->is_watch && lreq->committed) { - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id); - op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; - op->watch.gen = ++lreq->register_gen; + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT, + lreq->linger_id, ++lreq->register_gen); dout("lreq %p reconnect register_gen %u\n", lreq, - op->watch.gen); + req->r_ops[0].watch.gen); req->r_callback = linger_reconnect_cb; } else { - if (!lreq->is_watch) + if (lreq->is_watch) { + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH, + lreq->linger_id, 0); + } else { lreq->notify_id = 0; - else - WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + + refcount_inc(&lreq->request_pl->refcnt); + osd_req_op_notify_init(req, 0, 
lreq->linger_id, + lreq->request_pl); + ceph_osd_data_pages_init( + osd_req_op_data(req, 0, notify, response_data), + lreq->notify_id_pages, PAGE_SIZE, 0, false, false); + } dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } - mutex_unlock(&lreq->lock); + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->reg_req = req; + mutex_unlock(&lreq->lock); submit_request(req, true); } @@ -3102,6 +3119,12 @@ static void linger_ping_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->ping_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->ping_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); @@ -3117,6 +3140,7 @@ static void linger_ping_cb(struct ceph_osd_request *req) lreq->register_gen, req->r_ops[0].watch.gen); } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3124,8 +3148,8 @@ static void linger_ping_cb(struct ceph_osd_request *req) static void send_linger_ping(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; - struct ceph_osd_request *req = lreq->ping_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_request *req; + int ret; if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); @@ -3137,19 +3161,26 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) __func__, lreq, lreq->linger_id, lreq->ping_sent, lreq->register_gen); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->ping_req) { + if (lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + ceph_osdc_put_request(lreq->ping_req); + } - request_reinit(req); - target_copy(&req->r_t, &lreq->t); + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id || - op->watch.op != CEPH_OSD_WATCH_OP_PING); - op->watch.gen = lreq->register_gen; + target_copy(&req->r_t, &lreq->t); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id, + lreq->register_gen); req->r_callback = linger_ping_cb; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); + req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->ping_req = req; ceph_osdc_get_request(req); account_request(req); @@ -3165,12 +3196,6 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) down_write(&osdc->lock); linger_register(lreq); - if (lreq->is_watch) { - lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; - lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; - } else { - lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; - } calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); @@ -3202,9 +3227,9 @@ static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - if (lreq->is_watch && lreq->ping_req->r_osd) + if (lreq->ping_req && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); - if (lreq->reg_req->r_osd) + if (lreq->reg_req && lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); @@ -4566,8 +4591,13 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, 
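With request_reinit() gone, send_linger() and send_linger_ping() above allocate a fresh request for every (re)send, so a completion can now race with its own replacement. That is why linger_commit_cb(), linger_reconnect_cb() and linger_ping_cb() all gained the same guard: validate the request against the current one under the linger mutex and ignore it if superseded. The pattern, reduced to a sketch:

static void example_linger_cb(struct ceph_osd_request *req)
{
	struct ceph_osd_linger_request *lreq = req->r_priv;

	mutex_lock(&lreq->lock);
	if (req != lreq->reg_req)	/* a newer request replaced this one */
		goto out;

	/* ... act on req->r_result for the current registration ... */
out:
	mutex_unlock(&lreq->lock);
}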
EXPORT_SYMBOL(ceph_osdc_start_request); /* - * Unregister a registered request. The request is not completed: - * ->r_result isn't set and __complete_request() isn't called. + * Unregister request. If @req was registered, it isn't completed: + * r_result isn't set and __complete_request() isn't invoked. + * + * If @req wasn't registered, this call may have raced with + * handle_reply(), in which case r_result would already be set and + * __complete_request() would be getting invoked, possibly even + * concurrently with this call. */ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { @@ -4653,43 +4683,6 @@ again: } EXPORT_SYMBOL(ceph_osdc_sync); -static struct ceph_osd_request * -alloc_linger_request(struct ceph_osd_linger_request *lreq) -{ - struct ceph_osd_request *req; - - req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); - if (!req) - return NULL; - - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - return req; -} - -static struct ceph_osd_request * -alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) -{ - struct ceph_osd_request *req; - - req = alloc_linger_request(lreq); - if (!req) - return NULL; - - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - osd_req_op_watch_init(req, 0, 0, watch_opcode); - - if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { - ceph_osdc_put_request(req); - return NULL; - } - - return req; -} - /* * Returns a handle, caller owns a ref. */ @@ -4719,18 +4712,6 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); - lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); - if (!lreq->reg_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - - lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); - if (!lreq->ping_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (ret) { @@ -4768,8 +4749,8 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&req->r_mtime); - osd_req_op_watch_init(req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_UNWATCH); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH, + lreq->linger_id, 0); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) @@ -4855,35 +4836,6 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_notify_ack); -static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, - u64 cookie, u32 prot_ver, u32 timeout, - void *payload, u32 payload_len) -{ - struct ceph_osd_req_op *op; - struct ceph_pagelist *pl; - int ret; - - op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); - op->notify.cookie = cookie; - - pl = ceph_pagelist_alloc(GFP_NOIO); - if (!pl) - return -ENOMEM; - - ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ - ret |= ceph_pagelist_encode_32(pl, timeout); - ret |= ceph_pagelist_encode_32(pl, payload_len); - ret |= ceph_pagelist_append(pl, payload, payload_len); - if (ret) { - ceph_pagelist_release(pl); - return -ENOMEM; - } - - ceph_osd_data_pagelist_init(&op->notify.request_data, pl); - op->indata_len = pl->length; - return 0; -} - /* * @timeout: in seconds * @@ -4902,7 +4854,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, size_t *preply_len) { struct ceph_osd_linger_request *lreq; - struct page **pages; int ret; WARN_ON(!timeout); @@ -4915,41 +4866,35 @@ int 
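The notify payload formerly rebuilt per request by osd_req_op_notify_init() (removed above) is now encoded once into lreq->request_pl and shared across resends by taking a pagelist refcount. The wire layout is three little-endian 32-bit fields followed by the payload bytes; a sketch of the encoding step, mirroring what ceph_osdc_notify() does below:

/* Layout: __le32 prot_ver (always 1), __le32 timeout (seconds),
 * __le32 payload_len, u8 payload[payload_len]. */
static int encode_notify_payload(struct ceph_pagelist *pl, u32 timeout,
				 void *payload, u32 payload_len)
{
	int ret;

	ret = ceph_pagelist_encode_32(pl, 1);		/* prot_ver */
	ret |= ceph_pagelist_encode_32(pl, timeout);
	ret |= ceph_pagelist_encode_32(pl, payload_len);
	ret |= ceph_pagelist_append(pl, payload, payload_len);
	return ret ? -ENOMEM : 0;
}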
ceph_osdc_notify(struct ceph_osd_client *osdc, if (!lreq) return -ENOMEM; - lreq->preply_pages = preply_pages; - lreq->preply_len = preply_len; - - ceph_oid_copy(&lreq->t.base_oid, oid); - ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_READ; - - lreq->reg_req = alloc_linger_request(lreq); - if (!lreq->reg_req) { + lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO); + if (!lreq->request_pl) { ret = -ENOMEM; goto out_put_lreq; } - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, - payload, payload_len); - if (ret) + ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout); + ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len); + ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len); + if (ret) { + ret = -ENOMEM; goto out_put_lreq; + } /* for notify_id */ - pages = ceph_alloc_page_vector(1, GFP_NOIO); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); + lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(lreq->notify_id_pages)) { + ret = PTR_ERR(lreq->notify_id_pages); + lreq->notify_id_pages = NULL; goto out_put_lreq; } - ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, - response_data), - pages, PAGE_SIZE, 0, false, true); - ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); - if (ret) - goto out_put_lreq; + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; linger_submit(lreq); ret = linger_reg_commit_wait(lreq); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 75b738083523..2823bb3cff55 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -980,7 +980,7 @@ static struct crush_work *alloc_workspace(const struct crush_map *c) work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - work = ceph_kvmalloc(work_size, GFP_NOIO); + work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; @@ -1190,9 +1190,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (max == map->max_osd) return 0; - state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); - weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); - addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); @@ -1222,7 +1222,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; @@ -1503,7 +1503,7 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = ceph_kvmalloc( + map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d9c37fd10809..a25ec93729b9 100644 --- a/net/core/bpf_sk_storage.c +++ 
b/net/core/bpf_sk_storage.c @@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -75,8 +75,8 @@ void bpf_sk_storage_free(struct sock *sk) * sk_storage. */ bpf_selem_unlink_map(selem); - free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage, - selem, true); + free_sk_storage = bpf_selem_unlink_storage_nolock( + sk_storage, selem, true, false); } raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -141,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags); + map_flags, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -172,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -230,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = bpf_local_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -255,8 +255,9 @@ out: return ret; } -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -277,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. 
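The BPF_CALL_4 to BPF_CALL_5 conversions in this file exist to thread an allocation context through the storage helpers: sleepable BPF programs may allocate with GFP_KERNEL, while tracing callers must stay GFP_ATOMIC. The shape of the plumbing, with purely illustrative names:

struct example_elem {
	struct example_elem *next;
	u8 data[];
};

/* Pass the caller's gfp_t down to the allocation instead of hard-coding
 * GFP_ATOMIC at the bottom of the call chain. */
static struct example_elem *elem_alloc(size_t data_len, gfp_t gfp)
{
	return kzalloc(sizeof(struct example_elem) + data_len, gfp);
}

static struct example_elem *storage_update(size_t data_len, bool sleepable)
{
	return elem_alloc(data_len, sleepable ? GFP_KERNEL : GFP_ATOMIC);
}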
*/ @@ -337,7 +338,7 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -static int sk_storage_map_btf_id; +BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -348,8 +349,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &sk_storage_map_btf_id, + .map_btf_id = &sk_storage_map_btf_ids[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, @@ -405,6 +405,8 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: btf_vmlinux = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf_vmlinux)) + return false; btf_id = prog->aux->attach_btf_id; t = btf_type_by_id(btf_vmlinux, btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); @@ -417,14 +419,16 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return false; } -BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; - return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, + gfp_flags); } BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, diff --git a/net/core/datagram.c b/net/core/datagram.c index ee290776c661..50f4faeea76c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -62,8 +62,6 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> -#include "datagram.h" - /* * Is a socket 'connection oriented' ? */ @@ -310,12 +308,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, - int noblock, int *err) + int *err) { int off = 0; - return __skb_recv_datagram(sk, &sk->sk_receive_queue, - flags | (noblock ? 
MSG_DONTWAIT : 0), + return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/datagram.h b/net/core/datagram.h deleted file mode 100644 index bcfb75bfa3b2..000000000000 --- a/net/core/datagram.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _NET_CORE_DATAGRAM_H_ -#define _NET_CORE_DATAGRAM_H_ - -#include <linux/types.h> - -struct sock; -struct sk_buff; -struct iov_iter; - -int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); - -#endif /* _NET_CORE_DATAGRAM_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index 84a0d9542fe9..30a1603a7225 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -151,6 +151,7 @@ #include <linux/prandom.h> #include <linux/once_lite.h> +#include "dev.h" #include "net-sysfs.h" @@ -216,18 +217,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); } -static inline void rps_unlock(struct softnet_data *sd) +static inline void rps_lock_irq_disable(struct softnet_data *sd) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); +} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, @@ -320,7 +341,6 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return 0; } -EXPORT_SYMBOL(netdev_name_node_alt_create); static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { @@ -348,7 +368,6 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) return 0; } -EXPORT_SYMBOL(netdev_name_node_alt_destroy); static void netdev_name_node_alt_flush(struct net_device *dev) { @@ -378,16 +397,18 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev) +static void unlist_netdevice(struct net_device *dev, bool lock) { ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); + if (lock) + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); + if (lock) + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -663,11 +684,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, 
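The skb_recv_datagram() change above drops the separate noblock argument; callers are expected to fold it into flags as MSG_DONTWAIT. A hypothetical caller-side conversion:

/* Hypothetical protocol recvmsg path adapting to the new signature. */
static struct sk_buff *example_recv(struct sock *sk, unsigned int flags,
				    bool noblock, int *err)
{
	/* was: skb_recv_datagram(sk, flags, noblock, err) */
	return skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), err);
}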
- .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; @@ -683,6 +704,10 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } + + if (!ctx.dev) + return ret; + path = dev_fwd_path(stack); if (!path) return -1; @@ -1037,7 +1062,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; @@ -1047,7 +1072,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); @@ -1602,7 +1627,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - N(PRE_CHANGEADDR) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1919,6 +1945,32 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for @val_up and roll back on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). 
+ */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) @@ -2000,7 +2052,8 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -2061,14 +2114,15 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; + skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) - __net_timestamp(skb); + skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ + (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) @@ -2941,15 +2995,73 @@ undo_rx: EXPORT_SYMBOL(netif_set_real_num_queues); /** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * - * This routine should set an upper limit on the number of RSS queues - * used by default by multiqueue devices. + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { - return is_kdump_kernel() ? 
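netif_set_tso_max_size() and netif_set_tso_max_segs() above are the new way for drivers to express hardware TSO limits, and netif_inherit_tso_max() is what the VLAN hunks at the top of this diff switched to. Hypothetical usage (the limits are made-up values):

/* Hypothetical driver probe: the NIC can segment super-frames of up to
 * 16 kB into at most 32 segments. */
static void example_set_tso_limits(struct net_device *dev)
{
	netif_set_tso_max_size(dev, 16 * 1024);
	netif_set_tso_max_segs(dev, 32);
}

/* Hypothetical stacked-device setup: inherit the lower device's limits. */
static void example_stack_setup(struct net_device *upper,
				const struct net_device *lower)
{
	netif_inherit_tso_max(upper, lower);
}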
- 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); @@ -3156,12 +3268,18 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; @@ -3468,7 +3586,6 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); len = skb->len; - PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); @@ -3586,7 +3703,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device out_kfree_skb: kfree_skb(skb); out_null: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NULL; } @@ -3710,7 +3827,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, + SKB_DROP_REASON_QDISC_DROP); return rc; } @@ -3765,7 +3883,7 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3811,7 +3929,7 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); - netif_rx_ni(skb); + netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); @@ -3840,7 +3958,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -3860,6 +3978,25 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; } + +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_XPS @@ -4002,35 +4139,30 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, } /** - * __dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * @sb_dev: suboordinate device used for L2 forwarding offload + * 
__dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: subordinate device used for L2 forwarding offload - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. - * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. - * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) - * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) - * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. 
+ * * negative errno - other errors */ -static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; @@ -4058,11 +4190,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; } + + netdev_xmit_skip_txqueue(false); + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while @@ -4073,7 +4211,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) else skb_dst_force(skb); - txq = netdev_core_pick_tx(dev, skb, sb_dev); + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -4108,7 +4248,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; - PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -4136,25 +4275,14 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } - -int dev_queue_xmit(struct sk_buff *skb) -{ - return __dev_queue_xmit(skb, NULL); -} -EXPORT_SYMBOL(dev_queue_xmit); - -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) -{ - return __dev_queue_xmit(skb, sb_dev); -} -EXPORT_SYMBOL(dev_queue_xmit_accel); +EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { @@ -4174,7 +4302,6 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); - PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); local_bh_disable(); @@ -4188,7 +4315,7 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) local_bh_enable(); return ret; drop: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } @@ -4202,6 +4329,7 @@ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; +unsigned int sysctl_skb_defer_max __read_mostly = 64; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; @@ -4217,6 +4345,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, { struct task_struct *thread; + lockdep_assert_irqs_disabled(); + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). 
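Given the Return: section in the rewritten kernel-doc above, a hypothetical caller of dev_queue_xmit() (now a thin wrapper around the exported __dev_queue_xmit()) would treat the three result ranges like this:

/* Hypothetical transmit wrapper; the skb is consumed on every path. */
static int example_xmit(struct sk_buff *skb)
{
	int rc = dev_queue_xmit(skb);

	if (rc < 0)			/* negative errno, e.g. -ENETDOWN */
		return rc;
	if (rc != NET_XMIT_SUCCESS)	/* positive qdisc verdict */
		return -ENOBUFS;	/* do not retry: the skb is gone */
	return 0;
}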
@@ -4451,16 +4581,25 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 * If no, return 0 */ -static int rps_ipi_queued(struct softnet_data *sd) +static int napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4469,6 +4608,7 @@ static int rps_ipi_queued(struct softnet_data *sd) return 1; } #endif /* CONFIG_RPS */ + __napi_schedule_irqoff(&mysd->backlog); return 0; } @@ -4519,15 +4659,15 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); - local_irq_save(flags); - - rps_lock(sd); + rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); @@ -4536,29 +4676,25 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); - } + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + napi_schedule_rps(sd); goto enqueue; } + reason = SKB_DROP_REASON_CPU_BACKLOG; drop: sd->dropped++; - rps_unlock(sd); - - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + dev_core_stats_rx_dropped_inc(skb->dev); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -4727,7 +4863,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, } /* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. + * network taps in order to match in-driver-XDP behavior. This also means + * that XDP packets are able to starve other packets going through a qdisc, + * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX + * queues, so they do not have this starvation issue. 
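A recurring conversion throughout this diff replaces bare kfree_skb() with kfree_skb_reason() and the old atomic_long_inc() device counters with the dev_core_stats_*_inc() helpers, so drops are both counted per device and attributable (for instance via the kfree_skb tracepoint). A hypothetical drop path combining the two:

static int example_drop(struct sk_buff *skb)
{
	dev_core_stats_rx_dropped_inc(skb->dev);
	kfree_skb_reason(skb, SKB_DROP_REASON_CPU_BACKLOG);
	return NET_RX_DROP;
}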
*/ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -4739,7 +4878,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { + if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; @@ -4747,6 +4886,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); + dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } @@ -4778,7 +4918,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -4796,7 +4936,6 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -4806,78 +4945,72 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + +/** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * drivers should use NAPI and GRO. + * This function can be used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. 
* * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ - int netif_rx(struct sk_buff *skb) { + bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; + if (need_bh_off) + local_bh_disable(); trace_netif_rx_entry(skb); - ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); - + if (need_bh_off) + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5001,7 +5134,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -5319,10 +5452,10 @@ check_vlan_id: } else { drop: if (!deliver_exact) - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); else - atomic_long_inc(&skb->dev->rx_nohandler); - kfree_skb(skb); + dev_core_stats_rx_nohandler_inc(skb->dev); + kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ @@ -5650,8 +5783,7 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @@ -5659,8 +5791,7 @@ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @@ -5678,16 +5809,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); return do_flush; #endif @@ -5802,8 +5931,7 @@ static int process_backlog(struct napi_struct *napi, int quota) } - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). 
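With netif_rx_ni() and netif_rx_any_context() removed above, the calling convention reduces to two entry points: __netif_rx() when bottom halves are already disabled, and netif_rx() everywhere else. A hypothetical driver (all my_* names are invented) would use them roughly like this:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

static struct sk_buff *my_dequeue_rx(void *dev_id);	/* hypothetical */

/* Hard-IRQ RX path: BHs are implicitly off here, so the cheaper
 * __netif_rx() variant may be called directly.
 */
static irqreturn_t my_isr(int irq, void *dev_id)
{
	struct sk_buff *skb = my_dequeue_rx(dev_id);

	if (skb)
		__netif_rx(skb);
	return IRQ_HANDLED;
}

/* Process-context injection (e.g. a tunnel-style device): plain
 * netif_rx() now disables BHs around the enqueue itself, covering what
 * the removed netif_rx_ni()/netif_rx_any_context() used to do.
 */
static void my_inject(struct sk_buff *skb)
{
	netif_rx(skb);
}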
@@ -5819,8 +5947,7 @@ static int process_backlog(struct napi_struct *napi, int quota) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); } return work; @@ -6230,8 +6357,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; @@ -6264,7 +6391,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { @@ -6493,6 +6620,28 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6509,9 +6658,11 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) for (;;) { struct napi_struct *n; + skb_defer_free_flush(sd); + if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto end; break; } @@ -6538,6 +6689,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +end:; } struct netdev_adjacent { @@ -7145,6 +7297,16 @@ static int __netdev_update_upper_level(struct net_device *dev, return 0; } +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { @@ -7727,6 +7889,242 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if 
(netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * 
successful stats retrievals would get lost. + */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device @@ -8347,7 +8745,6 @@ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } -EXPORT_SYMBOL(dev_set_group); /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. @@ -8462,7 +8859,6 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } -EXPORT_SYMBOL(dev_change_carrier); /** * dev_get_phys_port_id - Get device physical port ID @@ -8480,7 +8876,6 @@ int dev_get_phys_port_id(struct net_device *dev, return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } -EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_get_phys_port_name - Get device physical port name @@ -8503,7 +8898,6 @@ int dev_get_phys_port_name(struct net_device *dev, } return devlink_compat_phys_port_name_get(dev, name, len); } -EXPORT_SYMBOL(dev_get_phys_port_name); /** * dev_get_port_parent_id - Get the device's port parent identifier @@ -8585,7 +8979,6 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) dev->proto_down = proto_down; return 0; } -EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_proto_down_reason - proto down reason @@ -8610,7 +9003,6 @@ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, } } } -EXPORT_SYMBOL(dev_change_proto_down_reason); struct bpf_xdp_link { struct bpf_link link; @@ -8981,6 +9373,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); @@ -9131,13 +9529,13 @@ static int dev_new_index(struct net *net) } /* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); +LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct 
net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - dev_net(dev)->dev_unreg_count++; + atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -9538,22 +9936,14 @@ void netif_tx_stop_all_queues(struct net_device *dev) EXPORT_SYMBOL(netif_tx_stop_all_queues); /** - * register_netdevice - register a network device - * @dev: device to register + * register_netdevice() - register a network device + * @dev: device to register * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; @@ -9659,11 +10049,11 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) { - dev->reg_state = NETREG_UNREGISTERED; + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) goto err_uninit; - } - dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -9677,8 +10067,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -9807,8 +10199,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -9818,37 +10210,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10; * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int wait = 0, refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 1) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. 
If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); @@ -9863,14 +10260,18 @@ static void netdev_wait_allrefs(struct net_device *dev) wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } - refcnt = netdev_refcnt_read(dev); + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; - if (refcnt != 1 && - time_after(jiffies, warning_time + + if (time_after(jiffies, warning_time + netdev_unregister_timeout_secs * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", - dev->name, refcnt); - ref_tracker_dir_print(&dev->refcnt_tracker, 10); + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } + warning_time = jiffies; } } @@ -9902,6 +10303,7 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -9922,26 +10324,26 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } + write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); + linkwatch_forget_dev(dev); + } - netdev_wait_allrefs(dev); + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); @@ -9957,11 +10359,8 @@ void netdev_run_todo(void) if (dev->needs_free_netdev) free_netdev(dev); - /* Report a network device has been unregistered */ - rtnl_lock(); - dev_net(dev)->dev_unreg_count--; - __rtnl_unlock(); - wake_up(&netdev_unregistering_wq); + if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) + wake_up(&netdev_unregistering_wq); /* Free network device */ kobject_put(&dev->dev.kobj); @@ -9997,6 +10396,21 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + return READ_ONCE(dev->core_stats); +} +EXPORT_SYMBOL(netdev_core_stats_alloc); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -10011,6 +10425,7 @@ struct rtnl_link_stats64 
*dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); @@ -10020,9 +10435,21 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); - storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); - storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler); + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); + } + } return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -10166,7 +10593,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; - dev_hold(dev); + __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif @@ -10179,9 +10606,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP @@ -10284,6 +10713,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; @@ -10387,9 +10818,10 @@ void unregister_netdevice_many(struct list_head *head) list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - unlist_netdevice(dev); - + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -10403,6 +10835,8 @@ void unregister_netdevice_many(struct list_head *head) dev_xdp_uninstall(dev); + netdev_offload_xstats_disable_all(dev); + /* Notify protocols, that we are about to destroy * this device. They should clean all the things. 
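netdev_core_stats_alloc() a little above publishes the per-cpu block with cmpxchg() so that CPUs racing to record the first drop agree on a single allocation and the losers free theirs. The same allocate-once idiom in isolation, with a hypothetical my_pcpu_stats type and my_stats_get() helper:

#include <linux/percpu.h>
#include <linux/atomic.h>

struct my_pcpu_stats {			/* hypothetical */
	u64 hits;
};

static struct my_pcpu_stats __percpu *
my_stats_get(struct my_pcpu_stats __percpu **slot)
{
	struct my_pcpu_stats __percpu *p = READ_ONCE(*slot);

	if (p)
		return p;		/* fast path: already published */

	p = alloc_percpu_gfp(struct my_pcpu_stats,
			     GFP_ATOMIC | __GFP_NOWARN);
	if (p && cmpxchg(slot, NULL, p))
		free_percpu(p);		/* lost the race, keep the winner */

	return READ_ONCE(*slot);	/* may still be NULL on OOM */
}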
*/ @@ -10443,7 +10877,7 @@ void unregister_netdevice_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { - dev_put(dev); + dev_put_track(dev, &dev->dev_registered_tracker); net_set_todo(dev); } @@ -10534,7 +10968,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev); + unlist_netdevice(dev, true); synchronize_net(); @@ -10668,11 +11102,11 @@ static int dev_cpu_dead(unsigned int oldcpu) /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } @@ -10726,8 +11160,7 @@ static int __net_init netdev_init(struct net *net) BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) @@ -10843,14 +11276,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -10874,35 +11307,6 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); -} - -static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) -{ - /* Return with the rtnl_lock held when there are no network - * devices unregistering in any network namespace in net_list. - */ - struct net *net; - bool unregistering; - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(&netdev_unregistering_wq, &wait); - for (;;) { - unregistering = false; - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { - if (net->dev_unreg_count > 0) { - unregistering = true; - break; - } - } - if (!unregistering) - break; - __rtnl_unlock(); - - wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - } - remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) @@ -10916,18 +11320,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - /* To prevent network device cleanup code from dereferencing - * loopback devices or network devices that have been freed - * wait here for all pending unregistrations to complete, - * before unregistring the loopback device and allowing the - * network namespace be freed. - * - * The netdev todo list containing all network devices - * unregistrations that happen in default_device_exit_batch - * will run in the rtnl_unlock() at the end of - * default_device_exit_batch. 
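Returning to the netdev_offload_xstats_* notifier protocol added earlier in this file: a driver that offloads L3 stats answers the REPORT_* events from its netdevice notifier, roughly as sketched below. my_collect_hw_stats() and the handler name are hypothetical; the report helpers are the ones exported above:

#include <linux/netdevice.h>
#include <linux/notifier.h>

static void my_collect_hw_stats(struct net_device *dev,
				struct rtnl_hw_stats64 *stats); /* hypothetical */

static int my_netdevice_event(struct notifier_block *nb,
			      unsigned long event, void *ptr)
{
	struct netdev_notifier_offload_xstats_info *info;
	struct rtnl_hw_stats64 stats = {};

	switch (event) {
	case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
		info = ptr;
		if (info->type == NETDEV_OFFLOAD_XSTATS_TYPE_L3)
			netdev_offload_xstats_report_used(info->report_used);
		break;
	case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
		info = ptr;
		if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
			break;
		/* push the counters accumulated since the last readout */
		my_collect_hw_stats(info->info.dev, &stats);
		netdev_offload_xstats_report_delta(info->report_delta,
						   &stats);
		break;
	}
	return NOTIFY_DONE;
}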
- */ - rtnl_lock_unregistering(net_list); + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) @@ -10941,7 +11339,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; @@ -10996,6 +11393,8 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); + spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000000..cbb8a925175a --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include <linux/types.h> + +struct net; +struct net_device; +struct netdev_bpf; +struct netdev_phys_item_id; +struct netlink_ext_ack; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_forget_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +/* sysctls not referred to from outside net/core/ */ +extern int netdev_budget; +extern unsigned int netdev_budget_usecs; +extern unsigned int sysctl_skb_defer_max; +extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int dev_change_name(struct net_device *dev, const char *newname); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int 
dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void dev_set_group(struct net_device *dev, int new_group); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index bead38ca50bd..baa63dee2829 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -12,6 +12,8 @@ #include <linux/export.h> #include <linux/list.h> +#include "dev.h" + /* * General list handling functions */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 1b807d119da5..4f6be442ae7e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -10,6 +10,8 @@ #include <net/dsa.h> #include <net/wext.h> +#include "dev.h" + /* * Map an interface index to its name (SIOCGIFNAME) */ diff --git a/net/core/devlink.c b/net/core/devlink.c index fcd9f6d85cf1..5cc88490f18f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -54,6 +54,8 @@ struct devlink { struct list_head trap_list; struct list_head trap_group_list; struct list_head trap_policer_list; + struct list_head linecard_list; + struct mutex linecards_lock; /* protects linecard_list */ const struct devlink_ops *ops; u64 features; struct xarray snapshot_ids; @@ -70,6 +72,23 @@ struct devlink { char priv[] __aligned(NETDEV_ALIGN); }; +struct devlink_linecard_ops; +struct devlink_linecard_type; + +struct devlink_linecard { + struct list_head list; + struct devlink *devlink; + unsigned int index; + refcount_t refcount; + const struct devlink_linecard_ops *ops; + void *priv; + enum devlink_linecard_state state; + struct mutex state_lock; /* Protects state */ + const char *type; + struct devlink_linecard_type *types; + unsigned int types_count; +}; + /** * struct devlink_resource - devlink resource * @name: name of the resource @@ -225,6 +244,33 @@ struct devlink *__must_check devlink_try_get(struct devlink *devlink) return NULL; } +void devl_assert_locked(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_assert_locked); + +#ifdef CONFIG_LOCKDEP +/* For use in conjunction with LOCKDEP only e.g. 
rcu_dereference_protected() */ +bool devl_lock_is_held(struct devlink *devlink) +{ + return lockdep_is_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock_is_held); +#endif + +void devl_lock(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock); + +void devl_unlock(struct devlink *devlink) +{ + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_unlock); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -370,6 +416,58 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static struct devlink_linecard * +devlink_linecard_get_by_index(struct devlink *devlink, + unsigned int linecard_index) +{ + struct devlink_linecard *devlink_linecard; + + list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { + if (devlink_linecard->index == linecard_index) + return devlink_linecard; + } + return NULL; +} + +static bool devlink_linecard_index_exists(struct devlink *devlink, + unsigned int linecard_index) +{ + return devlink_linecard_get_by_index(devlink, linecard_index); +} + +static struct devlink_linecard * +devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { + u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); + struct devlink_linecard *linecard; + + mutex_lock(&devlink->linecards_lock); + linecard = devlink_linecard_get_by_index(devlink, linecard_index); + if (linecard) + refcount_inc(&linecard->refcount); + mutex_unlock(&devlink->linecards_lock); + if (!linecard) + return ERR_PTR(-ENODEV); + return linecard; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_linecard_get_from_attrs(devlink, info->attrs); +} + +static void devlink_linecard_put(struct devlink_linecard *linecard) +{ + if (refcount_dec_and_test(&linecard->refcount)) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + } +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -590,16 +688,18 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_RATE BIT(2) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. 
*/ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(5) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -642,6 +742,13 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, goto unlock; } info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; } return 0; @@ -656,9 +763,14 @@ unlock: static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink *devlink; devlink = info->user_ptr[0]; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = info->user_ptr[1]; + devlink_linecard_put(linecard); + } if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); devlink_put(devlink); @@ -1131,6 +1243,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; + if (devlink_port->linecard && + nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, + devlink_port->linecard->index)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -1541,35 +1657,20 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, return 0; } -static int devlink_port_split(struct devlink *devlink, u32 port_index, - u32 count, struct netlink_ext_ack *extack) - -{ - if (devlink->ops->port_split) - return devlink->ops->port_split(devlink, port_index, count, - extack); - return -EOPNOTSUPP; -} - static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *devlink_port; - u32 port_index; u32 count; - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || - !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + if (!info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) return -EINVAL; + if (!devlink->ops->port_split) + return -EOPNOTSUPP; - devlink_port = devlink_port_get_from_info(devlink, info); - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - if (IS_ERR(devlink_port)) - return -EINVAL; - if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. 
*/ if (devlink_port->attrs.split) @@ -1584,29 +1685,19 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, return -EINVAL; } - return devlink_port_split(devlink, port_index, count, info->extack); -} - -static int devlink_port_unsplit(struct devlink *devlink, u32 port_index, - struct netlink_ext_ack *extack) - -{ - if (devlink->ops->port_unsplit) - return devlink->ops->port_unsplit(devlink, port_index, extack); - return -EOPNOTSUPP; + return devlink->ops->port_split(devlink, devlink_port, count, + info->extack); } static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; - u32 port_index; - - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) - return -EINVAL; - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - return devlink_port_unsplit(devlink, port_index, info->extack); + if (!devlink->ops->port_unsplit) + return -EOPNOTSUPP; + return devlink->ops->port_unsplit(devlink, devlink_port, info->extack); } static int devlink_port_new_notifiy(struct devlink *devlink, @@ -1962,6 +2053,322 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, return err; } +struct devlink_linecard_type { + const char *type; + const void *priv; +}; + +static int devlink_nl_linecard_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_linecard *linecard, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct devlink_linecard_type *linecard_type; + struct nlattr *attr; + void *hdr; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) + goto nla_put_failure; + if (linecard->type && + nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) + goto nla_put_failure; + + if (linecard->types_count) { + attr = nla_nest_start(msg, + DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); + if (!attr) + goto nla_put_failure; + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, + linecard_type->type)) { + nla_nest_cancel(msg, attr); + goto nla_put_failure; + } + } + nla_nest_end(msg, attr); + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_linecard_notify(struct devlink_linecard *linecard, + enum devlink_command cmd) +{ + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && + cmd != DEVLINK_CMD_LINECARD_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, + NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + msg = 
nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_linecard *linecard; + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + mutex_lock(&devlink->linecards_lock); + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < start) { + idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + mutex_unlock(&devlink->linecards_lock); + devlink_put(devlink); + goto out; + } + idx++; + } + mutex_unlock(&devlink->linecards_lock); +retry: + devlink_put(devlink); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static struct devlink_linecard_type * +devlink_linecard_type_lookup(struct devlink_linecard *linecard, + const char *type) +{ + struct devlink_linecard_type *linecard_type; + int i; + + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (!strcmp(type, linecard_type->type)) + return linecard_type; + } + return NULL; +} + +static int devlink_linecard_type_set(struct devlink_linecard *linecard, + const char *type, + struct netlink_ext_ack *extack) +{ + const struct devlink_linecard_ops *ops = linecard->ops; + struct devlink_linecard_type *linecard_type; + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided"); + err = -EINVAL; + goto out; + } + + if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } + + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. 
Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) +{ + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + err = 0; + goto out; + } + + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned"); + err = 0; + goto out; + } + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct netlink_ext_ack *extack = info->extack; + int err; + + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; + + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } + } + + return 0; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -2866,15 +3273,11 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, { struct devlink_rate *devlink_rate; - /* Take the lock to sync with devlink_rate_nodes_destroy() */ - mutex_lock(&devlink->lock); list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { - mutex_unlock(&devlink->lock); NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); return -EBUSY; } - mutex_unlock(&devlink->lock); return 0; } @@ -8591,6 +8994,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops 
devlink_nl_ops[] = { @@ -8645,14 +9050,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_unsplit_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_PORT_NEW, @@ -8667,6 +9072,19 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { + .cmd = DEVLINK_CMD_LINECARD_GET, + .doit = devlink_nl_cmd_linecard_get_doit, + .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_LINECARD_SET, + .doit = devlink_nl_cmd_linecard_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + }, + { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, @@ -8733,14 +9151,12 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, @@ -9047,6 +9463,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -9058,6 +9475,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); + mutex_init(&devlink->linecards_lock); refcount_set(&devlink->refcount, 1); init_completion(&devlink->comp); @@ -9084,10 +9502,14 @@ static void devlink_notify_register(struct devlink *devlink) struct devlink_param_item *param_item; struct devlink_trap_item *trap_item; struct devlink_port *devlink_port; + struct devlink_linecard *linecard; struct devlink_rate *rate_node; struct devlink_region *region; devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + list_for_each_entry(devlink_port, &devlink->port_list, list) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -9195,6 +9617,7 @@ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); + mutex_destroy(&devlink->linecards_lock); mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); WARN_ON(!list_empty(&devlink->trap_policer_list)); @@ -9207,6 +9630,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); + 
WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); @@ -9249,6 +9673,32 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) cancel_delayed_work_sync(&devlink_port->type_warn_dw); } +int devl_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + lockdep_assert_held(&devlink->lock); + + if (devlink_port_index_exists(devlink, port_index)) + return -EEXIST; + + WARN_ON(devlink_port->devlink); + devlink_port->devlink = devlink; + devlink_port->index = port_index; + spin_lock_init(&devlink_port->type_lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); + list_add_tail(&devlink_port->list, &devlink->port_list); + INIT_LIST_HEAD(&devlink_port->param_list); + INIT_LIST_HEAD(&devlink_port->region_list); + + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devl_port_register); + /** * devlink_port_register - Register devlink port * @@ -9266,29 +9716,28 @@ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink->lock); - if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink->lock); - return -EEXIST; - } + int err; - WARN_ON(devlink_port->devlink); - devlink_port->devlink = devlink; - devlink_port->index = port_index; - spin_lock_init(&devlink_port->type_lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); - list_add_tail(&devlink_port->list, &devlink->port_list); - INIT_LIST_HEAD(&devlink_port->param_list); - INIT_LIST_HEAD(&devlink_port->region_list); + mutex_lock(&devlink->lock); + err = devl_port_register(devlink, devlink_port, port_index); mutex_unlock(&devlink->lock); - INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); - devlink_port_type_warn_schedule(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; + return err; } EXPORT_SYMBOL_GPL(devlink_port_register); +void devl_port_unregister(struct devlink_port *devlink_port) +{ + lockdep_assert_held(&devlink_port->devlink->lock); + + devlink_port_type_warn_cancel(devlink_port); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + list_del(&devlink_port->list); + WARN_ON(!list_empty(&devlink_port->reporter_list)); + WARN_ON(!list_empty(&devlink_port->region_list)); + mutex_destroy(&devlink_port->reporters_lock); +} +EXPORT_SYMBOL_GPL(devl_port_unregister); + /** * devlink_port_unregister - Unregister devlink port * @@ -9298,14 +9747,9 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; - devlink_port_type_warn_cancel(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); - list_del(&devlink_port->list); + devl_port_unregister(devlink_port); mutex_unlock(&devlink->lock); - WARN_ON(!list_empty(&devlink_port->reporter_list)); - WARN_ON(!list_empty(&devlink_port->region_list)); - mutex_destroy(&devlink_port->reporters_lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -9526,30 +9970,26 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); /** - * devlink_rate_leaf_create - create devlink 
rate leaf - * + * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * * Create devlink rate object of type leaf on provided @devlink_port. - * Throws call trace if @devlink_port already has a devlink rate object. - * - * Context: Takes and release devlink->lock <mutex>. - * - * Return: -ENOMEM if failed to allocate rate object, 0 otherwise. */ -int -devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; + devl_assert_locked(devlink_port->devlink); + + if (WARN_ON(devlink_port->devlink_rate)) + return -EBUSY; + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; - mutex_lock(&devlink->lock); - WARN_ON(devlink_port->devlink_rate); devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; @@ -9557,12 +9997,42 @@ devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - mutex_unlock(&devlink->lock); return 0; } +EXPORT_SYMBOL_GPL(devl_rate_leaf_create); + +int +devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +{ + struct devlink *devlink = devlink_port->devlink; + int ret; + + mutex_lock(&devlink->lock); + ret = devl_rate_leaf_create(devlink_port, priv); + mutex_unlock(&devlink->lock); + + return ret; +} EXPORT_SYMBOL_GPL(devlink_rate_leaf_create); +void devl_rate_leaf_destroy(struct devlink_port *devlink_port) +{ + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; + + devl_assert_locked(devlink_port->devlink); + if (!devlink_rate) + return; + + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + if (devlink_rate->parent) + refcount_dec(&devlink_rate->parent->refcnt); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + kfree(devlink_rate); +} +EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); + /** * devlink_rate_leaf_destroy - destroy devlink rate leaf * @@ -9579,32 +10049,25 @@ void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) return; mutex_lock(&devlink->lock); - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); - if (devlink_rate->parent) - refcount_dec(&devlink_rate->parent->refcnt); - list_del(&devlink_rate->list); - devlink_port->devlink_rate = NULL; + devl_rate_leaf_destroy(devlink_port); mutex_unlock(&devlink->lock); - kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); /** - * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device - * + * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. - * - * Context: Takes and release devlink->lock <mutex>. 
*/ -void devlink_rate_nodes_destroy(struct devlink *devlink) +void devl_rate_nodes_destroy(struct devlink *devlink) { static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; @@ -9625,10 +10088,42 @@ void devlink_rate_nodes_destroy(struct devlink *devlink) kfree(devlink_rate); } } +} +EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); + +/** + * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device + * + * @devlink: devlink instance + * + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. + * + * Context: Takes and releases devlink->lock <mutex>. + */ +void devlink_rate_nodes_destroy(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); + devl_rate_nodes_destroy(devlink); mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy); +/** + * devlink_port_linecard_set - Link port with a linecard + * + * @devlink_port: devlink port + * @linecard: devlink linecard + */ +void devlink_port_linecard_set(struct devlink_port *devlink_port, + struct devlink_linecard *linecard) +{ + if (WARN_ON(devlink_port->devlink)) + return; + devlink_port->linecard = linecard; +} +EXPORT_SYMBOL_GPL(devlink_port_linecard_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -9640,7 +10135,12 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: - n = snprintf(name, len, "p%u", attrs->phys.port_number); + if (devlink_port->linecard) + n = snprintf(name, len, "l%u", + devlink_port->linecard->index); + if (n < len) + n += snprintf(name + n, len - n, "p%u", + attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); @@ -9695,6 +10195,207 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, return 0; } +static int devlink_linecard_types_init(struct devlink_linecard *linecard) +{ + struct devlink_linecard_type *linecard_type; + unsigned int count; + int i; + + count = linecard->ops->types_count(linecard, linecard->priv); + linecard->types = kmalloc_array(count, sizeof(*linecard_type), + GFP_KERNEL); + if (!linecard->types) + return -ENOMEM; + linecard->types_count = count; + + for (i = 0; i < count; i++) { + linecard_type = &linecard->types[i]; + linecard->ops->types_get(linecard, linecard->priv, i, + &linecard_type->type, + &linecard_type->priv); + } + return 0; +} + +static void devlink_linecard_types_fini(struct devlink_linecard *linecard) +{ + kfree(linecard->types); +} + +/** + * devlink_linecard_create - Create devlink linecard + * + * @devlink: devlink + * @linecard_index: driver-specific numerical identifier of the linecard + * @ops: linecard ops + * @priv: user priv pointer + * + * Create devlink linecard instance with provided linecard index. + * Caller can use any indexing, even a hw-related one. + * + * Return: Line card structure or an ERR_PTR() encoded error code.
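All four callbacks are mandatory, as the WARN_ON() in devlink_linecard_create() below enforces. A skeleton of what a driver supplies; the example_* names are placeholders and the exact prototypes are assumptions inferred from the ops calls visible in this hunk:

static unsigned int example_lc_types_count(struct devlink_linecard *linecard,
					   void *priv)
{
	return 1;
}

static void example_lc_types_get(struct devlink_linecard *linecard, void *priv,
				 unsigned int index, const char **type,
				 const void **type_priv)
{
	*type = "16x100G";	/* driver-owned, must outlive the linecard */
	*type_priv = NULL;
}

static int example_lc_provision(struct devlink_linecard *linecard, void *priv,
				const char *type, const void *type_priv,
				struct netlink_ext_ack *extack)
{
	/* kick off (possibly asynchronous) hw provisioning here */
	return 0;
}

static int example_lc_unprovision(struct devlink_linecard *linecard, void *priv,
				  struct netlink_ext_ack *extack)
{
	return 0;
}

static const struct devlink_linecard_ops example_lc_ops = {
	.provision	= example_lc_provision,
	.unprovision	= example_lc_unprovision,
	.types_count	= example_lc_types_count,
	.types_get	= example_lc_types_get,
};

	/* at probe time: */
	linecard = devlink_linecard_create(devlink, 0, &example_lc_ops, priv);
	if (IS_ERR(linecard))
		return PTR_ERR(linecard);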
+ */ +struct devlink_linecard * +devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) +{ + struct devlink_linecard *linecard; + int err; + + if (WARN_ON(!ops || !ops->provision || !ops->unprovision || + !ops->types_count || !ops->types_get)) + return ERR_PTR(-EINVAL); + + mutex_lock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-EEXIST); + } + + linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + if (!linecard) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-ENOMEM); + } + + linecard->devlink = devlink; + linecard->index = linecard_index; + linecard->ops = ops; + linecard->priv = priv; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + mutex_init(&linecard->state_lock); + + err = devlink_linecard_types_init(linecard); + if (err) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(err); + } + + list_add_tail(&linecard->list, &devlink->linecard_list); + refcount_set(&linecard->refcount, 1); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + return linecard; +} +EXPORT_SYMBOL_GPL(devlink_linecard_create); + +/** + * devlink_linecard_destroy - Destroy devlink linecard + * + * @linecard: devlink linecard + */ +void devlink_linecard_destroy(struct devlink_linecard *linecard) +{ + struct devlink *devlink = linecard->devlink; + + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + mutex_lock(&devlink->linecards_lock); + list_del(&linecard->list); + devlink_linecard_types_fini(linecard); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_put(linecard); +} +EXPORT_SYMBOL_GPL(devlink_linecard_destroy); + +/** + * devlink_linecard_provision_set - Set provisioning on linecard + * + * @linecard: devlink linecard + * @type: linecard type + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->type && strcmp(linecard->type, type)); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + linecard->type = type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); + +/** + * devlink_linecard_provision_clear - Clear provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the unprovision() op call or + * as a result of the unprovision() op call asynchronously. + */ +void devlink_linecard_provision_clear(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); + +/** + * devlink_linecard_provision_fail - Fail provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. 
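When ->provision() completes asynchronously, the driver reports the outcome from its completion path; devlink_linecard_provision_set() confirms the type, devlink_linecard_provision_fail() (below) reports failure. A sketch with an illustrative helper name:

static void example_lc_provision_done(struct devlink_linecard *linecard,
				      const char *type, int err)
{
	if (err) {
		devlink_linecard_provision_fail(linecard);
		return;
	}
	/* the type pointer is stored as-is, so it must be one of the
	 * strings handed out by ->types_get()
	 */
	devlink_linecard_provision_set(linecard, type);
}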
+ */ +void devlink_linecard_provision_fail(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); + +/** + * devlink_linecard_activate - Set linecard active + * + * @linecard: devlink linecard + */ +void devlink_linecard_activate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); + linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_activate); + +/** + * devlink_linecard_deactivate - Set linecard inactive + * + * @linecard: devlink linecard + */ +void devlink_linecard_deactivate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + switch (linecard->state) { + case DEVLINK_LINECARD_STATE_ACTIVE: + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + break; + case DEVLINK_LINECARD_STATE_UNPROVISIONING: + /* Line card is being deactivated as part + * of unprovisioning flow. + */ + break; + default: + WARN_ON(1); + break; + } + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); + int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 7b288a121a41..41cac0e4834e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -48,10 +48,22 @@ static int trace_state = TRACE_OFF; static bool monitor_hw; +#undef EM +#undef EMe + +#define EM(a, b) [a] = #b, +#define EMe(a, b) [a] = #b + +/* drop_reasons is used to translate 'enum skb_drop_reason' to string, + * which is reported to user space. + */ +static const char * const drop_reasons[] = { + TRACE_SKB_DROP_REASON +}; + /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards the global 'hw_stats_list' list. 
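The EM()/EMe() redefinition above is the usual trace-event stringification trick: TRACE_SKB_DROP_REASON is an x-macro list, so redefining its per-entry macros turns the list into a designated-initializer string table. The same pattern in miniature, with a made-up two-entry list:

enum example_reason {
	EXAMPLE_REASON_NOT_SPECIFIED,
	EXAMPLE_REASON_NO_SOCKET,
};

#define TRACE_EXAMPLE_REASON				\
	EM(EXAMPLE_REASON_NOT_SPECIFIED, NOT_SPECIFIED)	\
	EMe(EXAMPLE_REASON_NO_SOCKET, NO_SOCKET)

#undef EM
#undef EMe
#define EM(a, b) [a] = #b,
#define EMe(a, b) [a] = #b

static const char * const example_reasons[] = {
	TRACE_EXAMPLE_REASON
};
/* expands to:
 *   [EXAMPLE_REASON_NOT_SPECIFIED] = "NOT_SPECIFIED",
 *   [EXAMPLE_REASON_NO_SOCKET]     = "NO_SOCKET"
 */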
*/ static DEFINE_MUTEX(net_dm_mutex); @@ -87,11 +99,9 @@ struct per_cpu_dm_data { }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; @@ -102,7 +112,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; @@ -126,6 +135,7 @@ struct net_dm_skb_cb { struct devlink_trap_metadata *hw_metadata; void *pc; }; + enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) @@ -273,29 +283,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); @@ -498,6 +506,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; @@ -508,7 +517,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - NET_DM_SKB_CB(nskb)->pc = location; + if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) + reason = SKB_DROP_REASON_NOT_SPECIFIED; + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. 
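The drop reason travels with the cloned skb in its 48-byte control block, the usual idiom for per-queue scratch data; condensed into a sketch (the example_* names are illustrative, net_dm_skb_cb above is the real structure):

struct example_skb_cb {
	void *pc;
	enum skb_drop_reason reason;
};

#define EXAMPLE_SKB_CB(skb) ((struct example_skb_cb *)&((skb)->cb[0]))

static void example_stash(struct sk_buff *nskb, void *location,
			  enum skb_drop_reason reason)
{
	BUILD_BUG_ON(sizeof(struct example_skb_cb) >
		     sizeof_field(struct sk_buff, cb));
	EXAMPLE_SKB_CB(nskb)->pc = location;
	EXAMPLE_SKB_CB(nskb)->reason = reason;
}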
*/ @@ -553,7 +566,8 @@ static size_t net_dm_in_port_size(void) #define NET_DM_MAX_SYMBOL_LEN 40 -static size_t net_dm_packet_report_size(size_t payload_len) +static size_t net_dm_packet_report_size(size_t payload_len, + enum skb_drop_reason reason) { size_t size; @@ -574,6 +588,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_REASON */ + nla_total_size(strlen(drop_reasons[reason]) + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } @@ -606,7 +622,7 @@ nla_put_failure: static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { - u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; + struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; @@ -620,10 +636,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, + NET_DM_ATTR_PAD)) goto nla_put_failure; - snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); + if (nla_put_string(msg, NET_DM_ATTR_REASON, + drop_reasons[cb->reason])) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; @@ -679,7 +700,9 @@ static void net_dm_packet_report(struct sk_buff *skb) if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); - msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + msg = nlmsg_new(net_dm_packet_report_size(payload_len, + NET_DM_SKB_CB(skb)->reason), + GFP_KERNEL); if (!msg) goto out; @@ -1165,7 +1188,6 @@ err_module_put: static void net_dm_trace_off_set(void) { - struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; @@ -1189,13 +1211,6 @@ static void net_dm_trace_off_set(void) consume_skb(skb); } - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } - } - module_put(THIS_MODULE); } @@ -1556,38 +1571,28 @@ static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&net_dm_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&net_dm_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - new_stat->dev = NULL; - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); } - mutex_unlock(&net_dm_mutex); break; } 
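The lifetime rules for the new dev->dm_private pointer, condensed into an attach/detach pair (a sketch, assuming the net_device field added by this series): writers run under RTNL and publish with rcu_assign_pointer(), while the NAPI tracepoint reads under rcu_read_lock() and must tolerate NULL.

static void example_dm_attach(struct net_device *dev,
			      struct dm_hw_stat_delta *stat)
{
	ASSERT_RTNL();
	stat->last_rx = jiffies;
	rcu_assign_pointer(dev->dm_private, stat);
}

static void example_dm_detach(struct net_device *dev)
{
	struct dm_hw_stat_delta *stat = rtnl_dereference(dev->dm_private);

	if (!stat)
		return;
	rcu_assign_pointer(dev->dm_private, NULL);
	kfree_rcu(stat, rcu);	/* tracepoint readers may still see it */
}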
-out: return NOTIFY_DONE; } diff --git a/net/core/filter.c b/net/core/filter.c index 4603b7cd3cd1..2a6a0b0ce43e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,7 @@ #include <linux/btf_ids.h> #include <net/tls.h> #include <net/xdp.h> +#include <net/mptcp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -1687,7 +1688,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; @@ -1722,7 +1723,7 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, { void *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); @@ -2107,7 +2108,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); @@ -2176,7 +2177,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2274,7 +2275,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2603,7 +2604,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2710,6 +2711,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2809,7 +2813,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, &msg->sg.copy); + __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3783,6 +3787,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : @@ -3817,11 +3843,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len < size ? 
addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; + struct xdp_rxq_info *rxq = xdp->rxq; + unsigned int tailroom; + + if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) + return -EOPNOTSUPP; + + tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + if (unlikely(offset > tailroom)) + return -EINVAL; + + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; + + return 0; +} + +static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, n_frags_free = 0, len_free = 0; + + if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) + return -EINVAL; + + for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { + skb_frag_t *frag = &sinfo->frags[i]; + int shrink = min_t(int, offset, skb_frag_size(frag)); + + len_free += shrink; + offset -= shrink; + + if (skb_frag_size(frag) == shrink) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), &xdp->rxq->mem, + false, NULL); + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); + break; + } + } + sinfo->nr_frags -= n_frags_free; + sinfo->xdp_frags_size -= len_free; + + if (unlikely(!sinfo->nr_frags)) { + xdp_buff_clear_frags_flag(xdp); + xdp->data_end -= offset; + } + + return 0; +} + BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; + if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ + if (offset < 0) + return bpf_xdp_frags_shrink_tail(xdp, -offset); + + return bpf_xdp_frags_increase_tail(xdp, offset); + } + /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; @@ -4047,6 +4270,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; + /* XDP_REDIRECT is not fully supported yet for xdp frags since + * not all XDP capable drivers can map non-linear 
xdp_frame in + * ndo_xdp_xmit. + */ + if (unlikely(xdp_buff_has_frags(xdp) && + map_type != BPF_MAP_TYPE_CPUMAP)) + return -EOPNOTSUPP; + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); @@ -4268,6 +4499,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; @@ -4293,10 +4525,14 @@ set_compat: if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); + memcpy(to->local_ipv6, &info->key.u.ipv6.dst, + sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); + memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } @@ -4367,6 +4603,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): @@ -4409,10 +4646,13 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + memcpy(&info->key.u.ipv6.src, from->local_ipv6, + sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); } return 0; @@ -4590,10 +4830,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { }; #endif -static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { - memcpy(dst_buff, src_buff + off, len); + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } @@ -4604,11 +4846,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(!xdp || - xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp->data, + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } @@ -4862,6 +5104,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; default: ret = -EINVAL; } @@ -4934,7 +5183,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (val <= 0 || tp->data_segs_out > tp->syn_data) ret = -EINVAL; else - tp->snd_cwnd = val; + tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) { @@ -5040,6 +5289,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: *((int 
*)optval) = sk->sk_reuseport; break; + case SO_TXREHASH: + *((int *)optval) = sk->sk_txrehash; + break; default: goto err_clear; } @@ -5906,7 +6158,6 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len if (err) return err; - ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); @@ -6264,10 +6515,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6301,10 +6563,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6379,7 +6652,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, @@ -6774,24 +7047,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (!th->ack || th->rst || th->syn) return -ENOENT; + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; cookie = ntohl(th->ack_seq) - 1; - switch (sk->sk_family) { - case AF_INET: - if (unlikely(iph_len < sizeof(struct iphdr))) + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). 
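Restated as a freestanding predicate (a sketch): the version nibble sits in the first byte of both headers, so it can be read once the caller has verified that at least sizeof(struct iphdr) bytes are present.

static __always_inline bool example_iph_is_v6(const void *iph)
{
	/* valid only after the caller has checked
	 * iph_len >= sizeof(struct iphdr)
	 */
	return ((const struct iphdr *)iph)->version == 6;
}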
+ */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); break; #if IS_BUILTIN(CONFIG_IPV6) - case AF_INET6: + case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; + if (sk->sk_family != AF_INET6) + return -EINVAL; + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); break; #endif /* CONFIG_IPV6 */ @@ -6848,7 +7130,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, */ switch (((struct iphdr *)iph)->version) { case 4: - if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); @@ -7146,6 +7428,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) + return -EINVAL; + skb->tstamp = tstamp; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7507,6 +7826,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -7533,6 +7854,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: @@ -7840,7 +8167,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, tstamp_type): + return false; + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. 
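A tc-BPF caller of the new bpf_skb_set_tstamp() helper stamps a mono delivery time before the packet reaches a rate-based qdisc such as fq (a sketch, assuming a libbpf build against the matching uapi headers; the 1ms offset is illustrative):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int set_edt(struct __sk_buff *skb)
{
	__u64 now = bpf_ktime_get_ns();

	/* returns -EOPNOTSUPP for non-IP skbs, -EINVAL for a zero tstamp */
	bpf_skb_set_tstamp(skb, now + 1000000, BPF_SKB_TSTAMP_DELIVERY_MONO);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";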
*/ return false; default: @@ -8030,6 +8359,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -8041,7 +8371,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -8050,6 +8379,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; @@ -8187,6 +8524,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, tstamp_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->tstamp_type_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); @@ -8582,6 +8928,25 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); + *insn++ = BPF_JMP_A(1); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8603,6 +8968,74 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, read skb->tstamp as is if tstamp_type_access is true. 
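In C terms, the instruction sequence emitted below for a program that never touched tstamp_type behaves like this sketch (mirroring the bit masks used here; tc_at_ingress exists under CONFIG_NET_CLS_ACT):

static u64 example_tstamp_read(const struct sk_buff *skb)
{
	if (skb->tc_at_ingress && skb->mono_delivery_time)
		return 0;	/* mono EDT is not a meaningful rcv stamp */
	return ktime_to_ns(skb->tstamp);
}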
+ */ + if (!prog->tstamp_type_access) { + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); + /* skb->tc_at_ingress && skb->mono_delivery_time, + * read 0 as the (rcv) timestamp. + */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, write skb->tstamp as is if tstamp_type_access is true. + * Otherwise, writing at ingress will have to clear the + * mono_delivery_time bit also. + */ + if (!prog->tstamp_type_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + /* Writing __sk_buff->tstamp as ingress, goto <clear> */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + /* goto <store> */ + *insn++ = BPF_JMP_A(2); + /* <clear>: mono_delivery_time */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET); + } +#endif + + /* <store>: skb->tstamp = tstamp */ + *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -8911,17 +9344,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_write(prog, si, insn); else - *insn++ = BPF_LDX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): @@ -10062,7 +10491,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, - .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10601,12 +11029,24 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): - case bpf_ctx_range(struct bpf_sk_lookup, remote_port): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + case bpf_ctx_range(struct bpf_sk_lookup, 
remote_port): + /* Allow 4-byte access to 2-byte field for backward compatibility */ + if (size == sizeof(__u32)) + return true; + bpf_ctx_record_field_size(info, sizeof(__be16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); + + case offsetofend(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: + /* Allow access to zero padding for backward compatibility */ + bpf_ctx_record_field_size(info, sizeof(__u16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); + default: return false; } @@ -10688,6 +11128,11 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, sport, 2, target_size)); break; + case offsetofend(struct bpf_sk_lookup, remote_port): + *target_size = 2; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + break; + case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, @@ -10858,6 +11303,20 @@ const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -10900,6 +11359,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15833e1d6ea1..6aee04f75e3e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -22,6 +22,7 @@ #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> +#include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> @@ -1031,7 +1032,17 @@ bool __skb_flow_dissect(const struct net *net, key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); - memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + struct flow_dissector_key_num_of_vlans *key_num_of_vlans; + + key_num_of_vlans = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_num_of_vlans->num_of_vlans = 0; } proto_again: @@ -1157,6 +1168,16 @@ proto_again: nhoff += sizeof(*vlan); } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + struct flow_dissector_key_num_of_vlans *key_nvs; + + key_nvs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_nvs->num_of_vlans++; + } + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { @@ -1182,6 +1203,7 @@ proto_again: VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; + key_vlan->vlan_eth_type = proto; } fdret = 
FLOW_DISSECT_RET_PROTO_AGAIN; @@ -1282,6 +1304,23 @@ proto_again: break; } + case htons(ETH_P_PRP): + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 73f68d4625f3..929f6379a279 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -595,3 +595,9 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); + +bool flow_indr_dev_exists(void) +{ + return !list_empty(&flow_block_indr_dev_list); +} +EXPORT_SYMBOL(flow_indr_dev_exists); diff --git a/net/core/gro.c b/net/core/gro.c index a11b286d1495..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -93,6 +93,31 @@ void dev_remove_offload(struct packet_offload *po) EXPORT_SYMBOL(dev_remove_offload); /** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** * skb_mac_gso_segment - mac layer segmentation handler. 
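A typical caller of the new export segments by the inner protocol id, as an encap offload path might (a sketch; the function name is illustrative, and the result is ERR_PTR(-EPROTONOSUPPORT) when no offload handler matches):

static struct sk_buff *example_segment_inner_ipv4(struct sk_buff *skb,
						  netdev_features_t features)
{
	return skb_eth_gso_segment(skb, features, htons(ETH_P_IP));
}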
* @skb: buffer to segment * @features: features for the output path (see dev->features) @@ -142,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); @@ -459,29 +492,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, @@ -634,7 +660,6 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 6eb2e5ec2c50..541c7a72a28a 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -28,7 +28,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { drop: - atomic_long_inc(&dev->rx_dropped); + dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; @@ -89,8 +89,23 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) } EXPORT_SYMBOL(gro_cells_init); +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + void gro_cells_destroy(struct gro_cells *gcells) { + struct percpu_free_defer *defer; int i; if (!gcells->cells) @@ -102,12 +117,23 @@ void gro_cells_destroy(struct gro_cells *gcells) __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } - /* This barrier is needed because netpoll could access dev->napi_list - * under rcu protection. + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. 
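The same defer-free idiom in miniature, for any object that RCU readers may still be traversing when blocking in the release path is not an option (a sketch, names illustrative):

struct example_obj {
	struct rcu_head rcu;
	/* payload ... */
};

static void example_obj_free(struct rcu_head *head)
{
	kfree(container_of(head, struct example_obj, rcu));
}

static void example_obj_release(struct example_obj *obj)
{
	call_rcu(&obj->rcu, example_obj_free);	/* never blocks */
}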
*/ - synchronize_net(); - - free_percpu(gcells->cells); + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index b0f5344d1185..a244d3bade7d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/types.h> +#include "dev.h" enum lw_bits { LW_URGENT = 0, @@ -166,10 +167,10 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } - /* Note: our callers are responsible for - * calling netdev_tracker_free(). + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). */ - dev_put(dev); + __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 349480ef68a5..8b6b5e72b217 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -159,10 +159,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst->lwtstate->orig_output(net, sk, skb); } -static int xmit_check_hhlen(struct sk_buff *skb) +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { - int hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -274,6 +272,7 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; @@ -291,7 +290,7 @@ static int bpf_xmit(struct sk_buff *skb) /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. 
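The fixup itself is the stock headroom dance, shown here generically (a sketch; xmit_check_hhlen() above is the real thing):

static int example_make_l2_room(struct sk_buff *skb, int hh_len)
{
	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		return pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
	}
	return 0;
}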
*/ - ret = xmit_check_hhlen(skb); + ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 213cb7b26b7a..54625287ee5b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1133,7 +1133,8 @@ out: neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,18 +1155,23 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1187,7 +1193,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; - kfree_skb(buff); + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); @@ -1209,7 +1215,7 @@ out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } @@ -1571,9 +1577,9 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, - NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); + max(NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME), HZ)); write_unlock_bh(&tbl->lock); } @@ -3364,7 +3370,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -3381,7 +3387,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -3401,7 +3407,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { @@ -3722,7 +3728,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto 
err; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..1ec23bf8b05c 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -4,6 +4,8 @@ #include <linux/seq_file.h> #include <net/wext.h> +#include "dev.h" + #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) @@ -190,12 +192,23 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + list_for_each_entry_rcu(pt, &ptype_all, list) { if (i == pos) return pt; @@ -216,22 +229,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: if (nxt != &ptype_all) goto found; hash = 0; @@ -260,7 +291,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 53ea262ecafd..a3642569fe53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include <linux/of_net.h> #include <linux/cpu.h> +#include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS @@ -32,6 +33,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; @@ -213,7 +215,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -745,7 +747,6 @@ static const struct attribute_group netstat_group = { .attrs = netstat_attrs, }; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; @@ -754,7 +755,19 @@ static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if 
(ndev->ieee80211_ptr) + return true; +#endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; #endif + return false; +} #else /* CONFIG_SYSFS */ #define net_class_groups NULL @@ -823,7 +836,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, { struct rps_map *old_map, *map; cpumask_var_t mask; - int err, cpu, i, hk_flags; + int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) @@ -839,8 +852,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } if (!cpumask_empty(mask)) { - hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; - cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) { free_cpumask_var(mask); return -EINVAL; @@ -1995,14 +2008,8 @@ int netdev_register_kobject(struct net_device *ndev) *groups++ = &netstat_group;