aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/net/netkit.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
commit89ed67ef126c4160349c1b96fdb775ea6170ac90 (patch)
tree98caaf8bba44b21f9345a0af1dd2bd9987764e27 /drivers/net/netkit.c
parentMerge tag 'cgroup-for-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup (diff)
parentnet: pcs: xpcs: Add 2500BASE-X case in get state for XPCS drivers (diff)
downloadwireguard-linux-89ed67ef126c4160349c1b96fdb775ea6170ac90.tar.xz
wireguard-linux-89ed67ef126c4160349c1b96fdb775ea6170ac90.zip
Merge tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Support usec resolution of TCP timestamps, enabled selectively by a route attribute. - Defer regular TCP ACK while processing socket backlog, try to send a cumulative ACK at the end. Increase single TCP flow performance on a 200Gbit NIC by 20% (100Gbit -> 120Gbit). - The Fair Queuing (FQ) packet scheduler: - add built-in 3 band prio / WRR scheduling - support bypass if the qdisc is mostly idle (5% speed up for TCP RR) - improve inactive flow reporting - optimize the layout of structures for better cache locality - Support TCP Authentication Option (RFC 5925, TCP-AO), a more modern replacement for the old MD5 option. - Add more retransmission timeout (RTO) related statistics to TCP_INFO. - Support sending fragmented skbs over vsock sockets. - Make sure we send SIGPIPE for vsock sockets if socket was shutdown(). - Add sysctl for ignoring lower limit on lifetime in Router Advertisement PIO, based on an in-progress IETF draft. - Add sysctl to control activation of TCP ping-pong mode. - Add sysctl to make connection timeout in MPTCP configurable. - Support rcvlowat and notsent_lowat on MPTCP sockets, to help apps limit the number of wakeups. - Support netlink GET for MDB (multicast forwarding), allowing user space to request a single MDB entry instead of dumping the entire table. - Support selective FDB flushing in the VXLAN tunnel driver. - Allow limiting learned FDB entries in bridges, prevent OOM attacks. - Allow controlling via configfs netconsole targets which were created via the kernel cmdline at boot, rather than via configfs at runtime. - Support multiple PTP timestamp event queue readers with different filters. - MCTP over I3C. BPF: - Add new veth-like netdevice where BPF program defines the logic of the xmit routine. It can operate in L3 and L2 mode. - Support exceptions - allow asserting conditions which should never be true but are hard for the verifier to infer. With some extra flexibility around handling of the exit / failure: https://lwn.net/Articles/938435/ - Add support for local per-cpu kptr, allow allocating and storing per-cpu objects in maps. Access to those objects operates on the value for the current CPU. This allows to deprecate local one-off implementations of per-CPU storage like BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE maps. - Extend cgroup BPF sockaddr hooks for UNIX sockets. The use case is for systemd to re-implement the LogNamespace feature which allows running multiple instances of systemd-journald to process the logs of different services. - Enable open-coded task_vma iteration, after maple tree conversion made it hard to directly walk VMAs in tracing programs. - Add open-coded task, css_task and css iterator support. One of the use cases is customizable OOM victim selection via BPF. - Allow source address selection with bpf_*_fib_lookup(). - Add ability to pin BPF timer to the current CPU. - Prevent creation of infinite loops by combining tail calls and fentry/fexit programs. - Add missed stats for kprobes to retrieve the number of missed kprobe executions and subsequent executions of BPF programs. - Inherit system settings for CPU security mitigations. - Add BPF v4 CPU instruction support for arm32 and s390x. Changes to common code: - overflow: add DEFINE_FLEX() for on-stack definition of structs with flexible array members. - Process doc update with more guidance for reviewers. Driver API: - Simplify locking in WiFi (cfg80211 and mac80211 layers), use wiphy mutex in most places and remove a lot of smaller locks. - Create a common DPLL configuration API. Allow configuring and querying state of PLL circuits used for clock syntonization, in network time distribution. - Unify fragmented and full page allocation APIs in page pool code. Let drivers be ignorant of PAGE_SIZE. - Rework PHY state machine to avoid races with calls to phy_stop(). - Notify DSA drivers of MAC address changes on user ports, improve correctness of offloads which depend on matching port MAC addresses. - Allow antenna control on injected WiFi frames. - Reduce the number of variants of napi_schedule(). - Simplify error handling when composing devlink health messages. Misc: - A lot of KCSAN data race "fixes", from Eric. - A lot of __counted_by() annotations, from Kees. - A lot of strncpy -> strscpy and printf format fixes. - Replace master/slave terminology with conduit/user in DSA drivers. - Handful of KUnit tests for netdev and WiFi core. Removed: - AppleTalk COPS. - AppleTalk ipddp. - TI AR7 CPMAC Ethernet driver. Drivers: - Ethernet high-speed NICs: - Intel (100G, ice, idpf): - add a driver for the Intel E2000 IPUs - make CRC/FCS stripping configurable - cross-timestamping for E823 devices - basic support for E830 devices - use aux-bus for managing client drivers - i40e: report firmware versions via devlink - nVidia/Mellanox: - support 4-port NICs - increase max number of channels to 256 - optimize / parallelize SF creation flow - Broadcom (bnxt): - enhance NIC temperature reporting - support PAM4 speeds and lane configuration - Marvell OcteonTX2: - PTP pulse-per-second output support - enable hardware timestamping for VFs - Solarflare/AMD: - conntrack NAT offload and offload for tunnels - Wangxun (ngbe/txgbe): - expose HW statistics - Pensando/AMD: - support PCI level reset - narrow down the condition under which skbs are linearized - Netronome/Corigine (nfp): - support CHACHA20-POLY1305 crypto in IPsec offload - Ethernet NICs embedded, slower, virtual: - Synopsys (stmmac): - add Loongson-1 SoC support - enable use of HW queues with no offload capabilities - enable PPS input support on all 5 channels - increase TX coalesce timer to 5ms - RealTek USB (r8152): improve efficiency of Rx by using GRO frags - xen: support SW packet timestamping - add drivers for implementations based on TI's PRUSS (AM64x EVM) - nVidia/Mellanox Ethernet datacenter switches: - avoid poor HW resource use on Spectrum-4 by better block selection for IPv6 multicast forwarding and ordering of blocks in ACL region - Ethernet embedded switches: - Microchip: - support configuring the drive strength for EMI compliance - ksz9477: partial ACL support - ksz9477: HSR offload - ksz9477: Wake on LAN - Realtek: - rtl8366rb: respect device tree config of the CPU port - Ethernet PHYs: - support Broadcom BCM5221 PHYs - TI dp83867: support hardware LED blinking - CAN: - add support for Linux-PHY based CAN transceivers - at91_can: clean up and use rx-offload helpers - WiFi: - MediaTek (mt76): - new sub-driver for mt7925 USB/PCIe devices - HW wireless <> Ethernet bridging in MT7988 chips - mt7603/mt7628 stability improvements - Qualcomm (ath12k): - WCN7850: - enable 320 MHz channels in 6 GHz band - hardware rfkill support - enable IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS to make scan faster - read board data variant name from SMBIOS - QCN9274: mesh support - RealTek (rtw89): - TDMA-based multi-channel concurrency (MCC) - Silicon Labs (wfx): - Remain-On-Channel (ROC) support - Bluetooth: - ISO: many improvements for broadcast support - mark BCM4378/BCM4387 as BROKEN_LE_CODED - add support for QCA2066 - btmtksdio: enable Bluetooth wakeup from suspend" * tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1816 commits) net: pcs: xpcs: Add 2500BASE-X case in get state for XPCS drivers net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() net: mana: Use xdp_set_features_flag instead of direct assignment vxlan: Cleanup IFLA_VXLAN_PORT_RANGE entry in vxlan_get_size() iavf: delete the iavf client interface iavf: add a common function for undoing the interrupt scheme iavf: use unregister_netdev iavf: rely on netdev's own registered state iavf: fix the waiting time for initial reset iavf: in iavf_down, don't queue watchdog_task if comms failed iavf: simplify mutex_trylock+sleep loops iavf: fix comments about old bit locks doc/netlink: Update schema to support cmd-cnt-name and cmd-max-name tools: ynl: introduce option to process unknown attributes or types ipvlan: properly track tx_errors netdevsim: Block until all devices are released nfp: using napi_build_skb() to replace build_skb() net: dsa: microchip: ksz9477: Fix spelling mistake "Enery" -> "Energy" net: dsa: microchip: Ensure Stable PME Pin State for Wake-on-LAN net: dsa: microchip: Refactor switch shutdown routine for WoL preparation ...
Diffstat (limited to 'drivers/net/netkit.c')
-rw-r--r--drivers/net/netkit.c936
1 files changed, 936 insertions, 0 deletions
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
new file mode 100644
index 000000000000..5a0f86f38f09
--- /dev/null
+++ b/drivers/net/netkit.c
@@ -0,0 +1,936 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/netfilter_netdev.h>
+#include <linux/bpf_mprog.h>
+
+#include <net/netkit.h>
+#include <net/dst.h>
+#include <net/tcx.h>
+
+#define DRV_NAME "netkit"
+
+struct netkit {
+ /* Needed in fast-path */
+ struct net_device __rcu *peer;
+ struct bpf_mprog_entry __rcu *active;
+ enum netkit_action policy;
+ struct bpf_mprog_bundle bundle;
+
+ /* Needed in slow-path */
+ enum netkit_mode mode;
+ bool primary;
+ u32 headroom;
+};
+
+struct netkit_link {
+ struct bpf_link link;
+ struct net_device *dev;
+ u32 location;
+};
+
+static __always_inline int
+netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+ enum netkit_action ret)
+{
+ const struct bpf_mprog_fp *fp;
+ const struct bpf_prog *prog;
+
+ bpf_mprog_foreach_prog(entry, fp, prog) {
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run(prog, skb);
+ if (ret != NETKIT_NEXT)
+ break;
+ }
+ return ret;
+}
+
+static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
+{
+ skb_scrub_packet(skb, xnet);
+ skb->priority = 0;
+ nf_skip_egress(skb, true);
+}
+
+static struct netkit *netkit_priv(const struct net_device *dev)
+{
+ return netdev_priv(dev);
+}
+
+static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ enum netkit_action ret = READ_ONCE(nk->policy);
+ netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
+ const struct bpf_mprog_entry *entry;
+ struct net_device *peer;
+
+ rcu_read_lock();
+ peer = rcu_dereference(nk->peer);
+ if (unlikely(!peer || !(peer->flags & IFF_UP) ||
+ !pskb_may_pull(skb, ETH_HLEN) ||
+ skb_orphan_frags(skb, GFP_ATOMIC)))
+ goto drop;
+ netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+ skb->dev = peer;
+ entry = rcu_dereference(nk->active);
+ if (entry)
+ ret = netkit_run(entry, skb, ret);
+ switch (ret) {
+ case NETKIT_NEXT:
+ case NETKIT_PASS:
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ __netif_rx(skb);
+ break;
+ case NETKIT_REDIRECT:
+ skb_do_redirect(skb);
+ break;
+ case NETKIT_DROP:
+ default:
+drop:
+ kfree_skb(skb);
+ dev_core_stats_tx_dropped_inc(dev);
+ ret_dev = NET_XMIT_DROP;
+ break;
+ }
+ rcu_read_unlock();
+ return ret_dev;
+}
+
+static int netkit_open(struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+
+ if (!peer)
+ return -ENOTCONN;
+ if (peer->flags & IFF_UP) {
+ netif_carrier_on(dev);
+ netif_carrier_on(peer);
+ }
+ return 0;
+}
+
+static int netkit_close(struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+
+ netif_carrier_off(dev);
+ if (peer)
+ netif_carrier_off(peer);
+ return 0;
+}
+
+static int netkit_get_iflink(const struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer;
+ int iflink = 0;
+
+ rcu_read_lock();
+ peer = rcu_dereference(nk->peer);
+ if (peer)
+ iflink = peer->ifindex;
+ rcu_read_unlock();
+ return iflink;
+}
+
+static void netkit_set_multicast(struct net_device *dev)
+{
+ /* Nothing to do, we receive whatever gets pushed to us! */
+}
+
+static void netkit_set_headroom(struct net_device *dev, int headroom)
+{
+ struct netkit *nk = netkit_priv(dev), *nk2;
+ struct net_device *peer;
+
+ if (headroom < 0)
+ headroom = NET_SKB_PAD;
+
+ rcu_read_lock();
+ peer = rcu_dereference(nk->peer);
+ if (unlikely(!peer))
+ goto out;
+
+ nk2 = netkit_priv(peer);
+ nk->headroom = headroom;
+ headroom = max(nk->headroom, nk2->headroom);
+
+ peer->needed_headroom = headroom;
+ dev->needed_headroom = headroom;
+out:
+ rcu_read_unlock();
+}
+
+static struct net_device *netkit_peer_dev(struct net_device *dev)
+{
+ return rcu_dereference(netkit_priv(dev)->peer);
+}
+
+static void netkit_uninit(struct net_device *dev);
+
+static const struct net_device_ops netkit_netdev_ops = {
+ .ndo_open = netkit_open,
+ .ndo_stop = netkit_close,
+ .ndo_start_xmit = netkit_xmit,
+ .ndo_set_rx_mode = netkit_set_multicast,
+ .ndo_set_rx_headroom = netkit_set_headroom,
+ .ndo_get_iflink = netkit_get_iflink,
+ .ndo_get_peer_dev = netkit_peer_dev,
+ .ndo_uninit = netkit_uninit,
+ .ndo_features_check = passthru_features_check,
+};
+
+static void netkit_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strscpy(info->driver, DRV_NAME, sizeof(info->driver));
+}
+
+static const struct ethtool_ops netkit_ethtool_ops = {
+ .get_drvinfo = netkit_get_drvinfo,
+};
+
+static void netkit_setup(struct net_device *dev)
+{
+ static const netdev_features_t netkit_features_hw_vlan =
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_CTAG_RX |
+ NETIF_F_HW_VLAN_STAG_TX |
+ NETIF_F_HW_VLAN_STAG_RX;
+ static const netdev_features_t netkit_features =
+ netkit_features_hw_vlan |
+ NETIF_F_SG |
+ NETIF_F_FRAGLIST |
+ NETIF_F_HW_CSUM |
+ NETIF_F_RXCSUM |
+ NETIF_F_SCTP_CRC |
+ NETIF_F_HIGHDMA |
+ NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL;
+
+ ether_setup(dev);
+ dev->max_mtu = ETH_MAX_MTU;
+
+ dev->flags |= IFF_NOARP;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ dev->priv_flags |= IFF_PHONY_HEADROOM;
+ dev->priv_flags |= IFF_NO_QUEUE;
+
+ dev->ethtool_ops = &netkit_ethtool_ops;
+ dev->netdev_ops = &netkit_netdev_ops;
+
+ dev->features |= netkit_features | NETIF_F_LLTX;
+ dev->hw_features = netkit_features;
+ dev->hw_enc_features = netkit_features;
+ dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
+ dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
+
+ dev->needs_free_netdev = true;
+
+ netif_set_tso_max_size(dev, GSO_MAX_SIZE);
+}
+
+static struct net *netkit_get_link_net(const struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+
+ return peer ? dev_net(peer) : dev_net(dev);
+}
+
+static int netkit_check_policy(int policy, struct nlattr *tb,
+ struct netlink_ext_ack *extack)
+{
+ switch (policy) {
+ case NETKIT_PASS:
+ case NETKIT_DROP:
+ return 0;
+ default:
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "Provided default xmit policy not supported");
+ return -EINVAL;
+ }
+}
+
+static int netkit_check_mode(int mode, struct nlattr *tb,
+ struct netlink_ext_ack *extack)
+{
+ switch (mode) {
+ case NETKIT_L2:
+ case NETKIT_L3:
+ return 0;
+ default:
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "Provided device mode can only be L2 or L3");
+ return -EINVAL;
+ }
+}
+
+static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *attr = tb[IFLA_ADDRESS];
+
+ if (!attr)
+ return 0;
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Setting Ethernet address is not supported");
+ return -EOPNOTSUPP;
+}
+
+static struct rtnl_link_ops netkit_link_ops;
+
+static int netkit_new_link(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
+ enum netkit_action default_prim = NETKIT_PASS;
+ enum netkit_action default_peer = NETKIT_PASS;
+ enum netkit_mode mode = NETKIT_L3;
+ unsigned char ifname_assign_type;
+ struct ifinfomsg *ifmp = NULL;
+ struct net_device *peer;
+ char ifname[IFNAMSIZ];
+ struct netkit *nk;
+ struct net *net;
+ int err;
+
+ if (data) {
+ if (data[IFLA_NETKIT_MODE]) {
+ attr = data[IFLA_NETKIT_MODE];
+ mode = nla_get_u32(attr);
+ err = netkit_check_mode(mode, attr, extack);
+ if (err < 0)
+ return err;
+ }
+ if (data[IFLA_NETKIT_PEER_INFO]) {
+ attr = data[IFLA_NETKIT_PEER_INFO];
+ ifmp = nla_data(attr);
+ err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
+ if (err < 0)
+ return err;
+ err = netkit_validate(peer_tb, NULL, extack);
+ if (err < 0)
+ return err;
+ tbp = peer_tb;
+ }
+ if (data[IFLA_NETKIT_POLICY]) {
+ attr = data[IFLA_NETKIT_POLICY];
+ default_prim = nla_get_u32(attr);
+ err = netkit_check_policy(default_prim, attr, extack);
+ if (err < 0)
+ return err;
+ }
+ if (data[IFLA_NETKIT_PEER_POLICY]) {
+ attr = data[IFLA_NETKIT_PEER_POLICY];
+ default_peer = nla_get_u32(attr);
+ err = netkit_check_policy(default_peer, attr, extack);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (ifmp && tbp[IFLA_IFNAME]) {
+ nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
+ ifname_assign_type = NET_NAME_USER;
+ } else {
+ strscpy(ifname, "nk%d", IFNAMSIZ);
+ ifname_assign_type = NET_NAME_ENUM;
+ }
+
+ net = rtnl_link_get_net(src_net, tbp);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ peer = rtnl_create_link(net, ifname, ifname_assign_type,
+ &netkit_link_ops, tbp, extack);
+ if (IS_ERR(peer)) {
+ put_net(net);
+ return PTR_ERR(peer);
+ }
+
+ netif_inherit_tso_max(peer, dev);
+
+ if (mode == NETKIT_L2)
+ eth_hw_addr_random(peer);
+ if (ifmp && dev->ifindex)
+ peer->ifindex = ifmp->ifi_index;
+
+ nk = netkit_priv(peer);
+ nk->primary = false;
+ nk->policy = default_peer;
+ nk->mode = mode;
+ bpf_mprog_bundle_init(&nk->bundle);
+
+ err = register_netdevice(peer);
+ put_net(net);
+ if (err < 0)
+ goto err_register_peer;
+ netif_carrier_off(peer);
+ if (mode == NETKIT_L2)
+ dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+
+ err = rtnl_configure_link(peer, NULL, 0, NULL);
+ if (err < 0)
+ goto err_configure_peer;
+
+ if (mode == NETKIT_L2)
+ eth_hw_addr_random(dev);
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ strscpy(dev->name, "nk%d", IFNAMSIZ);
+
+ nk = netkit_priv(dev);
+ nk->primary = true;
+ nk->policy = default_prim;
+ nk->mode = mode;
+ bpf_mprog_bundle_init(&nk->bundle);
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto err_configure_peer;
+ netif_carrier_off(dev);
+ if (mode == NETKIT_L2)
+ dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
+
+ rcu_assign_pointer(netkit_priv(dev)->peer, peer);
+ rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+ return 0;
+err_configure_peer:
+ unregister_netdevice(peer);
+ return err;
+err_register_peer:
+ free_netdev(peer);
+ return err;
+}
+
+static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
+ bool bundle_fallback)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct bpf_mprog_entry *entry;
+
+ ASSERT_RTNL();
+ entry = rcu_dereference_rtnl(nk->active);
+ if (entry)
+ return entry;
+ if (bundle_fallback)
+ return &nk->bundle.a;
+ return NULL;
+}
+
+static void netkit_entry_update(struct net_device *dev,
+ struct bpf_mprog_entry *entry)
+{
+ struct netkit *nk = netkit_priv(dev);
+
+ ASSERT_RTNL();
+ rcu_assign_pointer(nk->active, entry);
+}
+
+static void netkit_entry_sync(void)
+{
+ synchronize_rcu();
+}
+
+static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
+{
+ struct net_device *dev;
+ struct netkit *nk;
+
+ ASSERT_RTNL();
+
+ switch (which) {
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+ if (dev->netdev_ops != &netkit_netdev_ops)
+ return ERR_PTR(-ENXIO);
+
+ nk = netkit_priv(dev);
+ if (!nk->primary)
+ return ERR_PTR(-EACCES);
+ if (which == BPF_NETKIT_PEER) {
+ dev = rcu_dereference_rtnl(nk->peer);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+ }
+ return dev;
+}
+
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+ attr->attach_type);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ entry = netkit_entry_fetch(dev, true);
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ netkit_entry_update(dev, entry_new);
+ netkit_entry_sync();
+ }
+ bpf_mprog_commit(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+ attr->attach_type);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ entry = netkit_entry_fetch(dev, false);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!bpf_mprog_total(entry_new))
+ entry_new = NULL;
+ netkit_entry_update(dev, entry_new);
+ netkit_entry_sync();
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = netkit_dev_fetch(current->nsproxy->net_ns,
+ attr->query.target_ifindex,
+ attr->query.attach_type);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static struct netkit_link *netkit_link(const struct bpf_link *link)
+{
+ return container_of(link, struct netkit_link, link);
+}
+
+static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
+ u32 id_or_fd, u64 revision)
+{
+ struct netkit_link *nkl = netkit_link(link);
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = nkl->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = netkit_entry_fetch(dev, true);
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ netkit_entry_update(dev, entry_new);
+ netkit_entry_sync();
+ }
+ bpf_mprog_commit(entry);
+ }
+ return ret;
+}
+
+static void netkit_link_release(struct bpf_link *link)
+{
+ struct netkit_link *nkl = netkit_link(link);
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = nkl->dev;
+ if (!dev)
+ goto out;
+ entry = netkit_entry_fetch(dev, false);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!bpf_mprog_total(entry_new))
+ entry_new = NULL;
+ netkit_entry_update(dev, entry_new);
+ netkit_entry_sync();
+ bpf_mprog_commit(entry);
+ nkl->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct netkit_link *nkl = netkit_link(link);
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = nkl->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = netkit_entry_fetch(dev, false);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void netkit_link_dealloc(struct bpf_link *link)
+{
+ kfree(netkit_link(link));
+}
+
+static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct netkit_link *nkl = netkit_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (nkl->dev)
+ ifindex = nkl->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ nkl->location,
+ nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer");
+}
+
+static int netkit_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct netkit_link *nkl = netkit_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (nkl->dev)
+ ifindex = nkl->dev->ifindex;
+ rtnl_unlock();
+
+ info->netkit.ifindex = ifindex;
+ info->netkit.attach_type = nkl->location;
+ return 0;
+}
+
+static int netkit_link_detach(struct bpf_link *link)
+{
+ netkit_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops netkit_link_lops = {
+ .release = netkit_link_release,
+ .detach = netkit_link_detach,
+ .dealloc = netkit_link_dealloc,
+ .update_prog = netkit_link_update,
+ .show_fdinfo = netkit_link_fdinfo,
+ .fill_link_info = netkit_link_fill_info,
+};
+
+static int netkit_link_init(struct netkit_link *nkl,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
+ &netkit_link_lops, prog);
+ nkl->location = attr->link_create.attach_type;
+ nkl->dev = dev;
+ return bpf_link_prime(&nkl->link, link_primer);
+}
+
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct netkit_link *nkl;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = netkit_dev_fetch(current->nsproxy->net_ns,
+ attr->link_create.target_ifindex,
+ attr->link_create.attach_type);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT);
+ if (!nkl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(nkl);
+ goto out;
+ }
+ ret = netkit_link_prog_attach(&nkl->link,
+ attr->link_create.flags,
+ attr->link_create.netkit.relative_fd,
+ attr->link_create.netkit.expected_revision);
+ if (ret) {
+ nkl->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void netkit_release_all(struct net_device *dev)
+{
+ struct bpf_mprog_entry *entry;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ entry = netkit_entry_fetch(dev, false);
+ if (!entry)
+ return;
+ netkit_entry_update(dev, NULL);
+ netkit_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ netkit_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ }
+}
+
+static void netkit_uninit(struct net_device *dev)
+{
+ netkit_release_all(dev);
+}
+
+static void netkit_del_link(struct net_device *dev, struct list_head *head)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+
+ RCU_INIT_POINTER(nk->peer, NULL);
+ unregister_netdevice_queue(dev, head);
+ if (peer) {
+ nk = netkit_priv(peer);
+ RCU_INIT_POINTER(nk->peer, NULL);
+ unregister_netdevice_queue(peer, head);
+ }
+}
+
+static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+ enum netkit_action policy;
+ struct nlattr *attr;
+ int err;
+
+ if (!nk->primary) {
+ NL_SET_ERR_MSG(extack,
+ "netkit link settings can be changed only through the primary device");
+ return -EACCES;
+ }
+
+ if (data[IFLA_NETKIT_MODE]) {
+ NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
+ "netkit link operating mode cannot be changed after device creation");
+ return -EACCES;
+ }
+
+ if (data[IFLA_NETKIT_POLICY]) {
+ attr = data[IFLA_NETKIT_POLICY];
+ policy = nla_get_u32(attr);
+ err = netkit_check_policy(policy, attr, extack);
+ if (err)
+ return err;
+ WRITE_ONCE(nk->policy, policy);
+ }
+
+ if (data[IFLA_NETKIT_PEER_POLICY]) {
+ err = -EOPNOTSUPP;
+ attr = data[IFLA_NETKIT_PEER_POLICY];
+ policy = nla_get_u32(attr);
+ if (peer)
+ err = netkit_check_policy(policy, attr, extack);
+ if (err)
+ return err;
+ nk = netkit_priv(peer);
+ WRITE_ONCE(nk->policy, policy);
+ }
+
+ return 0;
+}
+
+static size_t netkit_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
+ nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
+ 0;
+}
+
+static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct net_device *peer = rtnl_dereference(nk->peer);
+
+ if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
+ return -EMSGSIZE;
+
+ if (peer) {
+ nk = netkit_priv(peer);
+ if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
+ [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) },
+ [IFLA_NETKIT_POLICY] = { .type = NLA_U32 },
+ [IFLA_NETKIT_MODE] = { .type = NLA_U32 },
+ [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 },
+ [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT,
+ .reject_message = "Primary attribute is read-only" },
+};
+
+static struct rtnl_link_ops netkit_link_ops = {
+ .kind = DRV_NAME,
+ .priv_size = sizeof(struct netkit),
+ .setup = netkit_setup,
+ .newlink = netkit_new_link,
+ .dellink = netkit_del_link,
+ .changelink = netkit_change_link,
+ .get_link_net = netkit_get_link_net,
+ .get_size = netkit_get_size,
+ .fill_info = netkit_fill_info,
+ .policy = netkit_policy,
+ .validate = netkit_validate,
+ .maxtype = IFLA_NETKIT_MAX,
+};
+
+static __init int netkit_init(void)
+{
+ BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
+ (int)NETKIT_PASS != (int)TCX_PASS ||
+ (int)NETKIT_DROP != (int)TCX_DROP ||
+ (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
+
+ return rtnl_link_register(&netkit_link_ops);
+}
+
+static __exit void netkit_exit(void)
+{
+ rtnl_link_unregister(&netkit_link_ops);
+}
+
+module_init(netkit_init);
+module_exit(netkit_exit);
+
+MODULE_DESCRIPTION("BPF-programmable network device");
+MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
+MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);