aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
commit9d31d2338950293ec19d9b095fbaa9030899dcb4 (patch)
treee688040d0557c24a2eeb9f6c9c223d949f6f7ef9 /net/core
parentMerge tag 'x86-mm-2021-04-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentnet: selftest: fix build issue if INET is disabled (diff)
downloadlinux-dev-9d31d2338950293ec19d9b095fbaa9030899dcb4.tar.xz
linux-dev-9d31d2338950293ec19d9b095fbaa9030899dcb4.zip
Merge tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - bpf: - allow bpf programs calling kernel functions (initially to reuse TCP congestion control implementations) - enable task local storage for tracing programs - remove the need to store per-task state in hash maps, and allow tracing programs access to task local storage previously added for BPF_LSM - add bpf_for_each_map_elem() helper, allowing programs to walk all map elements in a more robust and easier to verify fashion - sockmap: support UDP and cross-protocol BPF_SK_SKB_VERDICT redirection - lpm: add support for batched ops in LPM trie - add BTF_KIND_FLOAT support - mostly to allow use of BTF on s390 which has floats in its header files - improve BPF syscall documentation and extend the use of kdoc parsing scripts we already employ for bpf-helpers - libbpf, bpftool: support static linking of BPF ELF files - improve support for encapsulation of L2 packets - xdp: restructure redirect actions to avoid a runtime lookup, improving performance by 4-8% in microbenchmarks - xsk: build skb by page (aka generic zerocopy xmit) - improve performance of software AF_XDP path by 33% for devices which don't need headers in the linear skb part (e.g. virtio) - nexthop: resilient next-hop groups - improve path stability on next-hops group changes (incl. 
offload for mlxsw) - ipv6: segment routing: add support for IPv4 decapsulation - icmp: add support for RFC 8335 extended PROBE messages - inet: use bigger hash table for IP ID generation - tcp: deal better with delayed TX completions - make sure we don't give up on fast TCP retransmissions only because driver is slow in reporting that it completed transmitting the original - tcp: reorder tcp_congestion_ops for better cache locality - mptcp: - add sockopt support for common TCP options - add support for common TCP msg flags - include multiple address ids in RM_ADDR - add reset option support for resetting one subflow - udp: GRO L4 improvements - improve 'forward' / 'frag_list' co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic - micro-optimize dev_gro_receive() and flow dissection, avoid retpoline overhead on VLAN and TEB GRO - use less memory for sysctls, add a new sysctl type, to allow using u8 instead of "int" and "long" and shrink networking sysctls - veth: allow GRO without XDP - this allows aggregating UDP packets before handing them off to routing, bridge, OvS, etc. - allow specifying ifindex when device is moved to another namespace - netfilter: - nft_socket: add support for cgroupsv2 - nftables: add catch-all set element - special element used to define a default action in case normal lookup missed - use net_generic infra in many modules to avoid allocating per-ns memory unnecessarily - xps: improve the xps handling to avoid potential out-of-bound accesses and use-after-free when XPS change race with other re-configuration under traffic - add a config knob to turn off per-cpu netdev refcnt to catch underflows in testing Device APIs: - add WWAN subsystem to organize the WWAN interfaces better and hopefully start driving towards more unified and vendor- independent APIs - ethtool: - add interface for reading IEEE MIB stats (incl. 
mlx5 and bnxt support) - allow network drivers to dump arbitrary SFP EEPROM data, current offset+length API was a poor fit for modern SFP which define EEPROM in terms of pages (incl. mlx5 support) - act_police, flow_offload: add support for packet-per-second policing (incl. offload for nfp) - psample: add additional metadata attributes like transit delay for packets sampled from switch HW (and corresponding egress and policy-based sampling in the mlxsw driver) - dsa: improve support for sandwiched LAGs with bridge and DSA - netfilter: - flowtable: use direct xmit in topologies with IP forwarding, bridging, vlans etc. - nftables: counter hardware offload support - Bluetooth: - improvements for firmware download w/ Intel devices - add support for reading AOSP vendor capabilities - add support for virtio transport driver - mac80211: - allow concurrent monitor iface and ethernet rx decap - set priority and queue mapping for injected frames - phy: add support for Clause-45 PHY Loopback - pci/iov: add sysfs MSI-X vector assignment interface to distribute MSI-X resources to VFs (incl. mlx5 support) New hardware/drivers: - dsa: mv88e6xxx: add support for Marvell mv88e6393x - 11-port Ethernet switch with 8x 1-Gigabit Ethernet and 3x 10-Gigabit interfaces. 
- dsa: support for legacy Broadcom tags used on BCM5325, BCM5365 and BCM63xx switches - Microchip KSZ8863 and KSZ8873; 3x 10/100Mbps Ethernet switches - ath11k: support for QCN9074 a 802.11ax device - Bluetooth: Broadcom BCM4330 and BCM4334 - phy: Marvell 88X2222 transceiver support - mdio: add BCM6368 MDIO mux bus controller - r8152: support RTL8153 and RTL8156 (USB Ethernet) chips - mana: driver for Microsoft Azure Network Adapter (MANA) - Actions Semi Owl Ethernet MAC - can: driver for ETAS ES58X CAN/USB interfaces Pure driver changes: - add XDP support to: enetc, igc, stmmac - add AF_XDP support to: stmmac - virtio: - page_to_skb() use build_skb when there's sufficient tailroom (21% improvement for 1000B UDP frames) - support XDP even without dedicated Tx queues - share the Tx queues with the stack when necessary - mlx5: - flow rules: add support for mirroring with conntrack, matching on ICMP, GTP, flex filters and more - support packet sampling with flow offloads - persist uplink representor netdev across eswitch mode changes - allow coexistence of CQE compression and HW time-stamping - add ethtool extended link error state reporting - ice, iavf: support flow filters, UDP Segmentation Offload - dpaa2-switch: - move the driver out of staging - add spanning tree (STP) support - add rx copybreak support - add tc flower hardware offload on ingress traffic - ionic: - implement Rx page reuse - support HW PTP time-stamping - octeon: support TC hardware offloads - flower matching on ingress and egress ratelimiting. - stmmac: - add RX frame steering based on VLAN priority in tc flower - support frame preemption (FPE) - intel: add cross time-stamping freq difference adjustment - ocelot: - support forwarding of MRP frames in HW - support multiple bridges - support PTP Sync one-step timestamping - dsa: mv88e6xxx, dpaa2-switch: offload bridge port flags like learning, flooding etc. 
- ipa: add IPA v4.5, v4.9 and v4.11 support (Qualcomm SDX55, SM8350, SC7280 SoCs) - mt7601u: enable TDLS support - mt76: - add support for 802.3 rx frames (mt7915/mt7615) - mt7915 flash pre-calibration support - mt7921/mt7663 runtime power management fixes" * tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2451 commits) net: selftest: fix build issue if INET is disabled net: netrom: nr_in: Remove redundant assignment to ns net: tun: Remove redundant assignment to ret net: phy: marvell: add downshift support for M88E1240 net: dsa: ksz: Make reg_mib_cnt a u8 as it never exceeds 255 net/sched: act_ct: Remove redundant ct get and check icmp: standardize naming of RFC 8335 PROBE constants bpf, selftests: Update array map tests for per-cpu batched ops bpf: Add batched ops support for percpu array bpf: Implement formatted output helpers with bstr_printf seq_file: Add a seq_bprintf function sfc: adjust efx->xdp_tx_queue_count with the real number of initialized queues net:nfc:digital: Fix a double free in digital_tg_recv_dep_req net: fix a concurrency bug in l2tp_tunnel_register() net/smc: Remove redundant assignment to rc mpls: Remove redundant assignment to err llc2: Remove redundant assignment to rc net/tls: Remove redundant initialization of record rds: Remove redundant assignment to nr_sig dt-bindings: net: mdio-gpio: add compatible for microchip,mdio-smi0 ...
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile7
-rw-r--r--net/core/bpf_sk_storage.c2
-rw-r--r--net/core/dev.c404
-rw-r--r--net/core/dev_addr_lists.c4
-rw-r--r--net/core/devlink.c11
-rw-r--r--net/core/drop_monitor.c2
-rw-r--r--net/core/filter.c276
-rw-r--r--net/core/flow_dissector.c47
-rw-r--r--net/core/neighbour.c4
-rw-r--r--net/core/net-procfs.c3
-rw-r--r--net/core/net-sysfs.c177
-rw-r--r--net/core/netevent.c2
-rw-r--r--net/core/rtnetlink.c15
-rw-r--r--net/core/scm.c23
-rw-r--r--net/core/selftests.c400
-rw-r--r--net/core/skbuff.c55
-rw-r--r--net/core/skmsg.c383
-rw-r--r--net/core/sock.c2
-rw-r--r--net/core/sock_map.c194
-rw-r--r--net/core/sysctl_net_core.c10
20 files changed, 1309 insertions, 712 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 3e2c378e5f31..f7f16650fe9e 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,22 +16,25 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_SELFTESTS) += selftests.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 4edd033e899c..cc3712ad8716 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, NULL);
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1f79b9aa9a3f..222b1d322c96 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
+
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
+{
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ .daddr = daddr,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
@@ -2463,16 +2509,14 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
-struct static_key xps_needed __read_mostly;
-EXPORT_SYMBOL(xps_needed);
-struct static_key xps_rxqs_needed __read_mostly;
-EXPORT_SYMBOL(xps_rxqs_needed);
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
- int tci, u16 index)
+ struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
@@ -2491,6 +2535,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
@@ -2503,7 +2549,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
- int num_tc = dev->num_tc ? : 1;
+ int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
@@ -2511,7 +2557,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
int i, j;
for (i = count, j = offset; i--; j++) {
- if (!remove_xps_queue(dev_maps, tci, j))
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
@@ -2523,74 +2569,54 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
static void reset_xps_maps(struct net_device *dev,
struct xps_dev_maps *dev_maps,
- bool is_rxqs_map)
+ enum xps_map_type type)
{
- if (is_rxqs_map) {
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- }
static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
kfree_rcu(dev_maps, rcu);
}
-static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
- struct xps_dev_maps *dev_maps, unsigned int nr_ids,
- u16 offset, u16 count, bool is_rxqs_map)
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
{
+ struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
- for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
- j < nr_ids;)
- active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
- count);
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
+
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
- if (!is_rxqs_map) {
- for (i = offset + (count - 1); count--; i--) {
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
- }
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
- const unsigned long *possible_mask = NULL;
- struct xps_dev_maps *dev_maps;
- unsigned int nr_ids;
-
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
- if (static_key_false(&xps_rxqs_needed)) {
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
- if (dev_maps) {
- nr_ids = dev->num_rx_queues;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
- offset, count, true);
- }
- }
-
- dev_maps = xmap_dereference(dev->xps_cpus_map);
- if (!dev_maps)
- goto out_no_maps;
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
- if (num_possible_cpus() > 1)
- possible_mask = cpumask_bits(cpu_possible_mask);
- nr_ids = nr_cpu_ids;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
- false);
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
-out_no_maps:
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
@@ -2640,16 +2666,35 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
return new_map;
}
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
- const unsigned long *online_mask = NULL, *possible_mask = NULL;
- struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- bool active = false;
unsigned int nr_ids;
if (dev->num_tc) {
@@ -2667,38 +2712,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
}
mutex_lock(&xps_map_mutex);
- if (is_rxqs_map) {
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
- if (num_possible_cpus() > 1) {
+ if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
- possible_mask = cpumask_bits(cpu_possible_mask);
- }
- dev_maps = xmap_dereference(dev->xps_cpus_map);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
- if (!new_dev_maps)
- new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
- mutex_unlock(&xps_map_mutex);
- return -ENOMEM;
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
- NULL;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
- map = expand_xps_map(map, j, index, is_rxqs_map);
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
@@ -2711,29 +2766,21 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (!dev_maps) {
/* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
+ if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- /* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
- /* We need to explicitly update tci as prevous loop
- * could break out early if dev_maps is NULL.
- */
tci = j * num_tc + tc;
-
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
+ skip_tc = true;
+
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2741,78 +2788,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
- } else if (dev_maps) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- /* copy maps belonging to foreign traffic classes */
- for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
}
- if (is_rxqs_map)
- rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
- else
- rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = num_tc, tci = j * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
- if (map && map != new_map)
- kfree_rcu(map, rcu);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
}
}
- kfree_rcu(dev_maps, rcu);
+ old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
- }
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = tc, tci = j * num_tc; i--; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
- if (!netif_attr_test_mask(j, mask, nr_ids) ||
- !netif_attr_test_online(j, online_mask, nr_ids))
- active |= remove_xps_queue(dev_maps, tci, index);
- for (i = num_tc - tc, tci++; --i; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
}
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
/* free map if not active */
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
@@ -2820,11 +2870,10 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
+ for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
- map = dev_maps ?
+ map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
@@ -2845,7 +2894,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
int ret;
cpus_read_lock();
- ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
@@ -3956,13 +4005,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
@@ -3993,18 +4044,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
- if (tci >= 0 && tci < dev->num_rx_queues)
+ if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
- dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
@@ -4672,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
u32 metalen, act = XDP_DROP;
+ bool orig_bcast, orig_host;
u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
- bool orig_bcast;
int off;
/* Reinjected packets coming from act_mirred or similar should
@@ -4722,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
+ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
orig_eth_type = eth->h_proto;
@@ -4749,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
+ (orig_host != ether_addr_equal_64bits(eth->h_dest,
+ skb->dev->dev_addr)) ||
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
__skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
}
@@ -5284,6 +5339,7 @@ skip_classify:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
+ break;
case RX_HANDLER_PASS:
break;
default:
@@ -5876,15 +5932,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
}
EXPORT_SYMBOL(napi_gro_flush);
-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
struct sk_buff *p;
- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
list_for_each_entry(p, head, list) {
unsigned long diffs;
@@ -5910,11 +5964,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
-
- return head;
}
-static void skb_gro_reset_offset(struct sk_buff *skb)
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
@@ -5925,7 +5977,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
if (!skb_headlen(skb) && pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0)) &&
- (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
@@ -5975,11 +6027,11 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
@@ -5988,7 +6040,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (netif_elide_gro(skb->dev))
goto normal;
- gro_head = gro_list_prepare(napi, skb);
+ gro_list_prepare(&gro_list->list, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -6024,7 +6076,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
- gro_head, skb);
+ &gro_list->list, skb);
break;
}
rcu_read_unlock();
@@ -6043,7 +6095,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
- napi->gro_hash[hash].count--;
+ gro_list->count--;
}
if (same_flow)
@@ -6052,16 +6104,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;
- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(napi, gro_head);
- } else {
- napi->gro_hash[hash].count++;
- }
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(napi, &gro_list->list);
+ else
+ gro_list->count++;
+
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, gro_head);
+ list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
pull:
@@ -6069,11 +6121,11 @@ pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
+ if (gro_list->count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
}
return ret;
@@ -6143,7 +6195,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
@@ -6232,7 +6284,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
napi->skb = NULL;
skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, hlen);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
@@ -6790,6 +6842,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
return err;
}
+EXPORT_SYMBOL(dev_set_threaded);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
@@ -10338,14 +10391,20 @@ EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
@@ -10370,7 +10429,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
rebroadcast_time = warning_time = jiffies;
refcnt = netdev_refcnt_read(dev);
- while (refcnt != 0) {
+ while (refcnt != 1) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -10407,7 +10466,9 @@ static void netdev_wait_allrefs(struct net_device *dev)
refcnt = netdev_refcnt_read(dev);
- if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
+ if (refcnt != 1 &&
+ time_after(jiffies, warning_time +
+ netdev_unregister_timeout_secs * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
warning_time = jiffies;
@@ -10483,7 +10544,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(netdev_refcnt_read(dev) != 1);
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
@@ -10700,9 +10761,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
+ dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
if (dev_addr_init(dev))
goto free_pcpu;
@@ -10766,8 +10832,10 @@ free_all:
return NULL;
free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
+#endif
netdev_freemem(dev);
return NULL;
}
@@ -10809,8 +10877,10 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+#endif
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
@@ -10998,11 +11068,13 @@ void unregister_netdev(struct net_device *dev)
EXPORT_SYMBOL(unregister_netdev);
/**
- * dev_change_net_namespace - move device to different nethost namespace
+ * __dev_change_net_namespace - move device to different nethost namespace
* @dev: device
* @net: network namespace
* @pat: If not NULL name pattern to try if the current device name
* is already taken in the destination network namespace.
+ * @new_ifindex: If not zero, specifies device index in the target
+ * namespace.
*
* This function shuts down a device interface and moves it
* to a new network namespace. On success 0 is returned, on
@@ -11011,10 +11083,11 @@ EXPORT_SYMBOL(unregister_netdev);
* Callers must hold the rtnl semaphore.
*/
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat, int new_ifindex)
{
struct net *net_old = dev_net(dev);
- int err, new_nsid, new_ifindex;
+ int err, new_nsid;
ASSERT_RTNL();
@@ -11045,6 +11118,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
goto out;
}
+ /* Check that new_ifindex isn't used yet. */
+ err = -EBUSY;
+ if (new_ifindex && __dev_get_by_index(net, new_ifindex))
+ goto out;
+
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
@@ -11072,10 +11150,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
/* If there is an ifindex conflict assign a new one */
- if (__dev_get_by_index(net, dev->ifindex))
- new_ifindex = dev_new_index(net);
- else
- new_ifindex = dev->ifindex;
+ if (!new_ifindex) {
+ if (__dev_get_by_index(net, dev->ifindex))
+ new_ifindex = dev_new_index(net);
+ else
+ new_ifindex = dev->ifindex;
+ }
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
new_ifindex);
@@ -11128,7 +11208,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
out:
return err;
}
-EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index fa1c37ec40c9..45ae6eeb2964 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -228,7 +228,7 @@ EXPORT_SYMBOL(__hw_addr_unsync);
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
- * This funciton is intended to be called from the ndo_set_rx_mode
+ * This function is intended to be called from the ndo_set_rx_mode
* function of devices that require explicit address add/remove
* notifications. The unsync function may be NULL in which case
* the addresses requiring removal will simply be removed without
@@ -723,7 +723,7 @@ void dev_uc_flush(struct net_device *dev)
EXPORT_SYMBOL(dev_uc_flush);
/**
- * dev_uc_flush - Init unicast address list
+ * dev_uc_init - Init unicast address list
* @dev: device
*
* Init unicast address list.
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 737b61c2976e..4eb969518ee0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
* @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @sf: associated SF of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
*/
void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
- u16 pf, u32 sf)
+ u16 pf, u32 sf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
@@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
attrs->pci_sf.controller = controller;
attrs->pci_sf.pf = pf;
attrs->pci_sf.sf = sf;
+ attrs->pci_sf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
@@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (attrs->pci_sf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
attrs->pci_sf.sf);
break;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index db65ce62b625..ead2a8aa57b4 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1754,7 +1754,7 @@ static void exit_net_drop_monitor(void)
/*
* Because of the module_get/put we do in the trace state change path
- * we are guarnateed not to have any current users when we get here
+ * we are guaranteed not to have any current users when we get here
*/
for_each_possible_cpu(cpu) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 9323d34d34cc..cae56d08a670 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = {
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -3412,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK))
@@ -3448,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
if (skb->encapsulation)
return -EALREADY;
@@ -3466,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
- skb_set_inner_protocol(skb, skb->protocol);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
@@ -3577,7 +3583,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
return -ENOMEM;
__skb_pull(skb, len_diff_abs);
}
- bpf_compute_data_end_sk_skb(skb);
if (tls_sw_has_ctx_rx(skb->sk)) {
struct strp_msg *rxm = strp_msg(skb);
@@ -3742,10 +3747,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3808,10 +3810,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -3919,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map, struct xdp_buff *xdp)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return dev_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_CPUMAP:
- return cpu_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_redirect(fwd, xdp);
- default:
- return -EBADRQC;
- }
- return 0;
-}
-
void xdp_do_flush(void)
{
__dev_flush();
@@ -3944,71 +3926,52 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- return __dev_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return __dev_map_hash_lookup_elem(map, index);
- case BPF_MAP_TYPE_CPUMAP:
- return __cpu_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_lookup_elem(map, index);
- default:
- return NULL;
- }
-}
-
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
+ enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
int err;
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- if (unlikely(!map)) {
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = __xsk_map_redirect(fwd, xdp);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdp, dev);
+ break;
}
-
- err = dev_xdp_enqueue(fwd, xdp, dev);
- } else {
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ fallthrough;
+ default:
+ err = -EBADRQC;
}
if (unlikely(err))
goto err;
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -4017,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog,
- struct bpf_map *map)
+ void *fwd,
+ enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->tgt_index;
- void *fwd = ri->tgt_value;
- int err = 0;
-
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
-
- if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- struct bpf_dtab_netdev *dst = fwd;
+ int err;
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
if (unlikely(err))
goto err;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- struct xdp_sock *xs = fwd;
-
- err = xsk_generic_rcv(xs, xdp);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
if (err)
goto err;
consume_skb(skb);
- } else {
+ break;
+ default:
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
@@ -4059,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
- struct net_device *fwd;
- int err = 0;
-
- if (map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
- map);
- ri->tgt_index = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ int err;
- err = xdp_ok_fwd_dev(fwd, skb->len);
- if (unlikely(err))
- goto err;
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
- generic_xdp_tx(skb, xdp_prog);
- return 0;
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
}
@@ -4094,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->flags = flags;
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
ri->tgt_index = ifindex;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
return XDP_REDIRECT;
}
@@ -4113,28 +4076,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
- /* Lower bits of the flags are used as return code on lookup failure */
- if (unlikely(flags > XDP_TX))
- return XDP_ABORTED;
-
- ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
- if (unlikely(!ri->tgt_value)) {
- /* If the lookup fails we want to clear out the state in the
- * redirect_info struct completely, so that if an eBPF program
- * performs multiple lookups, the last one always takes
- * precedence.
- */
- WRITE_ONCE(ri->map, NULL);
- return flags;
- }
-
- ri->flags = flags;
- ri->tgt_index = ifindex;
- WRITE_ONCE(ri->map, map);
-
- return XDP_REDIRECT;
+ return map->ops->map_redirect(map, ifindex, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -4787,6 +4729,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
+ break;
default:
ret = -EINVAL;
}
@@ -4956,6 +4901,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
case SO_BINDTOIFINDEX:
*((int *)optval) = sk->sk_bound_dev_if;
break;
+ case SO_REUSEPORT:
+ *((int *)optval) = sk->sk_reuseport;
+ break;
default:
goto err_clear;
}
@@ -9663,22 +9611,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* si->dst_reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+ /* si->dst_reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
- off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
- off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ insn = bpf_convert_data_end_access(si, insn);
break;
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
@@ -9847,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
+ .check_kfunc_call = bpf_prog_test_check_kfunc_call,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10457,6 +10424,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
}
const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
};
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a96a4f5de0ce..3ed7c98a98e1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
* is the protocol port offset returned from proto_ports_offset
*/
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen)
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type)
*/
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp,
- void *data, int thoff, int hlen)
+ const void *data, int thoff, int hlen)
{
struct icmphdr *ih, _ih;
@@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci);
*/
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_icmp *key_icmp;
@@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash);
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen,
- int lse_index, bool *entropy_label)
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
{
struct mpls_label *hdr, _hdr;
u32 entry, label, bos;
@@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
{
struct flow_dissector_key_arp *key_arp;
struct {
@@ -523,7 +524,7 @@ static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
- void *target_container, void *data,
+ void *target_container, const void *data,
__be16 *p_proto, int *p_nhoff, int *p_hlen,
unsigned int flags)
{
@@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
- void *data, __be16 *p_proto, int *p_nhoff, int hlen,
- unsigned int flags)
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
{
struct {
struct batadv_unicast_packet batadv_unicast;
@@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb,
static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_tcp *key_tcp;
struct tcphdr *th, _th;
@@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff,
- u8 ip_proto, int hlen)
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
{
enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
struct flow_dissector_key_ports *key_ports;
@@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct iphdr *iph)
+ void *target_container, const void *data,
+ const struct iphdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct ipv6hdr *iph)
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -828,8 +832,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_addrs = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -908,9 +914,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
@@ -1642,7 +1647,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8379719d1dce..98f20efbfadf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n)
write_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
/* remove from the gc list if new state is permanent or if neighbor
* is externally learned; otherwise entry should be on the gc list
*/
@@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n)
atomic_inc(&n->tbl->gc_entries);
}
+out:
write_unlock(&n->lock);
write_unlock_bh(&n->tbl->lock);
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index c714e6a9dad4..d8b9dbabd4a4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -10,9 +10,6 @@
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-extern struct list_head ptype_all __read_mostly;
-extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-
static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
{
struct net *net = seq_file_net(seq);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 307628fdf380..f6197774048b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1361,83 +1361,94 @@ static const struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+ int tc, char *buf, enum xps_map_type type)
{
- int cpu, len, ret, num_tc = 1, tc = 0;
- struct net_device *dev = queue->dev;
struct xps_dev_maps *dev_maps;
- cpumask_var_t mask;
- unsigned long index;
-
- if (!netif_is_multiqueue(dev))
- return -ENOENT;
+ unsigned long *mask;
+ unsigned int nr_ids;
+ int j, len;
- index = get_netdev_queue_index(queue);
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps[type]);
- if (!rtnl_trylock())
- return restart_syscall();
+ /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+ * when dev_maps hasn't been allocated yet, to be backward compatible.
+ */
+ nr_ids = dev_maps ? dev_maps->nr_ids :
+ (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
- if (dev->num_tc) {
- /* Do not allow XPS on subordinate device directly */
- num_tc = dev->num_tc;
- if (num_tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
+ mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
+ if (!mask) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
- /* If queue belongs to subordinate dev use its map */
- dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ if (!dev_maps || tc >= dev_maps->num_tc)
+ goto out_no_maps;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
- }
+ for (j = 0; j < nr_ids; j++) {
+ int i, tci = j * dev_maps->num_tc + tc;
+ struct xps_map *map;
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto err_rtnl_unlock;
- }
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_cpus_map);
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- int i, tci = cpu * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- cpumask_set_cpu(cpu, mask);
- break;
- }
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ set_bit(j, mask);
+ break;
}
}
}
+out_no_maps:
rcu_read_unlock();
- rtnl_unlock();
+ len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+ bitmap_free(mask);
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
- free_cpumask_var(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int len, tc;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
-err_rtnl_unlock:
+ index = get_netdev_queue_index(queue);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ /* Make sure the subordinate device can't be freed */
+ get_device(&dev->dev);
rtnl_unlock();
- return ret;
+
+ len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+ put_device(&dev->dev);
+ return len;
}
static ssize_t xps_cpus_store(struct netdev_queue *queue,
const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
- unsigned long index;
+ unsigned int index;
cpumask_var_t mask;
int err;
@@ -1476,64 +1487,21 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
- int j, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- unsigned long *mask, index;
+ unsigned int index;
+ int tc;
index = get_netdev_queue_index(queue);
if (!rtnl_trylock())
return restart_syscall();
- if (dev->num_tc) {
- num_tc = dev->num_tc;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
- }
- mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
- if (!mask) {
- ret = -ENOMEM;
- goto err_rtnl_unlock;
- }
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_rxqs_map);
- if (!dev_maps)
- goto out_no_maps;
-
- for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
- j < dev->num_rx_queues;) {
- int i, tci = j * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- set_bit(j, mask);
- break;
- }
- }
- }
-out_no_maps:
- rcu_read_unlock();
-
+ tc = netdev_txq_to_tc(dev, index);
rtnl_unlock();
+ if (tc < 0)
+ return -EINVAL;
- len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
- bitmap_free(mask);
-
- return len < PAGE_SIZE ? len : -EINVAL;
-
-err_rtnl_unlock:
- rtnl_unlock();
- return ret;
+ return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1541,7 +1509,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
{
struct net_device *dev = queue->dev;
struct net *net = dev_net(dev);
- unsigned long *mask, index;
+ unsigned long *mask;
+ unsigned int index;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1565,7 +1534,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
}
cpus_read_lock();
- err = __netif_set_xps_queue(dev, mask, index, true);
+ err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
cpus_read_unlock();
rtnl_unlock();
diff --git a/net/core/netevent.c b/net/core/netevent.c
index d76ed7739c70..5bb615e963cc 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -32,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(register_netevent_notifier);
/**
- * netevent_unregister_notifier - unregister a netevent notifier block
+ * unregister_netevent_notifier - unregister a netevent notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3485b16a7ff3..714d5fa38546 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1877,6 +1877,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
.len = ALTIFNAMSIZ - 1 },
[IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
+ [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2603,14 +2604,22 @@ static int do_setlink(const struct sk_buff *skb,
return err;
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
- struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
- tb, CAP_NET_ADMIN);
+ struct net *net;
+ int new_ifindex;
+
+ net = rtnl_link_get_net_capable(skb, dev_net(dev),
+ tb, CAP_NET_ADMIN);
if (IS_ERR(net)) {
err = PTR_ERR(net);
goto errout;
}
- err = dev_change_net_namespace(dev, net, ifname);
+ if (tb[IFLA_NEW_IFINDEX])
+ new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
+ else
+ new_ifindex = 0;
+
+ err = __dev_change_net_namespace(dev, net, ifname, new_ifindex);
put_net(net);
if (err)
goto errout;
diff --git a/net/core/scm.c b/net/core/scm.c
index 8156d4fb8a39..ae3085d9aae8 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -228,14 +228,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
if (msg->msg_control_is_user) {
struct cmsghdr __user *cm = msg->msg_control_user;
- struct cmsghdr cmhdr;
-
- cmhdr.cmsg_level = level;
- cmhdr.cmsg_type = type;
- cmhdr.cmsg_len = cmlen;
- if (copy_to_user(cm, &cmhdr, sizeof cmhdr) ||
- copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm)))
- return -EFAULT;
+
+ if (!user_write_access_begin(cm, cmlen))
+ goto efault;
+
+ unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
+ unsafe_put_user(level, &cm->cmsg_level, efault_end);
+ unsafe_put_user(type, &cm->cmsg_type, efault_end);
+ unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
+ cmlen - sizeof(*cm), efault_end);
+ user_write_access_end();
} else {
struct cmsghdr *cm = msg->msg_control;
@@ -249,6 +251,11 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
}
EXPORT_SYMBOL(put_cmsg);
diff --git a/net/core/selftests.c b/net/core/selftests.c
new file mode 100644
index 000000000000..ba7b0171974c
--- /dev/null
+++ b/net/core/selftests.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support
+ *
+ * Author: Jose Abreu <joabreu@synopsys.com>
+ *
+ * Ported from stmmac by:
+ * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de>
+ */
+
+#include <linux/phy.h>
+#include <net/selftests.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+struct net_packet_attrs { /* Parameters describing one self-test frame */
+ unsigned char *src; /* source MAC; h_source is not written when NULL */
+ unsigned char *dst; /* destination MAC; h_dest stays all-zero when NULL */
+ u32 ip_src; /* IPv4 source address, host byte order */
+ u32 ip_dst; /* IPv4 destination address, host byte order */
+ bool tcp; /* build a TCP segment when true, a UDP datagram otherwise */
+ u16 sport; /* L4 source port, host byte order */
+ u16 dport; /* L4 destination port, host byte order */
+ int timeout; /* loopback wait in jiffies; 0 selects NET_LB_TIMEOUT */
+ int size; /* payload bytes appended after struct netsfhdr */
+ int max_size; /* if non-zero, the frame is padded up to this many bytes */
+ u8 id; /* written by net_test_get_skb(), compared on receive */
+ u16 queue_mapping; /* queue id handed to dev_direct_xmit() */
+};
+
+struct net_test_priv { /* State shared between the sender and the receive handler */
+ struct net_packet_attrs *packet; /* attributes of the frame being waited for */
+ struct packet_type pt; /* handler registered with dev_add_pack() */
+ struct completion comp; /* completed when a matching frame is received */
+ int double_vlan; /* non-zero: receive handler looks for the IP header 4 bytes further in */
+ int vlan_id; /* NOTE(review): not referenced anywhere in this file */
+ int ok; /* set to true by the receive handler on a match */
+};
+
+struct netsfhdr { /* Marker header placed directly after the TCP/UDP header */
+ __be32 version; /* always written as 0 by net_test_get_skb() */
+ __be64 magic; /* NET_TEST_PKT_MAGIC, stored big-endian */
+ u8 id; /* per-frame id taken from net_test_next_id */
+} __packed;
+
+static u8 net_test_next_id; /* id stamped into the next generated frame; reset by net_selftest() */
+
+#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
+ sizeof(struct netsfhdr)) /* L2 + L3 + marker header; the L4 header is added separately */
+#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL /* value checked by the receive handler */
+#define NET_LB_TIMEOUT msecs_to_jiffies(200) /* default loopback wait: 200 ms */
+
+/*
+ * net_test_get_skb - build one self-test frame for @ndev
+ * @ndev: device the skb is allocated for and assigned to
+ * @attr: frame description; @attr->id receives the id stamped into the frame
+ *
+ * Frame layout: Ethernet | IPv4 | TCP or UDP | struct netsfhdr | payload/pad.
+ * The skb is marked CHECKSUM_PARTIAL; for TCP only the pseudo-header sum is
+ * seeded here and csum_start/csum_offset point at the TCP checksum field.
+ *
+ * Every byte that ends up on the wire is initialised: unset TCP header
+ * fields, the payload, the padding and an unspecified source MAC are zeroed
+ * rather than left as whatever the allocator returned.
+ *
+ * Returns the new skb, or NULL if the allocation fails.
+ */
+static struct sk_buff *net_test_get_skb(struct net_device *ndev,
+					struct net_packet_attrs *attr)
+{
+	struct sk_buff *skb = NULL;
+	struct udphdr *uhdr = NULL;
+	struct tcphdr *thdr = NULL;
+	struct netsfhdr *shdr;
+	struct ethhdr *ehdr;
+	struct iphdr *ihdr;
+	int iplen, size;
+
+	size = attr->size + NET_TEST_PKT_SIZE;
+
+	if (attr->tcp)
+		size += sizeof(struct tcphdr);
+	else
+		size += sizeof(struct udphdr);
+
+	if (attr->max_size && attr->max_size > size)
+		size = attr->max_size;
+
+	skb = netdev_alloc_skb(ndev, size);
+	if (!skb)
+		return NULL;
+
+	prefetchw(skb->data);
+
+	ehdr = skb_push(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+
+	skb_set_network_header(skb, skb->len);
+	ihdr = skb_put(skb, sizeof(*ihdr));
+
+	skb_set_transport_header(skb, skb->len);
+	if (attr->tcp)
+		thdr = skb_put(skb, sizeof(*thdr));
+	else
+		uhdr = skb_put(skb, sizeof(*uhdr));
+
+	/* Start from all-zero addresses so neither MAC is ever uninitialised. */
+	eth_zero_addr(ehdr->h_dest);
+	eth_zero_addr(ehdr->h_source);
+
+	if (attr->src)
+		ether_addr_copy(ehdr->h_source, attr->src);
+	if (attr->dst)
+		ether_addr_copy(ehdr->h_dest, attr->dst);
+
+	ehdr->h_proto = htons(ETH_P_IP);
+
+	if (attr->tcp) {
+		/* Clear seq/ack/flags/window/urg_ptr, which are not set below. */
+		memset(thdr, 0, sizeof(*thdr));
+		thdr->source = htons(attr->sport);
+		thdr->dest = htons(attr->dport);
+		thdr->doff = sizeof(struct tcphdr) / 4;
+		thdr->check = 0;
+	} else {
+		uhdr->source = htons(attr->sport);
+		uhdr->dest = htons(attr->dport);
+		uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size);
+		if (attr->max_size)
+			uhdr->len = htons(attr->max_size -
+					  (sizeof(*ihdr) + sizeof(*ehdr)));
+		uhdr->check = 0;
+	}
+
+	ihdr->ihl = 5;
+	ihdr->ttl = 32;
+	ihdr->version = 4;
+	if (attr->tcp)
+		ihdr->protocol = IPPROTO_TCP;
+	else
+		ihdr->protocol = IPPROTO_UDP;
+	iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size;
+	if (attr->tcp)
+		iplen += sizeof(*thdr);
+	else
+		iplen += sizeof(*uhdr);
+
+	if (attr->max_size)
+		iplen = attr->max_size - sizeof(*ehdr);
+
+	ihdr->tot_len = htons(iplen);
+	ihdr->frag_off = 0;
+	ihdr->saddr = htonl(attr->ip_src);
+	ihdr->daddr = htonl(attr->ip_dst);
+	ihdr->tos = 0;
+	ihdr->id = 0;
+	ip_send_check(ihdr);
+
+	shdr = skb_put(skb, sizeof(*shdr));
+	shdr->version = 0;
+	shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
+	attr->id = net_test_next_id;
+	shdr->id = net_test_next_id++;
+
+	/* Payload and padding carry no data; zero them instead of leaking
+	 * stale buffer contents onto the wire.
+	 */
+	if (attr->size)
+		skb_put_zero(skb, attr->size);
+	if (attr->max_size && attr->max_size > skb->len)
+		skb_put_zero(skb, attr->max_size - skb->len);
+
+	skb->csum = 0;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	if (attr->tcp) {
+		/* The pseudo-header length covers the TCP header and payload
+		 * only; skb->len also counts the Ethernet and IP headers
+		 * that were already added above.
+		 */
+		int l4len = skb->len - skb_transport_offset(skb);
+
+		thdr->check = ~tcp_v4_check(l4len, ihdr->saddr,
+					    ihdr->daddr, 0);
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct tcphdr, check);
+	} else {
+		udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr);
+	}
+
+	skb->protocol = htons(ETH_P_IP);
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = ndev;
+
+	return skb;
+}
+
+/*
+ * Receive hook installed through struct packet_type: checks whether @skb is
+ * the frame described by the net_test_priv stored in @pt->af_packet_priv
+ * (MAC addresses when given, L4 protocol, destination port, magic and id).
+ * On a match the waiter is completed. The skb is always consumed and the
+ * return value is always 0.
+ */
+static int net_test_loopback_validate(struct sk_buff *skb,
+				      struct net_device *ndev,
+				      struct packet_type *pt,
+				      struct net_device *orig_ndev)
+{
+	struct net_test_priv *tpriv = pt->af_packet_priv;
+	unsigned char *want_src = tpriv->packet->src;
+	unsigned char *want_dst = tpriv->packet->dst;
+	struct netsfhdr *marker;
+	struct ethhdr *eth;
+	struct iphdr *ip;
+	u8 *l4;
+
+	/* Get a private, linear copy before looking at the headers. */
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (!skb || skb_linearize(skb))
+		goto out;
+	if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN))
+		goto out;
+
+	eth = (struct ethhdr *)skb_mac_header(skb);
+	if (want_dst && !ether_addr_equal_unaligned(eth->h_dest, want_dst))
+		goto out;
+	if (want_src && !ether_addr_equal_unaligned(eth->h_source, want_src))
+		goto out;
+
+	if (tpriv->double_vlan)
+		ip = (struct iphdr *)(skb_network_header(skb) + 4);
+	else
+		ip = ip_hdr(skb);
+
+	l4 = (u8 *)ip + 4 * ip->ihl;
+
+	if (tpriv->packet->tcp) {
+		struct tcphdr *tcp = (struct tcphdr *)l4;
+
+		if (ip->protocol != IPPROTO_TCP ||
+		    tcp->dest != htons(tpriv->packet->dport))
+			goto out;
+
+		marker = (struct netsfhdr *)(tcp + 1);
+	} else {
+		struct udphdr *udp = (struct udphdr *)l4;
+
+		if (ip->protocol != IPPROTO_UDP ||
+		    udp->dest != htons(tpriv->packet->dport))
+			goto out;
+
+		marker = (struct netsfhdr *)(udp + 1);
+	}
+
+	if (marker->magic != cpu_to_be64(NET_TEST_PKT_MAGIC) ||
+	    tpriv->packet->id != marker->id)
+		goto out;
+
+	tpriv->ok = true;
+	complete(&tpriv->comp);
+out:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Send one frame described by @attr on @ndev and wait for it to come back.
+ *
+ * Returns 0 when the receive hook saw the frame, -ETIMEDOUT when it did not
+ * arrive within @attr->timeout (NET_LB_TIMEOUT if unset), -ENOMEM on
+ * allocation failure, -ENETUNREACH for a positive transmit status, or the
+ * negative error reported by dev_direct_xmit().
+ */
+static int __net_test_loopback(struct net_device *ndev,
+			       struct net_packet_attrs *attr)
+{
+	struct net_test_priv *priv;
+	struct sk_buff *frame;
+	int rc;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->ok = false;
+	init_completion(&priv->comp);
+
+	/* Hook IPv4 reception on this device before anything is sent. */
+	priv->pt.type = htons(ETH_P_IP);
+	priv->pt.func = net_test_loopback_validate;
+	priv->pt.dev = ndev;
+	priv->pt.af_packet_priv = priv;
+	priv->packet = attr;
+	dev_add_pack(&priv->pt);
+
+	frame = net_test_get_skb(ndev, attr);
+	if (!frame) {
+		rc = -ENOMEM;
+		goto cleanup;
+	}
+
+	rc = dev_direct_xmit(frame, attr->queue_mapping);
+	if (rc) {
+		/* Positive transmit status codes are reported as unreachable. */
+		if (rc > 0)
+			rc = -ENETUNREACH;
+		goto cleanup;
+	}
+
+	if (!attr->timeout)
+		attr->timeout = NET_LB_TIMEOUT;
+
+	wait_for_completion_timeout(&priv->comp, attr->timeout);
+	rc = priv->ok ? 0 : -ETIMEDOUT;
+
+cleanup:
+	dev_remove_pack(&priv->pt);
+	kfree(priv);
+	return rc;
+}
+
+/* Test: the carrier must be up. Returns 0 if so, -ENOLINK otherwise. */
+static int net_test_netif_carrier(struct net_device *ndev)
+{
+	if (netif_carrier_ok(ndev))
+		return 0;
+
+	return -ENOLINK;
+}
+
+/* Test: a PHY must be attached. Returns 0 if so, -EOPNOTSUPP otherwise. */
+static int net_test_phy_phydev(struct net_device *ndev)
+{
+	if (ndev->phydev)
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
+/* Turn PHY loopback on; -EOPNOTSUPP when the device has no PHY. */
+static int net_test_phy_loopback_enable(struct net_device *ndev)
+{
+	return ndev->phydev ? phy_loopback(ndev->phydev, true) : -EOPNOTSUPP;
+}
+
+/* Turn PHY loopback off; -EOPNOTSUPP when the device has no PHY. */
+static int net_test_phy_loopback_disable(struct net_device *ndev)
+{
+	return ndev->phydev ? phy_loopback(ndev->phydev, false) : -EOPNOTSUPP;
+}
+
+/* Loop a default UDP test frame addressed to the device's own MAC. */
+static int net_test_phy_loopback_udp(struct net_device *ndev)
+{
+	struct net_packet_attrs attr = {
+		.dst = ndev->dev_addr,
+	};
+
+	return __net_test_loopback(ndev, &attr);
+}
+
+/* Loop a default TCP test frame addressed to the device's own MAC. */
+static int net_test_phy_loopback_tcp(struct net_device *ndev)
+{
+	struct net_packet_attrs attr = {
+		.dst = ndev->dev_addr,
+		.tcp = true,
+	};
+
+	return __net_test_loopback(ndev, &attr);
+}
+
+static const struct net_test { /* One self-test: display name plus the function that runs it */
+ char name[ETH_GSTRING_LEN];
+ int (*fn)(struct net_device *ndev); /* returns 0 on success or a negative errno */
+} net_selftests[] = { /* run in array order by net_selftest() */
+ {
+ .name = "Carrier ",
+ .fn = net_test_netif_carrier,
+ }, {
+ .name = "PHY dev is present ",
+ .fn = net_test_phy_phydev,
+ }, {
+ /* This test should be done before all PHY loopback tests */
+ .name = "PHY internal loopback, enable ",
+ .fn = net_test_phy_loopback_enable,
+ }, {
+ .name = "PHY internal loopback, UDP ",
+ .fn = net_test_phy_loopback_udp,
+ }, {
+ .name = "PHY internal loopback, TCP ",
+ .fn = net_test_phy_loopback_tcp,
+ }, {
+ /* This test should be done after all PHY loopback tests */
+ .name = "PHY internal loopback, disable",
+ .fn = net_test_phy_loopback_disable,
+ },
+};
+
+/*
+ * net_selftest - run every entry of net_selftests[] on @ndev
+ * @ndev: device under test
+ * @etest: ethtool request; ETH_TEST_FL_FAILED is set in ->flags on failure
+ * @buf: one u64 result slot per test, as counted by net_selftest_get_count()
+ *
+ * Only requests whose flags equal ETH_TEST_FL_OFFLINE are executed. A test
+ * result of 0 or -EOPNOTSUPP does not mark the run as failed.
+ */
+void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf)
+{
+	int total = net_selftest_get_count();
+	int idx;
+
+	memset(buf, 0, sizeof(*buf) * total);
+	net_test_next_id = 0;
+
+	if (etest->flags != ETH_TEST_FL_OFFLINE) {
+		netdev_err(ndev, "Only offline tests are supported\n");
+		etest->flags |= ETH_TEST_FL_FAILED;
+		return;
+	}
+
+	for (idx = 0; idx < total; idx++) {
+		int res = net_selftests[idx].fn(ndev);
+
+		buf[idx] = res;
+		if (res == 0 || res == -EOPNOTSUPP)
+			continue;
+
+		etest->flags |= ETH_TEST_FL_FAILED;
+	}
+}
+EXPORT_SYMBOL_GPL(net_selftest);
+
+/* Number of entries in net_selftests[]. */
+int net_selftest_get_count(void)
+{
+	int total = ARRAY_SIZE(net_selftests);
+
+	return total;
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_count);
+
+/*
+ * Write the numbered test names into @data, one ETH_GSTRING_LEN-sized slot
+ * per test, in net_selftests[] order.
+ */
+void net_selftest_get_strings(u8 *data)
+{
+	int total = net_selftest_get_count();
+	int idx;
+
+	for (idx = 0; idx < total; idx++)
+		snprintf(data + idx * ETH_GSTRING_LEN, ETH_GSTRING_LEN,
+			 "%2d. %s", idx + 1, net_selftests[idx].name);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_strings);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>");
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4275b88726f4..3ad22870298c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2501,9 +2501,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
- int len)
+/* kernel_sendmsg() on @sk's socket; -EINVAL when @sk has no socket attached. */
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size)
+{
+	struct socket *sock = sk->sk_socket;
+
+	return sock ? kernel_sendmsg(sock, msg, vec, num, size) : -EINVAL;
+}
+
+/* kernel_sendpage() on @sk's socket; -EINVAL when @sk has no socket attached. */
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+
+	return sock ? kernel_sendpage(sock, page, offset, size, flags) : -EINVAL;
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+ int len, sendmsg_func sendmsg, sendpage_func sendpage)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -2523,7 +2546,8 @@ do_frag_list:
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT;
- ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+ sendmsg_unlocked, sk, &msg, &kv, 1, slen);
if (ret <= 0)
goto error;
@@ -2554,9 +2578,11 @@ do_frag_list:
slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
- skb_frag_off(frag) + offset,
- slen, MSG_DONTWAIT);
+ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+ sendpage_unlocked, sk,
+ skb_frag_page(frag),
+ skb_frag_off(frag) + offset,
+ slen, MSG_DONTWAIT);
if (ret <= 0)
goto error;
@@ -2588,8 +2614,23 @@ out:
error:
return orig_len == len ? ret : orig_len - len;
}
+
+/*
+ * Send @len bytes of @skb starting at @offset on @sk, using the *_locked
+ * send helpers. The socket must be locked by the caller.
+ */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+			 int len)
+{
+	int sent;
+
+	sent = __skb_send_sock(sk, skb, offset, len,
+			       kernel_sendmsg_locked, kernel_sendpage_locked);
+	return sent;
+}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+/*
+ * Send @len bytes of @skb starting at @offset on @sk, using the unlocked
+ * send helpers. The socket must not be locked by the caller.
+ */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+	int sent;
+
+	sent = __skb_send_sock(sk, skb, offset, len,
+			       sendmsg_unlocked, sendpage_unlocked);
+	return sent;
+}
+
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 5def3a2e85be..43ce17a6a585 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+/*
+ * Sleep until @psock has a queued ingress message or @sk has something in
+ * its receive queue, for at most @timeo.
+ *
+ * Returns 1 immediately when the receive side is shut down, 0 when @timeo is
+ * zero, otherwise the result of sk_wait_event(). @flags and @err are not
+ * used by this function.
+ */
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+		     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(waiter, woken_wake_function);
+	int woke;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	/* Zero timeout: do not sleep at all. */
+	if (timeo == 0)
+		return 0;
+
+	add_wait_queue(sk_sleep(sk), &waiter);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	woke = sk_wait_event(sk, &timeo,
+			     !list_empty(&psock->ingress_msg) ||
+			     !skb_queue_empty(&sk->sk_receive_queue),
+			     &waiter);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &waiter);
+
+	return woke;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg.
+ *
+ * Copies at most @len bytes from the queued messages into @msg->msg_iter,
+ * walking each message's scatterlist from sg.start. Without MSG_PEEK the
+ * copied bytes are consumed: element offsets/lengths are advanced and fully
+ * drained messages are dequeued and freed. With MSG_PEEK the queue is left
+ * untouched and the walk moves on via sk_psock_next_msg().
+ *
+ * Returns the number of bytes copied, or -EFAULT if the very first copy
+ * into the iterator copies nothing.
+ */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len) /* clamp to what the caller still wants */
+ copy = len - copied;
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy)
+ return copied ? copied : -EFAULT;
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb) /* memory is only uncharged for messages without an skb */
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) { /* element fully consumed: step to the next one */
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Let's not optimize the peek case: if copy_page_to_iter()
+ * didn't copy the entire length, just return.
+ */
+ if (copy != sge->length)
+ return copied;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i; /* remember how far this message has been consumed */
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
struct sk_buff *skb)
{
@@ -410,7 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
if (unlikely(!msg))
return NULL;
@@ -498,7 +596,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
- return skb_send_sock_locked(psock->sk, skb, off, len);
+ return skb_send_sock(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
@@ -512,8 +610,7 @@ static void sk_psock_backlog(struct work_struct *work)
u32 len, off;
int ret;
- /* Lock sock to avoid losing sk_socket during loop. */
- lock_sock(psock->sk);
+ mutex_lock(&psock->work_mutex);
if (state->skb) {
skb = state->skb;
len = state->len;
@@ -526,10 +623,11 @@ static void sk_psock_backlog(struct work_struct *work)
len = skb->len;
off = 0;
start:
- ingress = tcp_skb_bpf_ingress(skb);
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
- if (likely(psock->sk->sk_socket))
+ if (!sock_flag(psock->sk, SOCK_DEAD))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
@@ -553,7 +651,7 @@ start:
kfree_skb(skb);
}
end:
- release_sock(psock->sk);
+ mutex_unlock(&psock->work_mutex);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -563,11 +661,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
- if (inet_csk_has_ulp(sk)) {
- psock = ERR_PTR(-EINVAL);
- goto out;
- }
-
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
@@ -591,7 +684,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -619,7 +714,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
return link;
}
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
@@ -630,9 +725,14 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
}
}
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
{
- __skb_queue_purge(&psock->ingress_skb);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ kfree_skb(skb);
+ }
__sk_psock_purge_ingress_msg(psock);
}
@@ -646,23 +746,35 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+void sk_psock_stop(struct sk_psock *psock, bool wait)
{
- struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ __sk_psock_zap_ingress(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+ if (wait)
+ cancel_work_sync(&psock->work);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock);
+
+static void sk_psock_destroy(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
/* No sk_callback_lock since already detached. */
- /* Parser has been stopped */
- if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
+ mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
@@ -670,30 +782,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
- struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
- INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
- schedule_work(&psock->gc);
-}
-
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
+ sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.skb_verdict)
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu(&psock->rcu, sk_psock_destroy);
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -744,27 +847,12 @@ out:
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
- struct sk_buff *skb)
-{
- bpf_compute_data_end_sk_skb(skb);
- return bpf_prog_run_pin_on_cpu(prog, skb);
-}
-
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
- struct sk_psock_parser *parser;
-
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
-}
-
static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ sk_other = skb_bpf_redirect_fetch(skb);
/* This error is a buggy BPF program, it returned a redirect
* return code, but then didn't set a redirect interface.
*/
@@ -777,14 +865,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ kfree_skb(skb);
+ return;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
kfree_skb(skb);
return;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
+ spin_unlock_bh(&psock_other->ingress_lock);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -806,12 +900,13 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
int ret = __SK_PASS;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb->sk = psock->sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -823,7 +918,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
- struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
@@ -835,8 +929,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
goto out_free;
}
- tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ skb_bpf_set_ingress(skb);
/* If the queue is empty then we can submit directly
* into the msg queue. If its not empty we have to
@@ -848,8 +941,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
err = sk_psock_skb_ingress_self(psock, skb);
}
if (err < 0) {
- skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
break;
case __SK_REDIRECT:
@@ -862,6 +959,24 @@ out_free:
}
}
+/*
+ * sk_write_space replacement: kicks the psock backlog work while TX is
+ * enabled, then calls the write_space callback saved in the psock, if any.
+ */
+static void sk_psock_write_space(struct sock *sk)
+{
+	void (*saved_cb)(struct sock *sk) = NULL;
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+			schedule_work(&psock->work);
+		saved_cb = psock->saved_write_space;
+	}
+	rcu_read_unlock();
+
+	/* Call the saved callback outside the RCU read-side section. */
+	if (!saved_cb)
+		return;
+	saved_cb(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock;
@@ -876,12 +991,13 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
kfree_skb(skb);
goto out;
}
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb->sk = sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_verdict_apply(psock, skb, ret);
@@ -896,15 +1012,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
struct bpf_prog *prog;
int ret = skb->len;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_parser);
+ prog = READ_ONCE(psock->progs.stream_parser);
if (likely(prog)) {
skb->sk = psock->sk;
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
@@ -920,16 +1036,59 @@ static void sk_psock_strp_data_ready(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (tls_sw_has_ctx_rx(sk)) {
- psock->parser.saved_data_ready(sk);
+ psock->saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
+ strp_data_ready(&psock->strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
}
+/* Initialise @psock->strp for @sk with the psock stream-parser callbacks. */
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+	static const struct strp_callbacks psock_strp_cb = {
+		.parse_msg	= sk_psock_strp_parse,
+		.rcv_msg	= sk_psock_strp_read,
+		.read_sock_done	= sk_psock_strp_read_done,
+	};
+	int rc;
+
+	rc = strp_init(&psock->strp, sk, &psock_strp_cb);
+	return rc;
+}
+
+/*
+ * Install the strparser data_ready hook and the psock write_space hook on
+ * @sk, saving the previous data_ready callback in @psock. Does nothing when
+ * a callback is already saved.
+ */
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+	if (!psock->saved_data_ready) {
+		psock->saved_data_ready = sk->sk_data_ready;
+		sk->sk_data_ready = sk_psock_strp_data_ready;
+		sk->sk_write_space = sk_psock_write_space;
+	}
+}
+
+/*
+ * Restore the data_ready callback saved in @psock and stop the strparser.
+ * Does nothing when no callback is saved.
+ */
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+	if (psock->saved_data_ready) {
+		sk->sk_data_ready = psock->saved_data_ready;
+		psock->saved_data_ready = NULL;
+		strp_stop(&psock->strp);
+	}
+}
+
+/* Finish the strparser, but only if a stream parser program is attached. */
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+	if (!psock->progs.stream_parser)
+		return;
+
+	/* Parser has been stopped */
+	strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
@@ -953,12 +1112,15 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
kfree_skb(skb);
goto out;
}
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb->sk = sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_verdict_apply(psock, skb, ret);
@@ -982,82 +1144,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
-static void sk_psock_write_space(struct sock *sk)
-{
- struct sk_psock *psock;
- void (*write_space)(struct sock *sk) = NULL;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock)) {
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
- }
- rcu_read_unlock();
- if (write_space)
- write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
- static const struct strp_callbacks cb = {
- .rcv_msg = sk_psock_strp_read,
- .read_sock_done = sk_psock_strp_read_done,
- .parse_msg = sk_psock_strp_parse,
- };
-
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
-}
-
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_verdict_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
- return;
-
- parser->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_strp_data_ready;
- sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
- return;
-
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 5ec90f99e102..c761c4a0b66b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3531,7 +3531,7 @@ int proto_register(struct proto *prot, int alloc_slab)
return ret;
out_free_timewait_sock_slab:
- if (alloc_slab && prot->twsk_prot)
+ if (alloc_slab)
tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
if (alloc_slab) {
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d758fb83c884..6f1b82b8ad49 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,6 +24,10 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
+
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
@@ -148,9 +152,11 @@ static void sock_map_del_link(struct sock *sk,
struct bpf_map *map = link->map;
struct bpf_stab *stab = container_of(map, struct bpf_stab,
map);
- if (psock->parser.enabled && stab->progs.skb_parser)
+ if (psock->saved_data_ready && stab->progs.stream_parser)
strp_stop = true;
- if (psock->parser.enabled && stab->progs.skb_verdict)
+ if (psock->saved_data_ready && stab->progs.stream_verdict)
+ verdict_stop = true;
+ if (psock->saved_data_ready && stab->progs.skb_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
@@ -179,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
- struct proto *prot;
-
- switch (sk->sk_type) {
- case SOCK_STREAM:
- prot = tcp_bpf_get_proto(sk, psock);
- break;
-
- case SOCK_DGRAM:
- prot = udp_bpf_get_proto(sk, psock);
- break;
-
- default:
+ if (!sk->sk_prot->psock_update_sk_prot)
return -EINVAL;
- }
-
- if (IS_ERR(prot))
- return PTR_ERR(prot);
-
- sk_psock_update_proto(sk, psock, prot);
- return 0;
+ psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+ return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
}
static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
@@ -221,26 +211,38 @@ out:
return psock;
}
-static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
- struct sock *sk)
+static bool sock_map_redirect_allowed(const struct sock *sk);
+
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
{
- struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog *stream_verdict = NULL;
+ struct bpf_prog *stream_parser = NULL;
+ struct bpf_prog *skb_verdict = NULL;
+ struct bpf_prog *msg_parser = NULL;
struct sk_psock *psock;
int ret;
- skb_verdict = READ_ONCE(progs->skb_verdict);
- if (skb_verdict) {
- skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
- if (IS_ERR(skb_verdict))
- return PTR_ERR(skb_verdict);
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (!sock_map_redirect_allowed(sk))
+ goto no_progs;
+
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
}
- skb_parser = READ_ONCE(progs->skb_parser);
- if (skb_parser) {
- skb_parser = bpf_prog_inc_not_zero(skb_parser);
- if (IS_ERR(skb_parser)) {
- ret = PTR_ERR(skb_parser);
- goto out_put_skb_verdict;
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
}
}
@@ -249,10 +251,20 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out_put_skb_parser;
+ goto out_put_stream_parser;
}
}
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ if (skb_verdict) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict)) {
+ ret = PTR_ERR(skb_verdict);
+ goto out_put_msg_parser;
+ }
+ }
+
+no_progs:
psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
@@ -261,8 +273,11 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
- (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
@@ -283,16 +298,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_parser && skb_verdict && !psock->parser.enabled) {
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (ret)
goto out_unlock_drop;
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
- psock_set_prog(&psock->progs.skb_parser, skb_parser);
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
sk_psock_start_strp(sk, psock);
- } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
sk_psock_start_verdict(sk,psock);
+ } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ sk_psock_start_verdict(sk, psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
@@ -301,35 +319,17 @@ out_unlock_drop:
out_drop:
sk_psock_put(sk, psock);
out_progs:
- if (msg_parser)
- bpf_prog_put(msg_parser);
-out_put_skb_parser:
- if (skb_parser)
- bpf_prog_put(skb_parser);
-out_put_skb_verdict:
if (skb_verdict)
bpf_prog_put(skb_verdict);
- return ret;
-}
-
-static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
-{
- struct sk_psock *psock;
- int ret;
-
- psock = sock_map_psock_get_checked(sk);
- if (IS_ERR(psock))
- return PTR_ERR(psock);
-
- if (!psock) {
- psock = sk_psock_init(sk, map->numa_node);
- if (IS_ERR(psock))
- return PTR_ERR(psock);
- }
-
- ret = sock_map_init_proto(sk, psock);
- if (ret < 0)
- sk_psock_put(sk, psock);
+out_put_msg_parser:
+ if (msg_parser)
+ bpf_prog_put(msg_parser);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
return ret;
}
@@ -463,8 +463,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
return 0;
}
-static bool sock_map_redirect_allowed(const struct sock *sk);
-
static int sock_map_update_common(struct bpf_map *map, u32 idx,
struct sock *sk, u64 flags)
{
@@ -484,14 +482,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
if (!link)
return -ENOMEM;
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (sock_map_redirect_allowed(sk))
- ret = sock_map_link(map, &stab->progs, sk);
- else
- ret = sock_map_link_no_progs(map, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -544,12 +535,15 @@ static bool sk_is_udp(const struct sock *sk)
static bool sock_map_redirect_allowed(const struct sock *sk)
{
- return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN;
+ if (sk_is_tcp(sk))
+ return sk->sk_state != TCP_LISTEN;
+ else
+ return sk->sk_state == TCP_ESTABLISHED;
}
static bool sock_map_sk_is_suitable(const struct sock *sk)
{
- return sk_is_tcp(sk) || sk_is_udp(sk);
+ return !!sk->sk_prot->psock_update_sk_prot;
}
static bool sock_map_sk_state_allowed(const struct sock *sk)
@@ -657,7 +651,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +660,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -998,14 +990,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
if (!link)
return -ENOMEM;
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (sock_map_redirect_allowed(sk))
- ret = sock_map_link(map, &htab->progs, sk);
- else
- ret = sock_map_link_no_progs(map, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -1250,7 +1235,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1244,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -1448,8 +1431,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
struct bpf_prog **pprog;
@@ -1461,10 +1444,19 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
case BPF_SK_MSG_VERDICT:
pprog = &progs->msg_parser;
break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- pprog = &progs->skb_parser;
+ pprog = &progs->stream_parser;
break;
+#endif
case BPF_SK_SKB_STREAM_VERDICT:
+ if (progs->skb_verdict)
+ return -EBUSY;
+ pprog = &progs->stream_verdict;
+ break;
+ case BPF_SK_SKB_VERDICT:
+ if (progs->stream_verdict)
+ return -EBUSY;
pprog = &progs->skb_verdict;
break;
default:
@@ -1529,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout)
lock_sock(sk);
rcu_read_lock();
- psock = sk_psock(sk);
+ psock = sk_psock_get(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
release_sock(sk);
@@ -1539,6 +1531,8 @@ void sock_map_close(struct sock *sk, long timeout)
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
+ sk_psock_stop(psock, true);
+ sk_psock_put(sk, psock);
release_sock(sk);
saved_close(sk, timeout);
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4567de519603..c8496c1142c9 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
static int two = 2;
static int three = 3;
+static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
@@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "netdev_unregister_timeout_secs",
+ .data = &netdev_unregister_timeout_secs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &int_3600,
+ },
{ }
};