Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                 |  23
-rw-r--r--  net/ipv4/esp4_offload.c            |   2
-rw-r--r--  net/ipv4/fib_lookup.h              |   6
-rw-r--r--  net/ipv4/fib_semantics.c           |   7
-rw-r--r--  net/ipv4/fib_trie.c                |  38
-rw-r--r--  net/ipv4/gre_offload.c             |  22
-rw-r--r--  net/ipv4/inet_hashtables.c         |  25
-rw-r--r--  net/ipv4/ip_input.c                |   1
-rw-r--r--  net/ipv4/ip_output.c               |   6
-rw-r--r--  net/ipv4/ip_tunnel_core.c          |   9
-rw-r--r--  net/ipv4/ipconfig.c                |  22
-rw-r--r--  net/ipv4/netfilter/nft_dup_ipv4.c  |  18
-rw-r--r--  net/ipv4/nexthop.c                 | 347
-rw-r--r--  net/ipv4/proc.c                    |  50
-rw-r--r--  net/ipv4/route.c                   |  14
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c         |   9
-rw-r--r--  net/ipv4/tcp.c                     | 199
-rw-r--r--  net/ipv4/tcp_cubic.c               |  11
-rw-r--r--  net/ipv4/tcp_input.c               |  27
-rw-r--r--  net/ipv4/tcp_ipv4.c                |   6
-rw-r--r--  net/ipv4/tcp_output.c              |   2
-rw-r--r--  net/ipv4/udp.c                     |  13
-rw-r--r--  net/ipv4/udp_offload.c             |   7
-rw-r--r--  net/ipv4/udp_tunnel_core.c         |  24
24 files changed, 572 insertions(+), 316 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b94fa8eb831b..a02ce89b56b5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
+ u32 flags = BIND_WITH_LOCK;
int err;
/* If the socket has its own bind function then use it. (RAW) */
@@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+ BPF_CGROUP_INET4_BIND, &flags);
if (err)
return err;
- return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+ return __inet_bind(sk, uaddr, addr_len, flags);
}
EXPORT_SYMBOL(inet_bind);
@@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && inet_port_requires_bind_service(net, snum) &&
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -777,18 +780,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
return -ENOTCONN;
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ NULL);
} else {
__be32 addr = inet->inet_rcv_saddr;
if (!addr)
addr = inet->inet_saddr;
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
- }
- if (cgroup_bpf_enabled)
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
- peer ? BPF_CGROUP_INET4_GETPEERNAME :
- BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
NULL);
+ }
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
@@ -1419,7 +1423,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
out:
return segs;
}
-EXPORT_SYMBOL(inet_gso_segment);
static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
netdev_features_t features)
@@ -1550,7 +1553,6 @@ out:
return pp;
}
-EXPORT_SYMBOL(inet_gro_receive);
static struct sk_buff *ipip_gro_receive(struct list_head *head,
struct sk_buff *skb)
@@ -1636,7 +1638,6 @@ out_unlock:
return err;
}
-EXPORT_SYMBOL(inet_gro_complete);
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -1871,6 +1872,8 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_igmp_llm_reports = 1;
net->ipv4.sysctl_igmp_qrv = 2;
+ net->ipv4.sysctl_fib_notify_on_flag_change = 0;
+
return 0;
}
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 5bda5aeda579..601f5fbfc63f 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -285,7 +285,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
esp.esph = ip_esp_hdr(skb);
- if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+ if (!hw_offload || !skb_is_gso(skb)) {
esp.nfrags = esp_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 818916b2a04d..b58db1ca4bfb 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -18,7 +18,8 @@ struct fib_alias {
s16 fa_default;
u8 offload:1,
trap:1,
- unused:6;
+ offload_failed:1,
+ unused:5;
struct rcu_head rcu;
};
@@ -39,9 +40,10 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- struct fib_rt_info *fri, unsigned int flags);
+ const struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+size_t fib_nlmsg_size(struct fib_info *fi);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5400cec4f69..a632b66bc13a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -452,7 +452,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
return -1;
}
-static inline size_t fib_nlmsg_size(struct fib_info *fi)
+size_t fib_nlmsg_size(struct fib_info *fi)
{
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
@@ -521,6 +521,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
fri.type = fa->fa_type;
fri.offload = fa->offload;
fri.trap = fa->trap;
+ fri.offload_failed = fa->offload_failed;
err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -1733,7 +1734,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
#endif
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
- struct fib_rt_info *fri, unsigned int flags)
+ const struct fib_rt_info *fri, unsigned int flags)
{
unsigned int nhs = fib_info_num_path(fri->fi);
struct fib_info *fi = fri->fi;
@@ -1811,6 +1812,8 @@ offload:
rtm->rtm_flags |= RTM_F_OFFLOAD;
if (fri->trap)
rtm->rtm_flags |= RTM_F_TRAP;
+ if (fri->offload_failed)
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 28117c05dc35..25cf387cca5b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1038,6 +1038,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
{
struct fib_alias *fa_match;
+ struct sk_buff *skb;
+ int err;
rcu_read_lock();
@@ -1045,9 +1047,42 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
if (!fa_match)
goto out;
+ if (fa_match->offload == fri->offload && fa_match->trap == fri->trap &&
+ fa_match->offload_failed == fri->offload_failed)
+ goto out;
+
fa_match->offload = fri->offload;
fa_match->trap = fri->trap;
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 &&
+ fa_match->offload_failed == fri->offload_failed)
+ goto out;
+
+ fa_match->offload_failed = fri->offload_failed;
+
+ if (!net->ipv4.sysctl_fib_notify_on_flag_change)
+ goto out;
+
+ skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
+ goto out;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
out:
rcu_read_unlock();
}
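
The new sysctl gives fib_alias_hw_flags_set() three modes: 0 suppresses these
RTM_NEWROUTE notifications entirely, 1 emits one whenever any of the
offload/trap/offload_failed flags change, and 2 emits one only when
offload_failed changes. A listener on RTNLGRP_IPV4_ROUTE can then test
rtm_flags; a minimal userspace sketch, assuming nlh already points at a
received message (netlink socket setup not shown):

	struct rtmsg *rtm = NLMSG_DATA(nlh);	/* nlh: struct nlmsghdr * */

	if (nlh->nlmsg_type == RTM_NEWROUTE) {
		if (rtm->rtm_flags & RTM_F_OFFLOAD_FAILED)
			fprintf(stderr, "route rejected by hardware\n");
		else if (rtm->rtm_flags & RTM_F_OFFLOAD)
			; /* route is offloaded to hardware */
		else if (rtm->rtm_flags & RTM_F_TRAP)
			; /* route traps packets to the CPU */
	}
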
@@ -1263,6 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_default = -1;
new_fa->offload = 0;
new_fa->trap = 0;
+ new_fa->offload_failed = 0;
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
@@ -1323,6 +1359,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_default = -1;
new_fa->offload = 0;
new_fa->trap = 0;
+ new_fa->offload_failed = 0;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
@@ -2262,6 +2299,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
fri.type = fa->fa_type;
fri.offload = fa->offload;
fri.trap = fa->trap;
+ fri.offload_failed = fa->offload_failed;
err = fib_dump_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e0a246575887..1121a9d5fed9 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,7 +15,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool need_csum, need_recompute_csum, gso_partial;
+ bool need_csum, offload_csum, gso_partial, need_ipsec;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
@@ -41,10 +41,16 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->protocol = skb->inner_protocol;
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
- need_recompute_csum = skb->csum_not_inet;
skb->encap_hdr_csum = need_csum;
features &= skb->dev->hw_enc_features;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
+
+ need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum && !need_ipsec &&
+ (skb->dev->features & NETIF_F_HW_CSUM));
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
@@ -99,14 +105,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
*(pcsum + 1) = 0;
- if (need_recompute_csum && !skb_is_gso(skb)) {
- __wsum csum;
-
- csum = skb_checksum(skb, gre_offset,
- skb->len - gre_offset, 0);
- *pcsum = csum_fold(csum);
- } else {
+ if (skb->encapsulation || !offload_csum) {
*pcsum = gso_make_checksum(skb, 0);
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = sizeof(*greh);
}
} while ((skb = skb->next));
out:
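
The new else branch defers the GRE checksum to the device via the
CHECKSUM_PARTIAL contract instead of computing it with gso_make_checksum().
An annotated restatement of what those fields mean (assuming greh points at
the segment's struct gre_base_hdr):

	/* With CHECKSUM_PARTIAL, hardware checksums from
	 * skb->head + skb->csum_start to the end of the packet and stores
	 * the 16-bit result at skb->head + csum_start + csum_offset.
	 * For GRE, the result lands in the checksum field immediately
	 * after the 4-byte base header:
	 */
	skb->ip_summed   = CHECKSUM_PARTIAL;
	skb->csum_start  = skb_transport_header(skb) - skb->head;
	skb->csum_offset = sizeof(*greh);
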
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 45fb450b4522..c96866a53a66 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -709,6 +709,17 @@ unlock:
}
EXPORT_SYMBOL_GPL(inet_unhash);
+/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
+ * Note that we use 32bit integers (vs RFC 'short integers')
+ * because 2^16 is not a multiple of num_ephemeral and this
+ * property might be used by a clever attacker.
+ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement;
+ * we use 256 instead to give more isolation and privacy. This
+ * only consumes 1 KB of kernel memory.
+ */
+#define INET_TABLE_PERTURB_SHIFT 8
+static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT];
+
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
@@ -722,8 +733,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct inet_bind_bucket *tb;
u32 remaining, offset;
int ret, i, low, high;
- static u32 hint;
int l3mdev;
+ u32 index;
if (port) {
head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -750,7 +761,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
if (likely(remaining > 1))
remaining &= ~1U;
- offset = (hint + port_offset) % remaining;
+ net_get_random_once(table_perturb, sizeof(table_perturb));
+ index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT);
+
+ offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining;
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
@@ -804,7 +818,12 @@ next_port:
return -EADDRNOTAVAIL;
ok:
- hint += i + 2;
+ /* If our first attempt found a candidate, skip the next candidate
+ * in 1/16 of cases to add some noise.
+ */
+ if (!i && !(prandom_u32() % 16))
+ i = 2;
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
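
A condensed userspace model of the selection above, assuming port_offset is
the keyed hash the kernel computes over the connection's addresses and
destination port (secure_ipv4_port_ephemeral()), and ignoring the
parity/availability scan that follows; the index computation mirrors
hash_32():

	#include <stdint.h>

	#define TABLE_BITS 8			/* INET_TABLE_PERTURB_SHIFT */
	static uint32_t table_perturb[1 << TABLE_BITS];

	static uint16_t pick_source_port(uint32_t port_offset,
					 uint16_t low, uint32_t remaining)
	{
		/* hash_32(): multiply by the 32-bit golden ratio and keep
		 * the top TABLE_BITS bits.
		 */
		uint32_t index = (port_offset * 0x61C88647u) >> (32 - TABLE_BITS);
		uint32_t offset = (table_perturb[index] + port_offset) % remaining;

		table_perturb[index] += 2;	/* advance on successful connect */
		return low + offset;
	}
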
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b0c244af1e4d..3a025c011971 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -253,6 +253,7 @@ int ip_local_deliver(struct sk_buff *skb)
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
+EXPORT_SYMBOL(ip_local_deliver);
static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2ed0b01f72f0..3aab53beb4ea 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -434,6 +434,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
+EXPORT_SYMBOL(ip_output);
/*
* copy saddr and daddr, possibly using 64bit load/stores
@@ -1018,7 +1019,7 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
@@ -1230,8 +1231,7 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
- if (uarg)
- sock_zerocopy_put_abort(uarg, extra_uref);
+ net_zcopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 7ca338fbe8ba..6b2dc7b2b612 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -222,7 +222,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
.code = ICMP_FRAG_NEEDED,
.checksum = 0,
.un.frag.__unused = 0,
- .un.frag.mtu = ntohs(mtu),
+ .un.frag.mtu = htons(mtu),
};
icmph->checksum = ip_compute_csum(icmph, len);
skb_reset_transport_header(skb);
@@ -245,7 +245,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
skb->ip_summed = CHECKSUM_NONE;
- eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
@@ -338,7 +338,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
skb->ip_summed = CHECKSUM_NONE;
- eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
@@ -583,8 +583,9 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr,
static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, opts_len = 0, type = 0;
+ int err, rem, opt_len, opts_len = 0;
struct nlattr *nla;
+ __be16 type = 0;
if (!attr)
return 0;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 3cd13e1bc6a7..47db1bfdaaa0 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,6 @@
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/arp.h>
-#include <net/dsa.h>
#include <net/ip.h>
#include <net/ipconfig.h>
#include <net/route.h>
@@ -218,9 +217,9 @@ static int __init ic_open_devs(void)
last = &ic_first_dev;
rtnl_lock();
- /* bring loopback and DSA master network devices up first */
+ /* bring loopback device up first */
for_each_netdev(&init_net, dev) {
- if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
+ if (!(dev->flags & IFF_LOOPBACK))
continue;
if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
pr_err("IP-Config: Failed to open %s\n", dev->name);
@@ -305,17 +304,32 @@ have_carrier:
return 0;
}
+/* Close all network interfaces except the one we've autoconfigured, and its
+ * lowers, in case it's a stacked virtual interface.
+ */
static void __init ic_close_devs(void)
{
+ struct net_device *selected_dev = ic_dev->dev;
struct ic_device *d, *next;
struct net_device *dev;
rtnl_lock();
next = ic_first_dev;
while ((d = next)) {
+ bool bring_down = (d != ic_dev);
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
next = d->next;
dev = d->dev;
- if (d != ic_dev && !netdev_uses_dsa(dev)) {
+
+ netdev_for_each_lower_dev(selected_dev, lower_dev, iter) {
+ if (dev == lower_dev) {
+ bring_down = false;
+ break;
+ }
+ }
+ if (bring_down) {
pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags, NULL);
}
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index bcdb37f86a94..aeb631760eb9 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -13,8 +13,8 @@
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
struct nft_dup_ipv4 {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_dev:8;
+ u8 sreg_addr;
+ u8 sreg_dev;
};
static void nft_dup_ipv4_eval(const struct nft_expr *expr,
@@ -40,16 +40,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in_addr));
if (err < 0)
return err;
- if (tb[NFTA_DUP_SREG_DEV] != NULL) {
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
- }
- return 0;
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
}
static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e53e43aef785..f1c6cbdb9e43 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -22,7 +22,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
#define NH_DEV_HASHBITS 8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
-static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
+static const struct nla_policy rtm_nh_policy_new[] = {
[NHA_ID] = { .type = NLA_U32 },
[NHA_GROUP] = { .type = NLA_BINARY },
[NHA_GROUP_TYPE] = { .type = NLA_U16 },
@@ -31,6 +31,15 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_GATEWAY] = { .type = NLA_BINARY },
[NHA_ENCAP_TYPE] = { .type = NLA_U16 },
[NHA_ENCAP] = { .type = NLA_NESTED },
+ [NHA_FDB] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy rtm_nh_policy_get[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump[] = {
+ [NHA_OIF] = { .type = NLA_U32 },
[NHA_GROUPS] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
[NHA_FDB] = { .type = NLA_FLAG },
@@ -62,6 +71,7 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
const struct nexthop *nh)
{
+ info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
if (!info->nh)
return -ENOMEM;
@@ -76,13 +86,13 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
kfree(info->nh);
}
-static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
- const struct nexthop *nh)
+static int nh_notifier_mp_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
{
- struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
u16 num_nh = nhg->num_nh;
int i;
+ info->type = NH_NOTIFIER_INFO_TYPE_GRP;
info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
GFP_KERNEL);
if (!info->nh_grp)
@@ -103,27 +113,41 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
return 0;
}
-static void nh_notifier_grp_info_fini(struct nh_notifier_info *info)
+static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->mpath)
+ return nh_notifier_mp_info_init(info, nhg);
+ return -EINVAL;
+}
+
+static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
{
- kfree(info->nh_grp);
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->mpath)
+ kfree(info->nh_grp);
}
static int nh_notifier_info_init(struct nh_notifier_info *info,
const struct nexthop *nh)
{
info->id = nh->id;
- info->is_grp = nh->is_group;
- if (info->is_grp)
+ if (nh->is_group)
return nh_notifier_grp_info_init(info, nh);
else
return nh_notifier_single_info_init(info, nh);
}
-static void nh_notifier_info_fini(struct nh_notifier_info *info)
+static void nh_notifier_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
{
- if (info->is_grp)
- nh_notifier_grp_info_fini(info);
+ if (nh->is_group)
+ nh_notifier_grp_info_fini(info, nh);
else
nh_notifier_single_info_fini(info);
}
@@ -152,7 +176,7 @@ static int call_nexthop_notifiers(struct net *net,
err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
event_type, &info);
- nh_notifier_info_fini(&info);
+ nh_notifier_info_fini(&info, nh);
return notifier_to_errno(err);
}
@@ -173,7 +197,7 @@ static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
return err;
err = nb->notifier_call(nb, event_type, &info);
- nh_notifier_info_fini(&info);
+ nh_notifier_info_fini(&info, nh);
return notifier_to_errno(err);
}
@@ -200,7 +224,7 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
hlist_add_head(&nhi->dev_hash, head);
}
-static void nexthop_free_mpath(struct nexthop *nh)
+static void nexthop_free_group(struct nexthop *nh)
{
struct nh_group *nhg;
int i;
@@ -240,7 +264,7 @@ void nexthop_free_rcu(struct rcu_head *head)
struct nexthop *nh = container_of(head, struct nexthop, rcu);
if (nh->is_group)
- nexthop_free_mpath(nh);
+ nexthop_free_group(nh);
else
nexthop_free_single(nh);
@@ -565,7 +589,8 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
return 0;
}
-static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+static int nh_check_attr_group(struct net *net,
+ struct nlattr *tb[], size_t tb_size,
struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
@@ -624,7 +649,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
return -EINVAL;
}
}
- for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
+ for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
if (!tb[i])
continue;
if (i == NHA_FDB)
@@ -670,21 +695,16 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
return !!(state & NUD_VALID);
}
-struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash)
{
struct nexthop *rc = NULL;
- struct nh_group *nhg;
int i;
- if (!nh->is_group)
- return nh;
-
- nhg = rcu_dereference(nh->nh_grp);
for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
struct nh_info *nhi;
- if (hash > atomic_read(&nhge->upper_bound))
+ if (hash > atomic_read(&nhge->mpath.upper_bound))
continue;
nhi = rcu_dereference(nhge->nh->nh_info);
@@ -711,6 +731,21 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
return rc;
}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+ struct nh_group *nhg;
+
+ if (!nh->is_group)
+ return nh;
+
+ nhg = rcu_dereference(nh->nh_grp);
+ if (nhg->mpath)
+ return nexthop_select_path_mp(nhg, hash);
+
+ /* Unreachable. */
+ return NULL;
+}
EXPORT_SYMBOL_GPL(nexthop_select_path);
int nexthop_for_each_fib6_nh(struct nexthop *nh,
@@ -904,7 +939,7 @@ static void nh_group_rebalance(struct nh_group *nhg)
w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
- atomic_set(&nhge->upper_bound, upper_bound);
+ atomic_set(&nhge->mpath.upper_bound, upper_bound);
}
}
@@ -1446,10 +1481,13 @@ static struct nexthop *nexthop_create_group(struct net *net,
nhg->nh_entries[i].nh_parent = nh;
}
- if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH)
nhg->mpath = 1;
+
+ WARN_ON_ONCE(nhg->mpath != 1);
+
+ if (nhg->mpath)
nh_group_rebalance(nhg);
- }
if (cfg->nh_fdb)
nhg->fdb_nh = 1;
@@ -1643,11 +1681,12 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
struct netlink_ext_ack *extack)
{
struct nhmsg *nhm = nlmsg_data(nlh);
- struct nlattr *tb[NHA_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
int err;
- err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
- extack);
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb,
+ ARRAY_SIZE(rtm_nh_policy_new) - 1,
+ rtm_nh_policy_new, extack);
if (err < 0)
return err;
@@ -1674,11 +1713,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
- if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
- NL_SET_ERR_MSG(extack, "Invalid attributes in request");
- goto out;
- }
-
memset(cfg, 0, sizeof(*cfg));
cfg->nlflags = nlh->nlmsg_flags;
cfg->nlinfo.portid = NETLINK_CB(skb).portid;
@@ -1720,7 +1754,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
NL_SET_ERR_MSG(extack, "Invalid group type");
goto out;
}
- err = nh_check_attr_group(net, tb, extack);
+ err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack);
/* no other attributes should be set */
goto out;
@@ -1838,49 +1872,44 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
}
-static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
- struct netlink_ext_ack *extack)
+static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
+ struct nlattr **tb, u32 *id,
+ struct netlink_ext_ack *extack)
{
struct nhmsg *nhm = nlmsg_data(nlh);
- struct nlattr *tb[NHA_MAX + 1];
- int err, i;
-
- err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
- extack);
- if (err < 0)
- return err;
- err = -EINVAL;
- for (i = 0; i < __NHA_MAX; ++i) {
- if (!tb[i])
- continue;
-
- switch (i) {
- case NHA_ID:
- break;
- default:
- NL_SET_ERR_MSG_ATTR(extack, tb[i],
- "Unexpected attribute in request");
- goto out;
- }
- }
if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
NL_SET_ERR_MSG(extack, "Invalid values in header");
- goto out;
+ return -EINVAL;
}
if (!tb[NHA_ID]) {
NL_SET_ERR_MSG(extack, "Nexthop id is missing");
- goto out;
+ return -EINVAL;
}
*id = nla_get_u32(tb[NHA_ID]);
- if (!(*id))
+ if (!(*id)) {
NL_SET_ERR_MSG(extack, "Invalid nexthop id");
- else
- err = 0;
-out:
- return err;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_get) - 1,
+ rtm_nh_policy_get, extack);
+ if (err < 0)
+ return err;
+
+ return __nh_valid_get_del_req(nlh, tb, id, extack);
}
/* rtnl */
@@ -1949,16 +1978,23 @@ errout_free:
goto out;
}
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
- bool group_filter, u8 family)
+struct nh_dump_filter {
+ int dev_idx;
+ int master_idx;
+ bool group_filter;
+ bool fdb_filter;
+};
+
+static bool nh_dump_filtered(struct nexthop *nh,
+ struct nh_dump_filter *filter, u8 family)
{
const struct net_device *dev;
const struct nh_info *nhi;
- if (group_filter && !nh->is_group)
+ if (filter->group_filter && !nh->is_group)
return true;
- if (!dev_idx && !master_idx && !family)
+ if (!filter->dev_idx && !filter->master_idx && !family)
return false;
if (nh->is_group)
@@ -1969,70 +2005,48 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
return true;
dev = nhi->fib_nhc.nhc_dev;
- if (dev_idx && (!dev || dev->ifindex != dev_idx))
+ if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
return true;
- if (master_idx) {
+ if (filter->master_idx) {
struct net_device *master;
if (!dev)
return true;
master = netdev_master_upper_dev_get((struct net_device *)dev);
- if (!master || master->ifindex != master_idx)
+ if (!master || master->ifindex != filter->master_idx)
return true;
}
return false;
}
-static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
- int *master_idx, bool *group_filter,
- bool *fdb_filter, struct netlink_callback *cb)
+static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
+ struct nh_dump_filter *filter,
+ struct netlink_ext_ack *extack)
{
- struct netlink_ext_ack *extack = cb->extack;
- struct nlattr *tb[NHA_MAX + 1];
struct nhmsg *nhm;
- int err, i;
u32 idx;
- err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
- NULL);
- if (err < 0)
- return err;
-
- for (i = 0; i <= NHA_MAX; ++i) {
- if (!tb[i])
- continue;
-
- switch (i) {
- case NHA_OIF:
- idx = nla_get_u32(tb[i]);
- if (idx > INT_MAX) {
- NL_SET_ERR_MSG(extack, "Invalid device index");
- return -EINVAL;
- }
- *dev_idx = idx;
- break;
- case NHA_MASTER:
- idx = nla_get_u32(tb[i]);
- if (idx > INT_MAX) {
- NL_SET_ERR_MSG(extack, "Invalid master device index");
- return -EINVAL;
- }
- *master_idx = idx;
- break;
- case NHA_GROUPS:
- *group_filter = true;
- break;
- case NHA_FDB:
- *fdb_filter = true;
- break;
- default:
- NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ if (tb[NHA_OIF]) {
+ idx = nla_get_u32(tb[NHA_OIF]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+ filter->dev_idx = idx;
+ }
+ if (tb[NHA_MASTER]) {
+ idx = nla_get_u32(tb[NHA_MASTER]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid master device index");
return -EINVAL;
}
+ filter->master_idx = idx;
}
+ filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
+ filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
nhm = nlmsg_data(nlh);
if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
@@ -2043,24 +2057,49 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
return 0;
}
-/* rtnl */
-static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+static int nh_valid_dump_req(const struct nlmsghdr *nlh,
+ struct nh_dump_filter *filter,
+ struct netlink_callback *cb)
{
- bool group_filter = false, fdb_filter = false;
- struct nhmsg *nhm = nlmsg_data(cb->nlh);
- int dev_filter_idx = 0, master_idx = 0;
- struct net *net = sock_net(skb->sk);
- struct rb_root *root = &net->nexthop.rb_root;
- struct rb_node *node;
- int idx = 0, s_idx;
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
int err;
- err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
- &group_filter, &fdb_filter, cb);
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_dump) - 1,
+ rtm_nh_policy_dump, cb->extack);
if (err < 0)
return err;
- s_idx = cb->args[0];
+ return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_nh_ctx {
+ u32 idx;
+};
+
+static struct rtm_dump_nh_ctx *
+rtm_dump_nh_ctx(struct netlink_callback *cb)
+{
+ struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+ return ctx;
+}
+
+static int rtm_dump_walk_nexthops(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct rb_root *root,
+ struct rtm_dump_nh_ctx *ctx,
+ int (*nh_cb)(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh, void *data),
+ void *data)
+{
+ struct rb_node *node;
+ int idx = 0, s_idx;
+ int err;
+
+ s_idx = ctx->idx;
for (node = rb_first(root); node; node = rb_next(node)) {
struct nexthop *nh;
@@ -2068,30 +2107,58 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
goto cont;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
- group_filter, nhm->nh_family))
- goto cont;
-
- err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
- if (err < 0) {
- if (likely(skb->len))
- goto out;
-
- goto out_err;
- }
+ ctx->idx = idx;
+ err = nh_cb(skb, cb, nh, data);
+ if (err)
+ return err;
cont:
idx++;
}
+ ctx->idx = idx;
+ return 0;
+}
+
+static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
+ struct nexthop *nh, void *data)
+{
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ struct nh_dump_filter *filter = data;
+
+ if (nh_dump_filtered(nh, filter, nhm->nh_family))
+ return 0;
+
+ return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct nh_dump_filter filter = {};
+ int err;
+
+ err = nh_valid_dump_req(cb->nlh, &filter, cb);
+ if (err < 0)
+ return err;
+
+ err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
+ &rtm_dump_nexthop_cb, &filter);
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+ goto out_err;
+ }
+
out:
err = skb->len;
out_err:
- cb->args[0] = idx;
cb->seq = net->nexthop.seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-
return err;
}
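
rtm_dump_nh_ctx() above is the stock pattern for resumable netlink dumps:
struct netlink_callback carries a small scratch area (cb->ctx) that persists
across the successive recv() calls of a single dump, and the BUILD_BUG_ON
turns "state grew too big for the scratch area" into a compile failure
instead of a runtime corruption. The pattern in isolation, with hypothetical
names:

	struct my_dump_ctx {
		u32 idx;	/* index to resume the walk from */
	};

	static struct my_dump_ctx *my_dump_ctx(struct netlink_callback *cb)
	{
		struct my_dump_ctx *ctx = (void *)cb->ctx;

		BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
		return ctx;
	}
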
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 63cd370ea29d..6d46297a99f8 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -464,30 +464,52 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
- int i;
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1;
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1;
struct net *net = seq->private;
+ unsigned long *buff;
+ int i;
seq_puts(seq, "TcpExt:");
- for (i = 0; snmp4_net_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_net_list[i].name);
seq_puts(seq, "\nTcpExt:");
- for (i = 0; snmp4_net_list[i].name; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.net_statistics,
- snmp4_net_list[i].entry));
-
+ buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
+ GFP_KERNEL);
+ if (buff) {
+ snmp_get_cpu_field_batch(buff, snmp4_net_list,
+ net->mib.net_statistics);
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu", buff[i]);
+ } else {
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
+ }
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name; i++)
+ for (i = 0; i < ip_cnt; i++)
seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name; i++)
- seq_printf(seq, " %llu",
- snmp_fold_field64(net->mib.ip_statistics,
- snmp4_ipextstats_list[i].entry,
- offsetof(struct ipstats_mib, syncp)));
-
+ if (buff) {
+ u64 *buff64 = (u64 *)buff;
+
+ memset(buff64, 0, ip_cnt * sizeof(u64));
+ snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu", buff64[i]);
+ } else {
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+ }
+ kfree(buff);
seq_putc(seq, '\n');
mptcp_seq_show(seq);
return 0;
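
The batched read changes the access pattern, not the totals:
snmp_fold_field() walks every possible CPU once per counter, while the batch
helpers walk the CPUs once for the whole list. Roughly, the batch helper has
this shape (a sketch, simplified from the kernel's implementation):

	/* one pass over the CPUs accumulates every counter at once */
	for_each_possible_cpu(cpu)
		for (i = 0; itemlist[i].name; i++)
			buff[i] += snmp_get_cpu_field(mib, cpu,
						      itemlist[i].entry);

The kzalloc() fallback keeps the old per-field folding, so the file still
renders if the temporary buffer cannot be allocated.
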
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e26652ff7059..02d81d79deeb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,9 +133,11 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
* Interface to generic destination cache.
*/
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
-static unsigned int ipv4_mtu(const struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -1187,7 +1189,8 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
+ u32 cookie)
{
struct rtable *rt = (struct rtable *) dst;
@@ -1203,6 +1206,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
return NULL;
return dst;
}
+EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
@@ -1311,7 +1315,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
return min(advmss, IPV4_MAX_PMTU - header_size);
}
-static unsigned int ipv4_mtu(const struct dst_entry *dst)
+INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
const struct rtable *rt = (const struct rtable *)dst;
unsigned int mtu = rt->rt_pmtu;
@@ -1333,6 +1337,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
+EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
@@ -3299,6 +3304,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fri.type = rt->rt_type;
fri.offload = 0;
fri.trap = 0;
+ fri.offload_failed = 0;
if (res.fa_head) {
struct fib_alias *fa;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e5f4f2e705e..f55095d3ed16 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1354,6 +1354,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE
},
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &two,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 32545ecf2ab1..a3422e42784e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -280,6 +280,12 @@
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+/* Track pending CMSGs. */
+enum {
+ TCP_CMSG_INQ = 1,
+ TCP_CMSG_TS = 2
+};
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -475,19 +481,11 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
}
}
-static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
- int target, struct sock *sk)
+static bool tcp_stream_is_readable(struct sock *sk, int target)
{
- int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
-
- if (avail > 0) {
- if (avail >= target)
- return true;
- if (tcp_rmem_pressure(sk))
- return true;
- if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
- return true;
- }
+ if (tcp_epollin_ready(sk, target))
+ return true;
+
if (sk->sk_prot->stream_memory_read)
return sk->sk_prot->stream_memory_read(sk);
return false;
@@ -562,7 +560,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
- if (tcp_stream_is_readable(tp, target, sk))
+ if (tcp_stream_is_readable(sk, target))
mask |= EPOLLIN | EPOLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
@@ -1010,7 +1008,7 @@ new_segment:
}
if (!(flags & MSG_NO_SHARED_FRAGS))
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
skb->len += copy;
skb->data_len += copy;
@@ -1217,7 +1215,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
skb = tcp_write_queue_tail(sk);
- uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
@@ -1429,7 +1427,7 @@ out:
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- sock_zerocopy_put(uarg);
+ net_zcopy_put(uarg);
return copied + copied_syn;
do_error:
@@ -1440,7 +1438,7 @@ do_fault:
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg, true);
+ net_zcopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
@@ -1739,6 +1737,20 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_set_rcvlowat);
+static void tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping_internal *tss)
+{
+ if (skb->tstamp)
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
+ else
+ tss->ts[0] = (struct timespec64) {0};
+
+ if (skb_hwtstamps(skb)->hwtstamp)
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
+ else
+ tss->ts[2] = (struct timespec64) {0};
+}
+
#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};
@@ -1842,13 +1854,13 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
struct scm_timestamping_internal *tss,
int *cmsg_flags);
static int receive_fallback_to_copy(struct sock *sk,
- struct tcp_zerocopy_receive *zc, int inq)
+ struct tcp_zerocopy_receive *zc, int inq,
+ struct scm_timestamping_internal *tss)
{
unsigned long copy_address = (unsigned long)zc->copybuf_address;
- struct scm_timestamping_internal tss_unused;
- int err, cmsg_flags_unused;
struct msghdr msg = {};
struct iovec iov;
+ int err;
zc->length = 0;
zc->recv_skip_hint = 0;
@@ -1862,7 +1874,7 @@ static int receive_fallback_to_copy(struct sock *sk,
return err;
err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
- &tss_unused, &cmsg_flags_unused);
+ tss, &zc->msg_flags);
if (err < 0)
return err;
@@ -1903,21 +1915,27 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
return (__s32)copylen;
}
-static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
- struct sock *sk,
- struct sk_buff *skb,
- u32 *seq,
- s32 copybuf_len)
+static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len,
+ struct scm_timestamping_internal *tss)
{
u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
if (!copylen)
return 0;
/* skb is null if inq < PAGE_SIZE. */
- if (skb)
+ if (skb) {
offset = *seq - TCP_SKB_CB(skb)->seq;
- else
+ } else {
skb = tcp_recv_skb(sk, *seq, &offset);
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
+ }
+ }
zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
seq);
@@ -2004,9 +2022,38 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
err);
}
+#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping_internal *tss);
+static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
+{
+ unsigned long msg_control_addr;
+ struct msghdr cmsg_dummy;
+
+ msg_control_addr = (unsigned long)zc->msg_control;
+ cmsg_dummy.msg_control = (void *)msg_control_addr;
+ cmsg_dummy.msg_controllen =
+ (__kernel_size_t)zc->msg_controllen;
+ cmsg_dummy.msg_flags = in_compat_syscall()
+ ? MSG_CMSG_COMPAT : 0;
+ zc->msg_flags = 0;
+ if (zc->msg_control == msg_control_addr &&
+ zc->msg_controllen == cmsg_dummy.msg_controllen) {
+ tcp_recv_timestamp(&cmsg_dummy, sk, tss);
+ zc->msg_control = (__u64)
+ ((uintptr_t)cmsg_dummy.msg_control);
+ zc->msg_controllen =
+ (__u64)cmsg_dummy.msg_controllen;
+ zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
+ }
+}
+
#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
static int tcp_zerocopy_receive(struct sock *sk,
- struct tcp_zerocopy_receive *zc)
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
{
u32 length = 0, offset, vma_len, avail_len, copylen = 0;
unsigned long address = (unsigned long)zc->address;
@@ -2023,6 +2070,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
int ret;
zc->copybuf_len = 0;
+ zc->msg_flags = 0;
if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
@@ -2033,7 +2081,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk);
if (inq && inq <= copybuf_len)
- return receive_fallback_to_copy(sk, zc, inq);
+ return receive_fallback_to_copy(sk, zc, inq, tss);
if (inq < PAGE_SIZE) {
zc->length = 0;
@@ -2078,6 +2126,11 @@ static int tcp_zerocopy_receive(struct sock *sk,
} else {
skb = tcp_recv_skb(sk, seq, &offset);
}
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
+ }
zc->recv_skip_hint = skb->len - offset;
frags = skb_advance_to_frag(skb, offset, &offset_frag);
if (!frags || offset_frag)
@@ -2120,8 +2173,7 @@ out:
mmap_read_unlock(current->mm);
/* Try to copy straggler data. */
if (!ret)
- copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
- copybuf_len);
+ copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
if (length + copylen) {
WRITE_ONCE(tp->copied_seq, seq);
@@ -2142,20 +2194,6 @@ out:
}
#endif
-static void tcp_update_recv_tstamps(struct sk_buff *skb,
- struct scm_timestamping_internal *tss)
-{
- if (skb->tstamp)
- tss->ts[0] = ktime_to_timespec64(skb->tstamp);
- else
- tss->ts[0] = (struct timespec64) {0};
-
- if (skb_hwtstamps(skb)->hwtstamp)
- tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
- else
- tss->ts[2] = (struct timespec64) {0};
-}
-
/* Similar to __sock_recv_timestamp, but does not require an skb */
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
struct scm_timestamping_internal *tss)
@@ -2272,7 +2310,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
if (tp->recvmsg_inq)
- *cmsg_flags = 1;
+ *cmsg_flags = TCP_CMSG_INQ;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2453,7 +2491,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, tss);
- *cmsg_flags |= 2;
+ *cmsg_flags |= TCP_CMSG_TS;
}
if (used + offset < skb->len)
@@ -2513,9 +2551,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
release_sock(sk);
if (cmsg_flags && ret >= 0) {
- if (cmsg_flags & 2)
+ if (cmsg_flags & TCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
- if (cmsg_flags & 1) {
+ if (cmsg_flags & TCP_CMSG_INQ) {
inq = tcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
@@ -3767,11 +3805,24 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
0;
}
+/* Returns TTL or hop limit of an incoming packet from skb. */
+static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
+{
+ if (skb->protocol == htons(ETH_P_IP))
+ return ip_hdr(skb)->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return ipv6_hdr(skb)->hop_limit;
+ else
+ return 0;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
- const struct sk_buff *orig_skb)
+ const struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
@@ -3827,6 +3878,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
max_t(int, 0, tp->write_seq - tp->snd_nxt));
nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
TCP_NLA_PAD);
+ if (ack_skb)
+ nla_put_u8(stats, TCP_NLA_TTL,
+ tcp_skb_ttl_or_hop_limit(ack_skb));
return stats;
}
@@ -4083,6 +4137,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
#ifdef CONFIG_MMU
case TCP_ZEROCOPY_RECEIVE: {
+ struct scm_timestamping_internal tss;
struct tcp_zerocopy_receive zc = {};
int err;
@@ -4090,19 +4145,36 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
if (len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
- if (len > sizeof(zc)) {
+ if (unlikely(len > sizeof(zc))) {
+ err = check_zeroed_user(optval + sizeof(zc),
+ len - sizeof(zc));
+ if (err < 1)
+ return err == 0 ? -EINVAL : err;
len = sizeof(zc);
if (put_user(len, optlen))
return -EFAULT;
}
if (copy_from_user(&zc, optval, len))
return -EFAULT;
+ if (zc.reserved)
+ return -EINVAL;
+ if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
+ return -EINVAL;
lock_sock(sk);
- err = tcp_zerocopy_receive(sk, &zc);
+ err = tcp_zerocopy_receive(sk, &zc, &tss);
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+ &zc, &len, err);
release_sock(sk);
- if (len >= offsetofend(struct tcp_zerocopy_receive, err))
- goto zerocopy_rcv_sk_err;
+ if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
+ goto zerocopy_rcv_cmsg;
switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, msg_flags):
+ goto zerocopy_rcv_cmsg;
+ case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
+ case offsetofend(struct tcp_zerocopy_receive, msg_control):
+ case offsetofend(struct tcp_zerocopy_receive, flags):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
case offsetofend(struct tcp_zerocopy_receive, err):
goto zerocopy_rcv_sk_err;
case offsetofend(struct tcp_zerocopy_receive, inq):
@@ -4111,6 +4183,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
default:
goto zerocopy_rcv_out;
}
+zerocopy_rcv_cmsg:
+ if (zc.msg_flags & TCP_CMSG_TS)
+ tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
+ else
+ zc.msg_flags = 0;
zerocopy_rcv_sk_err:
if (!err)
zc.err = sock_error(sk);
@@ -4133,6 +4210,18 @@ zerocopy_rcv_out:
return 0;
}
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+ /* TCP do_tcp_getsockopt has an optimized getsockopt implementation
+ * to avoid an extra socket lock for TCP_ZEROCOPY_RECEIVE.
+ */
+ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
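
With these pieces, TCP_ZEROCOPY_RECEIVE can hand back receive timestamps in
the same control-message layout recvmsg() produces: userspace supplies a
control buffer through zc.msg_control and parses whatever the kernel wrote.
A hedged userspace sketch, assuming SO_TIMESTAMPING is already enabled for
RX timestamps and map_addr/map_len describe the mmap()ed region (setup not
shown); note the kernel advances zc.msg_control past the data it wrote,
recvmsg()-style:

	char cbuf[CMSG_SPACE(sizeof(struct scm_timestamping))];
	__u64 cbuf_addr = (__u64)(unsigned long)cbuf;
	struct tcp_zerocopy_receive zc = {
		.address	= (__u64)(unsigned long)map_addr,
		.length		= map_len,
		.msg_control	= cbuf_addr,
		.msg_controllen	= sizeof(cbuf),
	};
	socklen_t zc_len = sizeof(zc);

	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
		       &zc, &zc_len) == 0 && zc.msg_control != cbuf_addr) {
		struct msghdr msg = {
			.msg_control	= cbuf,
			.msg_controllen	= zc.msg_control - cbuf_addr,
		};
		struct cmsghdr *cm;

		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_TIMESTAMPING)
				; /* struct scm_timestamping at CMSG_DATA(cm) */
	}
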
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index c7bf5b26bf0c..ffcbe46dacdb 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -104,16 +104,7 @@ struct bictcp {
static inline void bictcp_reset(struct bictcp *ca)
{
- ca->cnt = 0;
- ca->last_max_cwnd = 0;
- ca->last_cwnd = 0;
- ca->last_time = 0;
- ca->bic_origin_point = 0;
- ca->bic_K = 0;
- ca->delay_min = 0;
- ca->epoch_start = 0;
- ca->ack_cnt = 0;
- ca->tcp_cwnd = 0;
+ memset(ca, 0, offsetof(struct bictcp, unused));
ca->found = 0;
}
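
The bictcp_reset() rewrite leans on field order: everything that must be
cleared sits before a marker member, so a single memset() covers it, and the
members declared after the marker keep their values (which is why ca->found
is still cleared by hand). The pattern in isolation, with hypothetical field
names:

	#include <stddef.h>
	#include <string.h>

	struct state {
		unsigned int a, b, c;	/* cleared by reset */
		unsigned int unused;	/* marker */
		unsigned int keep;	/* preserved across reset */
	};

	static void state_reset(struct state *s)
	{
		memset(s, 0, offsetof(struct state, unused));
	}
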
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9b44caa4b956..69a545db80d2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3146,7 +3146,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
}
static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
- u32 prior_snd_una)
+ const struct sk_buff *ack_skb, u32 prior_snd_una)
{
const struct skb_shared_info *shinfo;
@@ -3158,7 +3158,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
tcp_skb_tsorted_save(skb) {
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
} tcp_skb_tsorted_restore(skb);
}
}
@@ -3167,8 +3167,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
- u32 prior_snd_una,
+static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_fack, u32 prior_snd_una,
struct tcp_sacktag_state *sack, bool ece_ack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3257,7 +3257,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (!fully_acked)
break;
- tcp_ack_tstamp(sk, skb, prior_snd_una);
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
@@ -3275,7 +3275,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->snd_up = tp->snd_una;
if (skb) {
- tcp_ack_tstamp(sk, skb, prior_snd_una);
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
flag |= FLAG_SACK_RENEGING;
}
@@ -3810,8 +3810,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
- flag & FLAG_ECE);
+ flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
+ &sack_state, flag & FLAG_ECE);
tcp_rack_update_reo_wnd(sk, &rs);
@@ -4924,15 +4924,8 @@ err:
void tcp_data_ready(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- int avail = tp->rcv_nxt - tp->copied_seq;
-
- if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
- !sock_flag(sk, SOCK_DONE) &&
- tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
- return;
-
- sk->sk_data_ready(sk);
+ if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
+ sk->sk_data_ready(sk);
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 777306b5bc22..daad4f99db32 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1649,6 +1649,8 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
return mss;
}
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have its spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1668,7 +1670,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
- !dst->ops->check(dst, 0)) {
+ !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
+ dst, 0)) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
@@ -2793,6 +2796,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
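
INDIRECT_CALL_1() above exists to dodge a retpoline-mitigated indirect
branch on the hot path: when the function pointer equals the expected
callee, a direct call is emitted instead. With CONFIG_RETPOLINE the wrapper
expands to roughly the following (paraphrased from
include/linux/indirect_call_wrapper.h); without it, it is plain
f(__VA_ARGS__):

	#define INDIRECT_CALL_1(f, f1, ...)				\
		({							\
			likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \
		})

This is also why ipv4_dst_check() and ipv4_mtu() in route.c lose their
static qualifier (INDIRECT_CALLABLE_SCOPE) and gain EXPORT_INDIRECT_CALLABLE:
the callee symbol must be visible (and, for modular callers, exported) at
the comparison site.
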
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8478cf749821..fbf140a770d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1319,7 +1319,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_orphan(skb);
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
- skb_set_hash_from_sk(skb, sk);
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
@@ -1390,6 +1389,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcp_skb_pcount(skb));
tp->segs_out += tcp_skb_pcount(skb);
+ skb_set_hash_from_sk(skb, sk);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 69ea76578abb..4a0478b17243 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -596,6 +596,12 @@ void udp_encap_enable(void)
}
EXPORT_SYMBOL(udp_encap_enable);
+void udp_encap_disable(void)
+{
+ static_branch_dec(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
* through error handlers in encapsulations looking for a match.
*/
@@ -1124,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
- if (cgroup_bpf_enabled && !connected) {
+ if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
(struct sockaddr *)usin, &ipc.addr);
if (err)
@@ -1858,9 +1864,8 @@ try_again:
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin);
+ BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin);
}
if (udp_sk(sk)->gro_enabled)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index cfc872689b99..b76c48efd37e 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -68,8 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
(NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
features &= skb->dev->hw_enc_features;
- /* CRC checksum can't be handled by HW when it's a UDP tunneling packet. */
- features &= ~NETIF_F_SCTP_CRC;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
/* The only checksum offload we care about from here on out is the
* outer one so strip the existing checksum feature flags and
@@ -519,7 +519,8 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
- if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
+ if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+ (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
return pp;
}
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 3eecba0874aa..b97e3635acf5 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -90,15 +90,11 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_add ||
- !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
-
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ udp_tunnel_nic_add_port(dev, &ti);
}
EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
@@ -108,15 +104,11 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_del ||
- !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
-
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ udp_tunnel_nic_del_port(dev, &ti);
}
EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
@@ -134,11 +126,7 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- if (!dev->netdev_ops->ndo_udp_tunnel_add)
- continue;
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- continue;
- dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ udp_tunnel_nic_add_port(dev, &ti);
}
rcu_read_unlock();
}
@@ -158,11 +146,7 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- if (!dev->netdev_ops->ndo_udp_tunnel_del)
- continue;
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- continue;
- dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ udp_tunnel_nic_del_port(dev, &ti);
}
rcu_read_unlock();
}