diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 23 | ||||
-rw-r--r-- | net/ipv4/esp4_offload.c | 2 | ||||
-rw-r--r-- | net/ipv4/fib_lookup.h | 6 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 7 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 38 | ||||
-rw-r--r-- | net/ipv4/gre_offload.c | 22 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 25 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 1 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 6 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 9 | ||||
-rw-r--r-- | net/ipv4/ipconfig.c | 22 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_dup_ipv4.c | 18 | ||||
-rw-r--r-- | net/ipv4/nexthop.c | 347 | ||||
-rw-r--r-- | net/ipv4/proc.c | 50 | ||||
-rw-r--r-- | net/ipv4/route.c | 14 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 199 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 27 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 13 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 7 | ||||
-rw-r--r-- | net/ipv4/udp_tunnel_core.c | 24 |
24 files changed, 572 insertions, 316 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b94fa8eb831b..a02ce89b56b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -777,18 +780,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } @@ -1419,7 +1423,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, out: return segs; } -EXPORT_SYMBOL(inet_gso_segment); static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) @@ -1550,7 +1553,6 @@ out: return pp; } -EXPORT_SYMBOL(inet_gro_receive); static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) @@ -1636,7 +1638,6 @@ out_unlock: return err; } -EXPORT_SYMBOL(inet_gro_complete); static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { @@ -1871,6 +1872,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; + net->ipv4.sysctl_fib_notify_on_flag_change = 0; + return 0; } diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 5bda5aeda579..601f5fbfc63f 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -285,7 +285,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ esp.esph = ip_esp_hdr(skb); - if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + if (!hw_offload || !skb_is_gso(skb)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 818916b2a04d..b58db1ca4bfb 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -18,7 +18,8 @@ struct fib_alias { s16 fa_default; u8 offload:1, trap:1, - unused:6; + offload_failed:1, + unused:5; struct rcu_head rcu; }; @@ -39,9 +40,10 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - struct fib_rt_info *fri, unsigned int flags); + const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); +size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5400cec4f69..a632b66bc13a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -452,7 +452,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) return -1; } -static inline size_t fib_nlmsg_size(struct fib_info *fi) +size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ @@ -521,6 +521,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -1733,7 +1734,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - struct fib_rt_info *fri, unsigned int flags) + const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; @@ -1811,6 +1812,8 @@ offload: rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; + if (fri->offload_failed) + rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 28117c05dc35..25cf387cca5b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1038,6 +1038,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { struct fib_alias *fa_match; + struct sk_buff *skb; + int err; rcu_read_lock(); @@ -1045,9 +1047,42 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) if (!fa_match) goto out; + if (fa_match->offload == fri->offload && fa_match->trap == fri->trap && + fa_match->offload_failed == fri->offload_failed) + goto out; + fa_match->offload = fri->offload; fa_match->trap = fri->trap; + /* 2 means send notifications only if offload_failed was changed. */ + if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 && + fa_match->offload_failed == fri->offload_failed) + goto out; + + fa_match->offload_failed = fri->offload_failed; + + if (!net->ipv4.sysctl_fib_notify_on_flag_change) + goto out; + + skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); + goto out; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } @@ -1263,6 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); @@ -1323,6 +1359,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); @@ -2262,6 +2299,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e0a246575887..1121a9d5fed9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,7 +15,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool need_csum, need_recompute_csum, gso_partial; + bool need_csum, offload_csum, gso_partial, need_ipsec; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; @@ -41,10 +41,16 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); - need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; + if (need_csum) + features &= ~NETIF_F_SCTP_CRC; + + need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && !need_ipsec && + (skb->dev->features & NETIF_F_HW_CSUM)); /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); @@ -99,14 +105,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - if (need_recompute_csum && !skb_is_gso(skb)) { - __wsum csum; - - csum = skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0); - *pcsum = csum_fold(csum); - } else { + if (skb->encapsulation || !offload_csum) { *pcsum = gso_make_checksum(skb, 0); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = sizeof(*greh); } } while ((skb = skb->next)); out: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 45fb450b4522..c96866a53a66 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -709,6 +709,17 @@ unlock: } EXPORT_SYMBOL_GPL(inet_unhash); +/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm + * Note that we use 32bit integers (vs RFC 'short integers') + * because 2^16 is not a multiple of num_ephemeral and this + * property might be used by clever attacker. + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, + * we use 256 instead to really give more isolation and + * privacy, this only consumes 1 KB of kernel memory. + */ +#define INET_TABLE_PERTURB_SHIFT 8 +static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; + int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, @@ -722,8 +733,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; u32 remaining, offset; int ret, i, low, high; - static u32 hint; int l3mdev; + u32 index; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -750,7 +761,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - offset = (hint + port_offset) % remaining; + net_get_random_once(table_perturb, sizeof(table_perturb)); + index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + + offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ @@ -804,7 +818,12 @@ next_port: return -EADDRNOTAVAIL; ok: - hint += i + 2; + /* If our first attempt found a candidate, skip next candidate + * in 1/16 of cases to add some noise. + */ + if (!i && !(prandom_u32() % 16)) + i = 2; + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b0c244af1e4d..3a025c011971 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -253,6 +253,7 @@ int ip_local_deliver(struct sk_buff *skb) net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } +EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2ed0b01f72f0..3aab53beb4ea 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -434,6 +434,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } +EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores @@ -1018,7 +1019,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1230,8 +1231,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - if (uarg) - sock_zerocopy_put_abort(uarg, extra_uref); + net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 7ca338fbe8ba..6b2dc7b2b612 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -222,7 +222,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) .code = ICMP_FRAG_NEEDED, .checksum = 0, .un.frag.__unused = 0, - .un.frag.mtu = ntohs(mtu), + .un.frag.mtu = htons(mtu), }; icmph->checksum = ip_compute_csum(icmph, len); skb_reset_transport_header(skb); @@ -245,7 +245,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) skb->ip_summed = CHECKSUM_NONE; - eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; @@ -338,7 +338,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) skb->ip_summed = CHECKSUM_NONE; - eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; @@ -583,8 +583,9 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { - int err, rem, opt_len, opts_len = 0, type = 0; + int err, rem, opt_len, opts_len = 0; struct nlattr *nla; + __be16 type = 0; if (!attr) return 0; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 3cd13e1bc6a7..47db1bfdaaa0 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -61,7 +61,6 @@ #include <linux/export.h> #include <net/net_namespace.h> #include <net/arp.h> -#include <net/dsa.h> #include <net/ip.h> #include <net/ipconfig.h> #include <net/route.h> @@ -218,9 +217,9 @@ static int __init ic_open_devs(void) last = &ic_first_dev; rtnl_lock(); - /* bring loopback and DSA master network devices up first */ + /* bring loopback device up first */ for_each_netdev(&init_net, dev) { - if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) + if (!(dev->flags & IFF_LOOPBACK)) continue; if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); @@ -305,17 +304,32 @@ have_carrier: return 0; } +/* Close all network interfaces except the one we've autoconfigured, and its + * lowers, in case it's a stacked virtual interface. + */ static void __init ic_close_devs(void) { + struct net_device *selected_dev = ic_dev->dev; struct ic_device *d, *next; struct net_device *dev; rtnl_lock(); next = ic_first_dev; while ((d = next)) { + bool bring_down = (d != ic_dev); + struct net_device *lower_dev; + struct list_head *iter; + next = d->next; dev = d->dev; - if (d != ic_dev && !netdev_uses_dsa(dev)) { + + netdev_for_each_lower_dev(selected_dev, lower_dev, iter) { + if (dev == lower_dev) { + bring_down = false; + break; + } + } + if (bring_down) { pr_debug("IP-Config: Downing %s\n", dev->name); dev_change_flags(dev, d->flags, NULL); } diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bcdb37f86a94..aeb631760eb9 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -13,8 +13,8 @@ #include <net/netfilter/ipv4/nf_dup_ipv4.h> struct nft_dup_ipv4 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv4_eval(const struct nft_expr *expr, @@ -40,16 +40,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index e53e43aef785..f1c6cbdb9e43 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -22,7 +22,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) -static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { +static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, @@ -31,6 +31,15 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_FDB] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy rtm_nh_policy_get[] = { + [NHA_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_dump[] = { + [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, @@ -62,6 +71,7 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; @@ -76,13 +86,13 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info) kfree(info->nh); } -static int nh_notifier_grp_info_init(struct nh_notifier_info *info, - const struct nexthop *nh) +static int nh_notifier_mp_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) { - struct nh_group *nhg = rtnl_dereference(nh->nh_grp); u16 num_nh = nhg->num_nh; int i; + info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) @@ -103,27 +113,41 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, return 0; } -static void nh_notifier_grp_info_fini(struct nh_notifier_info *info) +static int nh_notifier_grp_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nhg->mpath) + return nh_notifier_mp_info_init(info, nhg); + return -EINVAL; +} + +static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, + const struct nexthop *nh) { - kfree(info->nh_grp); + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nhg->mpath) + kfree(info->nh_grp); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; - info->is_grp = nh->is_group; - if (info->is_grp) + if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } -static void nh_notifier_info_fini(struct nh_notifier_info *info) +static void nh_notifier_info_fini(struct nh_notifier_info *info, + const struct nexthop *nh) { - if (info->is_grp) - nh_notifier_grp_info_fini(info); + if (nh->is_group) + nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } @@ -152,7 +176,7 @@ static int call_nexthop_notifiers(struct net *net, err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); - nh_notifier_info_fini(&info); + nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } @@ -173,7 +197,7 @@ static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, return err; err = nb->notifier_call(nb, event_type, &info); - nh_notifier_info_fini(&info); + nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } @@ -200,7 +224,7 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) hlist_add_head(&nhi->dev_hash, head); } -static void nexthop_free_mpath(struct nexthop *nh) +static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; @@ -240,7 +264,7 @@ void nexthop_free_rcu(struct rcu_head *head) struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) - nexthop_free_mpath(nh); + nexthop_free_group(nh); else nexthop_free_single(nh); @@ -565,7 +589,8 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, return 0; } -static int nh_check_attr_group(struct net *net, struct nlattr *tb[], +static int nh_check_attr_group(struct net *net, + struct nlattr *tb[], size_t tb_size, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); @@ -624,7 +649,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], return -EINVAL; } } - for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { + for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; if (i == NHA_FDB) @@ -670,21 +695,16 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; - struct nh_group *nhg; int i; - if (!nh->is_group) - return nh; - - nhg = rcu_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->upper_bound)) + if (hash > atomic_read(&nhge->mpath.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -711,6 +731,21 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return rc; } + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nh_group *nhg; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + if (nhg->mpath) + return nexthop_select_path_mp(nhg, hash); + + /* Unreachable. */ + return NULL; +} EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, @@ -904,7 +939,7 @@ static void nh_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->upper_bound, upper_bound); + atomic_set(&nhge->mpath.upper_bound, upper_bound); } } @@ -1446,10 +1481,13 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) nhg->mpath = 1; + + WARN_ON_ONCE(nhg->mpath != 1); + + if (nhg->mpath) nh_group_rebalance(nhg); - } if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1643,11 +1681,12 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[NHA_MAX + 1]; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - extack); + err = nlmsg_parse(nlh, sizeof(*nhm), tb, + ARRAY_SIZE(rtm_nh_policy_new) - 1, + rtm_nh_policy_new, extack); if (err < 0) return err; @@ -1674,11 +1713,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { - NL_SET_ERR_MSG(extack, "Invalid attributes in request"); - goto out; - } - memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; @@ -1720,7 +1754,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, extack); + err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack); /* no other attributes should be set */ goto out; @@ -1838,49 +1872,44 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, - struct netlink_ext_ack *extack) +static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, + struct nlattr **tb, u32 *id, + struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[NHA_MAX + 1]; - int err, i; - - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - extack); - if (err < 0) - return err; - err = -EINVAL; - for (i = 0; i < __NHA_MAX; ++i) { - if (!tb[i]) - continue; - - switch (i) { - case NHA_ID: - break; - default: - NL_SET_ERR_MSG_ATTR(extack, tb[i], - "Unexpected attribute in request"); - goto out; - } - } if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); - goto out; + return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); - goto out; + return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); - if (!(*id)) + if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); - else - err = 0; -out: - return err; + return -EINVAL; + } + + return 0; +} + +static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get) - 1, + rtm_nh_policy_get, extack); + if (err < 0) + return err; + + return __nh_valid_get_del_req(nlh, tb, id, extack); } /* rtnl */ @@ -1949,16 +1978,23 @@ errout_free: goto out; } -static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, - bool group_filter, u8 family) +struct nh_dump_filter { + int dev_idx; + int master_idx; + bool group_filter; + bool fdb_filter; +}; + +static bool nh_dump_filtered(struct nexthop *nh, + struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; - if (group_filter && !nh->is_group) + if (filter->group_filter && !nh->is_group) return true; - if (!dev_idx && !master_idx && !family) + if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) @@ -1969,70 +2005,48 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, return true; dev = nhi->fib_nhc.nhc_dev; - if (dev_idx && (!dev || dev->ifindex != dev_idx)) + if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; - if (master_idx) { + if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); - if (!master || master->ifindex != master_idx) + if (!master || master->ifindex != filter->master_idx) return true; } return false; } -static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, - int *master_idx, bool *group_filter, - bool *fdb_filter, struct netlink_callback *cb) +static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, + struct nh_dump_filter *filter, + struct netlink_ext_ack *extack) { - struct netlink_ext_ack *extack = cb->extack; - struct nlattr *tb[NHA_MAX + 1]; struct nhmsg *nhm; - int err, i; u32 idx; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - NULL); - if (err < 0) - return err; - - for (i = 0; i <= NHA_MAX; ++i) { - if (!tb[i]) - continue; - - switch (i) { - case NHA_OIF: - idx = nla_get_u32(tb[i]); - if (idx > INT_MAX) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - return -EINVAL; - } - *dev_idx = idx; - break; - case NHA_MASTER: - idx = nla_get_u32(tb[i]); - if (idx > INT_MAX) { - NL_SET_ERR_MSG(extack, "Invalid master device index"); - return -EINVAL; - } - *master_idx = idx; - break; - case NHA_GROUPS: - *group_filter = true; - break; - case NHA_FDB: - *fdb_filter = true; - break; - default: - NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + if (tb[NHA_OIF]) { + idx = nla_get_u32(tb[NHA_OIF]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + filter->dev_idx = idx; + } + if (tb[NHA_MASTER]) { + idx = nla_get_u32(tb[NHA_MASTER]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } + filter->master_idx = idx; } + filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); + filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { @@ -2043,24 +2057,49 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, return 0; } -/* rtnl */ -static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +static int nh_valid_dump_req(const struct nlmsghdr *nlh, + struct nh_dump_filter *filter, + struct netlink_callback *cb) { - bool group_filter = false, fdb_filter = false; - struct nhmsg *nhm = nlmsg_data(cb->nlh); - int dev_filter_idx = 0, master_idx = 0; - struct net *net = sock_net(skb->sk); - struct rb_root *root = &net->nexthop.rb_root; - struct rb_node *node; - int idx = 0, s_idx; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; - err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, &fdb_filter, cb); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_dump) - 1, + rtm_nh_policy_dump, cb->extack); if (err < 0) return err; - s_idx = cb->args[0]; + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); +} + +struct rtm_dump_nh_ctx { + u32 idx; +}; + +static struct rtm_dump_nh_ctx * +rtm_dump_nh_ctx(struct netlink_callback *cb) +{ + struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + return ctx; +} + +static int rtm_dump_walk_nexthops(struct sk_buff *skb, + struct netlink_callback *cb, + struct rb_root *root, + struct rtm_dump_nh_ctx *ctx, + int (*nh_cb)(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, void *data), + void *data) +{ + struct rb_node *node; + int idx = 0, s_idx; + int err; + + s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; @@ -2068,30 +2107,58 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) goto cont; nh = rb_entry(node, struct nexthop, rb_node); - if (nh_dump_filtered(nh, dev_filter_idx, master_idx, - group_filter, nhm->nh_family)) - goto cont; - - err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } + ctx->idx = idx; + err = nh_cb(skb, cb, nh, data); + if (err) + return err; cont: idx++; } + ctx->idx = idx; + return 0; +} + +static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, + struct nexthop *nh, void *data) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + struct nh_dump_filter *filter = data; + + if (nh_dump_filtered(nh, filter, nhm->nh_family)) + return 0; + + return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + struct nh_dump_filter filter = {}; + int err; + + err = nh_valid_dump_req(cb->nlh, &filter, cb); + if (err < 0) + return err; + + err = rtm_dump_walk_nexthops(skb, cb, root, ctx, + &rtm_dump_nexthop_cb, &filter); + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + out: err = skb->len; out_err: - cb->args[0] = idx; cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return err; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 63cd370ea29d..6d46297a99f8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -464,30 +464,52 @@ static int snmp_seq_show(struct seq_file *seq, void *v) */ static int netstat_seq_show(struct seq_file *seq, void *v) { - int i; + const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; + const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; + unsigned long *buff; + int i; seq_puts(seq, "TcpExt:"); - for (i = 0; snmp4_net_list[i].name; i++) + for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); - for (i = 0; snmp4_net_list[i].name; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.net_statistics, - snmp4_net_list[i].entry)); - + buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), + GFP_KERNEL); + if (buff) { + snmp_get_cpu_field_batch(buff, snmp4_net_list, + net->mib.net_statistics); + for (i = 0; i < tcp_cnt; i++) + seq_printf(seq, " %lu", buff[i]); + } else { + for (i = 0; i < tcp_cnt; i++) + seq_printf(seq, " %lu", + snmp_fold_field(net->mib.net_statistics, + snmp4_net_list[i].entry)); + } seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name; i++) + for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name; i++) - seq_printf(seq, " %llu", - snmp_fold_field64(net->mib.ip_statistics, - snmp4_ipextstats_list[i].entry, - offsetof(struct ipstats_mib, syncp))); - + if (buff) { + u64 *buff64 = (u64 *)buff; + + memset(buff64, 0, ip_cnt * sizeof(u64)); + snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; i < ip_cnt; i++) + seq_printf(seq, " %llu", buff64[i]); + } else { + for (i = 0; i < ip_cnt; i++) + seq_printf(seq, " %llu", + snmp_fold_field64(net->mib.ip_statistics, + snmp4_ipextstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); + } + kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e26652ff7059..02d81d79deeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -133,9 +133,11 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; * Interface to generic destination cache. */ -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +INDIRECT_CALLABLE_SCOPE +struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); -static unsigned int ipv4_mtu(const struct dst_entry *dst); +INDIRECT_CALLABLE_SCOPE +unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -1187,7 +1189,8 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, + u32 cookie) { struct rtable *rt = (struct rtable *) dst; @@ -1203,6 +1206,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return NULL; return dst; } +EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { @@ -1311,7 +1315,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) return min(advmss, IPV4_MAX_PMTU - header_size); } -static unsigned int ipv4_mtu(const struct dst_entry *dst) +INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; @@ -1333,6 +1337,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { @@ -3299,6 +3304,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; + fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3e5f4f2e705e..f55095d3ed16 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1354,6 +1354,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "fib_notify_on_flag_change", + .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 32545ecf2ab1..a3422e42784e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -280,6 +280,12 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> +/* Track pending CMSGs. */ +enum { + TCP_CMSG_INQ = 1, + TCP_CMSG_TS = 2 +}; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -475,19 +481,11 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, - int target, struct sock *sk) +static bool tcp_stream_is_readable(struct sock *sk, int target) { - int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); - - if (avail > 0) { - if (avail >= target) - return true; - if (tcp_rmem_pressure(sk)) - return true; - if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) - return true; - } + if (tcp_epollin_ready(sk, target)) + return true; + if (sk->sk_prot->stream_memory_read) return sk->sk_prot->stream_memory_read(sk); return false; @@ -562,7 +560,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tcp_stream_is_readable(tp, target, sk)) + if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -1010,7 +1008,7 @@ new_segment: } if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -1217,7 +1215,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1429,7 +1427,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - sock_zerocopy_put(uarg); + net_zcopy_put(uarg); return copied + copied_syn; do_error: @@ -1440,7 +1438,7 @@ do_fault: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg, true); + net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { @@ -1739,6 +1737,20 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} + #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -1842,13 +1854,13 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, - struct tcp_zerocopy_receive *zc, int inq) + struct tcp_zerocopy_receive *zc, int inq, + struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; - struct scm_timestamping_internal tss_unused; - int err, cmsg_flags_unused; struct msghdr msg = {}; struct iovec iov; + int err; zc->length = 0; zc->recv_skip_hint = 0; @@ -1862,7 +1874,7 @@ static int receive_fallback_to_copy(struct sock *sk, return err; err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, - &tss_unused, &cmsg_flags_unused); + tss, &zc->msg_flags); if (err < 0) return err; @@ -1903,21 +1915,27 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, return (__s32)copylen; } -static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, - struct sock *sk, - struct sk_buff *skb, - u32 *seq, - s32 copybuf_len) +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len, + struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ - if (skb) + if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; - else + } else { skb = tcp_recv_skb(sk, *seq, &offset); + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); @@ -2004,9 +2022,38 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, err); } +#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); +static void tcp_zc_finalize_rx_tstamp(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + unsigned long msg_control_addr; + struct msghdr cmsg_dummy; + + msg_control_addr = (unsigned long)zc->msg_control; + cmsg_dummy.msg_control = (void *)msg_control_addr; + cmsg_dummy.msg_controllen = + (__kernel_size_t)zc->msg_controllen; + cmsg_dummy.msg_flags = in_compat_syscall() + ? MSG_CMSG_COMPAT : 0; + zc->msg_flags = 0; + if (zc->msg_control == msg_control_addr && + zc->msg_controllen == cmsg_dummy.msg_controllen) { + tcp_recv_timestamp(&cmsg_dummy, sk, tss); + zc->msg_control = (__u64) + ((uintptr_t)cmsg_dummy.msg_control); + zc->msg_controllen = + (__u64)cmsg_dummy.msg_controllen; + zc->msg_flags = (__u32)cmsg_dummy.msg_flags; + } +} + #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; @@ -2023,6 +2070,7 @@ static int tcp_zerocopy_receive(struct sock *sk, int ret; zc->copybuf_len = 0; + zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -2033,7 +2081,7 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) - return receive_fallback_to_copy(sk, zc, inq); + return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; @@ -2078,6 +2126,11 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) @@ -2120,8 +2173,7 @@ out: mmap_read_unlock(current->mm); /* Try to copy straggler data. */ if (!ret) - copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, - copybuf_len); + copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); @@ -2142,20 +2194,6 @@ out: } #endif -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) @@ -2272,7 +2310,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, goto out; if (tp->recvmsg_inq) - *cmsg_flags = 1; + *cmsg_flags = TCP_CMSG_INQ; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2453,7 +2491,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); - *cmsg_flags |= 2; + *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) @@ -2513,9 +2551,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, release_sock(sk); if (cmsg_flags && ret >= 0) { - if (cmsg_flags & 2) + if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); - if (cmsg_flags & 1) { + if (cmsg_flags & TCP_CMSG_INQ) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -3767,11 +3805,24 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ 0; } +/* Returns TTL or hop limit of an incoming packet from skb. */ +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return ip_hdr(skb)->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ipv6_hdr(skb)->hop_limit; + else + return 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, - const struct sk_buff *orig_skb) + const struct sk_buff *orig_skb, + const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3827,6 +3878,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); + if (ack_skb) + nla_put_u8(stats, TCP_NLA_TTL, + tcp_skb_ttl_or_hop_limit(ack_skb)); return stats; } @@ -4083,6 +4137,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { + struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; @@ -4090,19 +4145,36 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; if (len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; - if (len > sizeof(zc)) { + if (unlikely(len > sizeof(zc))) { + err = check_zeroed_user(optval + sizeof(zc), + len - sizeof(zc)); + if (err < 1) + return err == 0 ? -EINVAL : err; len = sizeof(zc); if (put_user(len, optlen)) return -EFAULT; } if (copy_from_user(&zc, optval, len)) return -EFAULT; + if (zc.reserved) + return -EINVAL; + if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) + return -EINVAL; lock_sock(sk); - err = tcp_zerocopy_receive(sk, &zc); + err = tcp_zerocopy_receive(sk, &zc, &tss); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, err)) - goto zerocopy_rcv_sk_err; + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): @@ -4111,6 +4183,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, default: goto zerocopy_rcv_out; } +zerocopy_rcv_cmsg: + if (zc.msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); + else + zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); @@ -4133,6 +4210,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c7bf5b26bf0c..ffcbe46dacdb 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -104,16 +104,7 @@ struct bictcp { static inline void bictcp_reset(struct bictcp *ca) { - ca->cnt = 0; - ca->last_max_cwnd = 0; - ca->last_cwnd = 0; - ca->last_time = 0; - ca->bic_origin_point = 0; - ca->bic_K = 0; - ca->delay_min = 0; - ca->epoch_start = 0; - ca->ack_cnt = 0; - ca->tcp_cwnd = 0; + memset(ca, 0, offsetof(struct bictcp, unused)); ca->found = 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9b44caa4b956..69a545db80d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3146,7 +3146,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, - u32 prior_snd_una) + const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; @@ -3158,7 +3158,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } @@ -3167,8 +3167,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, - u32 prior_snd_una, +static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, + u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3257,7 +3257,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) @@ -3275,7 +3275,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->snd_up = tp->snd_una; if (skb) { - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) flag |= FLAG_SACK_RENEGING; } @@ -3810,8 +3810,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, - flag & FLAG_ECE); + flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, + &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); @@ -4924,15 +4924,8 @@ err: void tcp_data_ready(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - int avail = tp->rcv_nxt - tp->copied_seq; - - if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && - !sock_flag(sk, SOCK_DONE) && - tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) - return; - - sk->sk_data_ready(sk); + if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) + sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 777306b5bc22..daad4f99db32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,8 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, return mss; } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1668,7 +1670,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - !dst->ops->check(dst, 0)) { + !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, + dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } @@ -2793,6 +2796,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8478cf749821..fbf140a770d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1319,7 +1319,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; - skb_set_hash_from_sk(skb, sk); refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); @@ -1390,6 +1389,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); + skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69ea76578abb..4a0478b17243 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -596,6 +596,12 @@ void udp_encap_enable(void) } EXPORT_SYMBOL(udp_encap_enable); +void udp_encap_disable(void) +{ + static_branch_dec(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_disable); + /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ @@ -1124,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1858,9 +1864,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index cfc872689b99..b76c48efd37e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -68,8 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); features &= skb->dev->hw_enc_features; - /* CRC checksum can't be handled by HW when it's a UDP tunneling packet. */ - features &= ~NETIF_F_SCTP_CRC; + if (need_csum) + features &= ~NETIF_F_SCTP_CRC; /* The only checksum offload we care about from here on out is the * outer one so strip the existing checksum feature flags and @@ -519,7 +519,8 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; - if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { + if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || + (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); return pp; } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 3eecba0874aa..b97e3635acf5 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,15 +90,11 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_add || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); @@ -108,15 +104,11 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_del || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); @@ -134,11 +126,7 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_add) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + udp_tunnel_nic_add_port(dev, &ti); } rcu_read_unlock(); } @@ -158,11 +146,7 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_del) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + udp_tunnel_nic_del_port(dev, &ti); } rcu_read_unlock(); } |