about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Makefile | 1
-rw-r--r-- net/ipv4/ah4.c | 4
-rw-r--r-- net/ipv4/arp.c | 2
-rw-r--r-- net/ipv4/cipso_ipv4.c | 11
-rw-r--r-- net/ipv4/datagram.c | 2
-rw-r--r-- net/ipv4/devinet.c | 208
-rw-r--r-- net/ipv4/esp4.c | 11
-rw-r--r-- net/ipv4/fib_frontend.c | 158
-rw-r--r-- net/ipv4/fib_semantics.c | 87
-rw-r--r-- net/ipv4/fib_trie.c | 37
-rw-r--r-- net/ipv4/gre_demux.c | 7
-rw-r--r-- net/ipv4/icmp.c | 4
-rw-r--r-- net/ipv4/igmp.c | 53
-rw-r--r-- net/ipv4/inet_connection_sock.c | 5
-rw-r--r-- net/ipv4/inet_hashtables.c | 2
-rw-r--r-- net/ipv4/ip_fragment.c | 27
-rw-r--r-- net/ipv4/ip_gre.c | 15
-rw-r--r-- net/ipv4/ip_input.c | 6
-rw-r--r-- net/ipv4/ip_output.c | 4
-rw-r--r-- net/ipv4/ip_sockglue.c | 3
-rw-r--r-- net/ipv4/ip_vti.c | 4
-rw-r--r-- net/ipv4/ipcomp.c | 4
-rw-r--r-- net/ipv4/ipip.c | 5
-rw-r--r-- net/ipv4/ipmr.c | 60
-rw-r--r-- net/ipv4/ipmr_base.c | 125
-rw-r--r-- net/ipv4/metrics.c | 30
-rw-r--r-- net/ipv4/netfilter/ipt_rpfilter.c | 17
-rw-r--r-- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 1
-rw-r--r-- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 22
-rw-r--r-- net/ipv4/netfilter/nf_nat_snmp_basic_main.c | 1
-rw-r--r-- net/ipv4/netfilter/nft_fib_ipv4.c | 27
-rw-r--r-- net/ipv4/ping.c | 2
-rw-r--r-- net/ipv4/raw.c | 2
-rw-r--r-- net/ipv4/route.c | 55
-rw-r--r-- net/ipv4/syncookies.c | 2
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 6
-rw-r--r-- net/ipv4/tcp.c | 58
-rw-r--r-- net/ipv4/tcp_bbr.c | 90
-rw-r--r-- net/ipv4/tcp_bpf.c | 669
-rw-r--r-- net/ipv4/tcp_cdg.c | 2
-rw-r--r-- net/ipv4/tcp_dctcp.c | 55
-rw-r--r-- net/ipv4/tcp_dctcp.h | 40
-rw-r--r-- net/ipv4/tcp_input.c | 63
-rw-r--r-- net/ipv4/tcp_ipv4.c | 8
-rw-r--r-- net/ipv4/tcp_output.c | 162
-rw-r--r-- net/ipv4/tcp_rate.c | 15
-rw-r--r-- net/ipv4/tcp_recovery.c | 5
-rw-r--r-- net/ipv4/tcp_timer.c | 2
-rw-r--r-- net/ipv4/tcp_ulp.c | 75
-rw-r--r-- net/ipv4/udp.c | 32
-rw-r--r-- net/ipv4/udp_diag.c | 1
-rw-r--r-- net/ipv4/udp_offload.c | 2
-rw-r--r-- net/ipv4/xfrm4_input.c | 1
-rw-r--r-- net/ipv4/xfrm4_mode_transport.c | 4
54 files changed, 1731 insertions, 563 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..58629314eae9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4dd95cdd8070..c01fa791260d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -461,9 +461,9 @@ static int ah4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_AH);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e90c89ef8c08..850a6f13a082 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&arp_tbl, dev);
+ if (!netif_carrier_ok(dev))
+ neigh_carrier_down(&arp_tbl, dev);
break;
default:
break;
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..777fa3b7fb13 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
*
* Description:
* Parse the packet's IP header looking for a CIPSO option. Returns a pointer
- * to the start of the CIPSO option on success, NULL if one if not found.
+ * to the start of the CIPSO option on success, NULL if one is not found.
*
*/
unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int optlen;
int taglen;
- for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
switch (optptr[0]) {
- case IPOPT_CIPSO:
- return optptr;
case IPOPT_END:
return NULL;
case IPOPT_NOOP:
@@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
default:
taglen = optptr[1];
}
+ if (!taglen || taglen > optlen)
+ return NULL;
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+
optlen -= taglen;
optptr += taglen;
}
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
- if (!oif)
+ if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ea4bd8a52422..a34602ae27de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,16 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+};
+
+struct inet_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -773,7 +783,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
}
static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
- __u32 *pvalid_lft, __u32 *pprefered_lft)
+ __u32 *pvalid_lft, __u32 *pprefered_lft,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
@@ -783,7 +794,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
- NULL);
+ extack);
if (err < 0)
goto errout;
@@ -888,7 +899,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
@@ -1584,13 +1595,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
}
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+ struct inet_fill_args *args)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
u32 preferred, valid;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
+ args->flags);
if (!nlh)
return -EMSGSIZE;
@@ -1601,6 +1613,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto nla_put_failure;
+
if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
preferred = ifa->ifa_preferred_lft;
valid = ifa->ifa_valid_lft;
@@ -1645,27 +1661,142 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ fillargs->netnsid = -1;
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct in_ifaddr *ifa;
+ int ip_idx = 0;
+ int err;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+
+ err = inet_fill_ifaddr(skb, ifa, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+ err = 0;
+
+done:
+ cb->args[2] = ip_idx;
+
+ return err;
+}
+
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = RTM_NEWADDR,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ };
struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
int h, s_h;
int idx, s_idx;
- int ip_idx, s_ip_idx;
+ int s_ip_idx;
struct net_device *dev;
struct in_device *in_dev;
- struct in_ifaddr *ifa;
struct hlist_head *head;
+ int err = 0;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- s_ip_idx = ip_idx = cb->args[2];
+ s_ip_idx = cb->args[2];
+
+ if (cb->strict_check) {
+ err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ goto put_tgt_net;
+
+ err = 0;
+ if (fillargs.ifindex) {
+ dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto put_tgt_net;
+ }
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+ &fillargs);
+ }
+ goto put_tgt_net;
+ }
+ }
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
- head = &net->dev_index_head[h];
+ head = &tgt_net->dev_index_head[h];
rcu_read_lock();
- cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
- net->dev_base_seq;
+ cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+ tgt_net->dev_base_seq;
hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
@@ -1675,18 +1806,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (!in_dev)
goto cont;
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) < 0) {
- rcu_read_unlock();
- goto done;
- }
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+ &fillargs);
+ if (err < 0) {
+ rcu_read_unlock();
+ goto done;
}
cont:
idx++;
@@ -1697,16 +1821,24 @@ cont:
done:
cb->args[0] = h;
cb->args[1] = idx;
- cb->args[2] = ip_idx;
+put_tgt_net:
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
- return skb->len;
+ return err < 0 ? err : skb->len;
}
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
+ struct inet_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ };
struct sk_buff *skb;
- u32 seq = nlh ? nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
struct net *net;
@@ -1715,7 +1847,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
if (!skb)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -1995,6 +2127,7 @@ errout:
static int inet_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
int h, s_h;
int idx, s_idx;
@@ -2002,6 +2135,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
struct in_device *in_dev;
struct hlist_head *head;
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -2021,7 +2169,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
if (inet_netconf_fill_devconf(skb, dev->ifindex,
&in_dev->cnf,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
NETCONFA_ALL) < 0) {
@@ -2038,7 +2186,7 @@ cont:
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
@@ -2049,7 +2197,7 @@ cont:
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
net->ipv4.devconf_dflt,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 97689012b357..9e1c840596c5 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -683,12 +683,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
*/
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int ivlen = crypto_aead_ivsize(aead);
- int elen = skb->len - sizeof(*esph) - ivlen;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
int assoclen;
int seqhilen;
@@ -698,13 +697,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct scatterlist *sg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
goto out;
if (elen <= 0)
goto out;
- assoclen = sizeof(*esph);
+ assoclen = sizeof(struct ip_esp_hdr);
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
@@ -820,9 +819,9 @@ static int esp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ESP);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..6df95be96311 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
}
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+ bool dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int ret;
+
+ for (ret = 0; ret < fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (fi->fib_nh[0].nh_dev == dev)
+ dev_match = true;
+#endif
+
+ return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
* - (main) check, that source is valid i.e. not broadcast or our local
* address.
@@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
fib_combine_itag(itag, &res);
- dev_match = false;
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
- dev_match = true;
- break;
- }
- }
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
+ dev_match = fib_info_nh_uses_dev(res.fi, dev);
if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
return ret;
@@ -792,19 +802,115 @@ errout:
return err;
}
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[RTA_MAX + 1];
+ struct rtmsg *rtm;
+ int err, i;
+
+ ASSERT_RTNL();
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+ return -EINVAL;
+ }
+
+ rtm = nlmsg_data(nlh);
+ if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
+ rtm->rtm_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+ return -EINVAL;
+ }
+
+ filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
+ filter->flags = rtm->rtm_flags;
+ filter->protocol = rtm->rtm_protocol;
+ filter->rt_type = rtm->rtm_type;
+ filter->table_id = rtm->rtm_table;
+
+ err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= RTA_MAX; ++i) {
+ int ifindex;
+
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_TABLE:
+ filter->table_id = nla_get_u32(tb[i]);
+ break;
+ case RTA_OIF:
+ ifindex = nla_get_u32(tb[i]);
+ filter->dev = __dev_get_by_index(net, ifindex);
+ if (!filter->dev)
+ return -ENODEV;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ if (filter->flags || filter->protocol || filter->rt_type ||
+ filter->table_id || filter->dev) {
+ filter->filter_set = 1;
+ cb->answer_flags = NLM_F_DUMP_FILTERED;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
+ struct fib_dump_filter filter = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_head *head;
int dumped = 0, err;
- if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
+ if (err < 0)
+ return err;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
+ }
+
+ /* fib entries are never clones and ipv4 does not use prefix flag */
+ if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
return skb->len;
+ if (filter.table_id) {
+ tb = fib_get_table(net, filter.table_id);
+ if (!tb) {
+ if (filter.dump_all_families)
+ return skb->len;
+
+ NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+ return -ENOENT;
+ }
+
+ err = fib_table_dump(tb, skb, cb, &filter);
+ return skb->len ? : err;
+ }
+
s_h = cb->args[0];
s_e = cb->args[1];
@@ -819,7 +925,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- err = fib_table_dump(tb, skb, cb);
+ err = fib_table_dump(tb, skb, cb, &filter);
if (err < 0) {
if (likely(skb->len))
goto out;
@@ -1243,7 +1349,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct netdev_notifier_changeupper_info *info;
+ struct netdev_notifier_changeupper_info *upper_info = ptr;
+ struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
unsigned int flags;
@@ -1278,16 +1385,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_sync_up(dev, RTNH_F_LINKDOWN);
else
fib_sync_down_dev(dev, event, false);
- /* fall through */
+ rt_cache_flush(net);
+ break;
case NETDEV_CHANGEMTU:
+ fib_sync_mtu(dev, info_ext->ext.mtu);
rt_cache_flush(net);
break;
case NETDEV_CHANGEUPPER:
- info = ptr;
+ upper_info = ptr;
/* flush all routes if dev is linked to or unlinked from
* an L3 master device (e.g., VRF)
*/
- if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ if (upper_info->upper_dev &&
+ netif_is_l3_master(upper_info->upper_dev))
fib_disable_ip(dev, NETDEV_DOWN, true);
break;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f3c89ccf14c5..b5c3937ca6ec 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
- struct dst_metrics *m;
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
@@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- m = fi->fib_metrics;
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
- kfree(m);
+ ip_fib_metrics_put(fi->fib_metrics);
+
kfree(fi);
}
@@ -797,8 +795,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
return -EINVAL;
}
dev = __dev_get_by_index(net, nh->nh_oif);
- if (!dev)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
return -ENODEV;
+ }
if (!(dev->flags & IFF_UP)) {
NL_SET_ERR_MSG(extack,
"Nexthop device is not up");
@@ -1018,13 +1018,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
return true;
}
-static int
-fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
-{
- return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
- fi->fib_metrics->metrics);
-}
-
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -1082,16 +1075,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (!fi)
goto failure;
- if (cfg->fc_mx) {
- fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
- if (unlikely(!fi->fib_metrics)) {
- kfree(fi);
- return ERR_PTR(err);
- }
- refcount_set(&fi->fib_metrics->refcnt, 1);
- } else {
- fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
+ fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
+ cfg->fc_mx_len);
+ if (unlikely(IS_ERR(fi->fib_metrics))) {
+ err = PTR_ERR(fi->fib_metrics);
+ kfree(fi);
+ return ERR_PTR(err);
}
+
fib_info_cnt++;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
@@ -1110,10 +1101,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
} endfor_nexthops(fi)
- err = fib_convert_metrics(fi, cfg);
- if (err)
- goto failure;
-
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
@@ -1470,6 +1457,56 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
return NOTIFY_DONE;
}
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ * larger MTUs on the path. With that limit raised, we can now
+ * discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ */
+static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
+{
+ struct fnhe_hash_bucket *bucket;
+ int i;
+
+ bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
+ if (!bucket)
+ return;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+ fnhe;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+ if (fnhe->fnhe_mtu_locked) {
+ if (new <= fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ fnhe->fnhe_mtu_locked = false;
+ }
+ } else if (new < fnhe->fnhe_pmtu ||
+ orig == fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ }
+ }
+ }
+}
+
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ if (nh->nh_dev == dev)
+ nh_update_mtu(nh, dev->mtu, orig_mtu);
+ }
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
}
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_dump_filter *filter)
{
+ unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
struct fib_alias *fa;
int i, s_i;
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
s_i = cb->args[4];
i = 0;
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
int err;
- if (i < s_i) {
- i++;
- continue;
- }
+ if (i < s_i)
+ goto next;
- if (tb->tb_id != fa->tb_id) {
- i++;
- continue;
+ if (tb->tb_id != fa->tb_id)
+ goto next;
+
+ if (filter->filter_set) {
+ if (filter->rt_type && fa->fa_type != filter->rt_type)
+ goto next;
+
+ if ((filter->protocol &&
+ fa->fa_info->fib_protocol != filter->protocol))
+ goto next;
+
+ if (filter->dev &&
+ !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+ goto next;
}
err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
tb->tb_id, fa->fa_type,
xkey, KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+ fa->fa_tos, fa->fa_info, flags);
if (err < 0) {
cb->args[4] = i;
return err;
}
+next:
i++;
}
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, struct fib_dump_filter *filter)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
- err = fn_trie_dump_leaf(l, tb, skb, cb);
+ err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862b6be5..7efe740c06eb 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
+ if (!skb_checksum_simple_validate(skb)) {
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ } else if (csum_err) {
*csum_err = true;
return -EINVAL;
}
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
options++;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 695979b7ef6d..d832beed6e3a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1098,9 +1098,9 @@ void icmp_err(struct sk_buff *skb, u32 info)
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
}
/*
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4da39446da2d..765b2b32c4a4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -111,13 +111,10 @@
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
-#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
+#define IGMP_QUERY_INTERVAL (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
-#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
-
#define IGMP_INITIAL_REPORT_DELAY (1)
@@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
in_dev->mr_v1_seen = jiffies +
- IGMP_V1_ROUTER_PRESENT_TIMEOUT;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
in_dev->mr_v2_seen = jiffies +
- IGMP_V2_ROUTER_PRESENT_TIMEOUT;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
}
/* cancel the interface change timer */
in_dev->mr_ifc_count = 0;
@@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
in_dev->mr_maxdelay = max_delay;
- if (ih3->qrv)
- in_dev->mr_qrv = ih3->qrv;
+
+ /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
+ * received value was zero, use the default or statically
+ * configured value.
+ */
+ in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv;
+ in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+
+ /* RFC3376, 8.3. Query Response Interval:
+ * The number of seconds represented by the [Query Response
+ * Interval] must be less than the [Query Interval].
+ */
+ if (in_dev->mr_qri >= in_dev->mr_qi)
+ in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+
if (!group) { /* general query */
if (ih3->nsrcs)
return true; /* no sources allowed */
@@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev)
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
-void ip_mc_init_dev(struct in_device *in_dev)
-{
#ifdef CONFIG_IP_MULTICAST
+static void ip_mc_reset(struct in_device *in_dev)
+{
struct net *net = dev_net(in_dev->dev);
+
+ in_dev->mr_qi = IGMP_QUERY_INTERVAL;
+ in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
+ in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
+}
+#else
+static void ip_mc_reset(struct in_device *in_dev)
+{
+}
#endif
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
- in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
+ ip_mc_reset(in_dev);
spin_lock_init(&in_dev->mc_tomb_lock);
}
@@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
-#ifdef CONFIG_IP_MULTICAST
- struct net *net = dev_net(in_dev->dev);
-#endif
ASSERT_RTNL();
-#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
-#endif
+ ip_mc_reset(in_dev);
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
for_each_pmc_rtnl(in_dev, pmc) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index dfd5009f96ef..15e7f7915a21 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct ip_options_rcu *opt;
struct rtable *rt;
- opt = ireq_opt_deref(ireq);
+ rcu_read_lock();
+ opt = rcu_dereference(ireq->ireq_opt);
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
@@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
+ rcu_read_unlock();
return &rt->dst;
route_err:
ip_rt_put(rt);
no_route:
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f5c9ef2586de..411dd7a90046 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,7 +19,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e7227128df2c..9b0158fa431f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -260,8 +260,7 @@ out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- if (head)
- kfree_skb(head);
+ kfree_skb(head);
ipq_put(qp);
}
@@ -382,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
*/
if (end < qp->q.len ||
((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
- goto err;
+ goto discard_qp;
qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
} else {
@@ -394,20 +393,20 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
if (qp->q.flags & INET_FRAG_LAST_IN)
- goto err;
+ goto discard_qp;
qp->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_qp;
err = -ENOMEM;
if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
- goto err;
+ goto discard_qp;
err = pskb_trim_rcsum(skb, end - offset);
if (err)
- goto err;
+ goto discard_qp;
/* Note : skb->rbnode and skb->dev share the same location. */
dev = skb->dev;
@@ -423,6 +422,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* We do the same here for IPv4 (and increment an snmp counter).
*/
+ err = -EINVAL;
/* Find out where to put this fragment. */
prev_tail = qp->q.fragments_tail;
if (!prev_tail)
@@ -431,7 +431,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
- goto discard_qp;
+ goto overlap;
if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
ip4_frag_append_to_last_run(&qp->q, skb);
else
@@ -450,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
FRAG_CB(skb1)->frag_run_len)
rbn = &parent->rb_right;
else /* Found an overlap with skb1. */
- goto discard_qp;
+ goto overlap;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
@@ -487,16 +487,18 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
skb->_skb_refdst = 0UL;
err = ip_frag_reasm(qp, skb, prev_tail, dev);
skb->_skb_refdst = orefdst;
+ if (err)
+ inet_frag_kill(&qp->q);
return err;
}
skb_dst_drop(skb);
return -EINPROGRESS;
+overlap:
+ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
inet_frag_kill(&qp->q);
- err = -EINVAL;
- __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
err:
kfree_skb(skb);
return err;
@@ -621,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
sub_frag_mem_limit(qp->q.net, head->truesize);
*nextp = NULL;
- head->next = NULL;
+ skb_mark_not_on_list(head);
head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
@@ -820,7 +822,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[0].data = &net->ipv4.frags.high_thresh;
table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[0].extra2 = &init_net.ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8cce0e9ea08c..38befe829caf 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -232,22 +232,19 @@ static void gre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
- bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
- iph->ihl * 4) < 0) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+ iph->ihl * 4) < 0)
+ return;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ skb->dev->ifindex, IPPROTO_GRE);
return;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+ IPPROTO_GRE);
return;
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..35a786c0aaa0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
struct sk_buff *skb, *next;
list_for_each_entry_safe(skb, next, head, list) {
- list_del(&skb->list);
- /* Handle ip{6}_forward case, as sch_direct_xmit have
- * another kind of SKB-list usage (see validate_xmit_skb_list)
- */
- skb->next = NULL;
+ skb_list_del_init(skb);
dst_input(skb);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..c09219e7f230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *nskb = segs->next;
int err;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
@@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb = frag;
frag = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
if (err == 0) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c0fe5ad996f2..26c36cccabdc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
- const struct iphdr *iph = ip_hdr(skb);
__be16 *ports;
int end;
@@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
ports = (__be16 *)skb_transport_header(skb);
sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = iph->daddr;
+ sin.sin_addr.s_addr = ip_hdr(skb)->daddr;
sin.sin_port = ports[1];
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f38cb21d773d..de31b302d69c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -318,9 +318,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ ipv4_update_pmtu(skb, net, info, 0, protocol);
else
- ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ ipv4_redirect(skb, net, 0, protocol);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d97f4f2787f5..9119d012ba46 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -48,9 +48,9 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_COMP);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..e65287c27e3d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -175,13 +175,12 @@ static int ipip_err(struct sk_buff *skb, u32 info)
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
- iph->protocol, 0);
+ ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
+ ipv4_redirect(skb, net, t->parms.link, iph->protocol);
goto out;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5660adcf7a04..a6defbec4f1b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,8 +2527,34 @@ errout_free:
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = {};
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ if (filter.dump_all_families)
+ return skb->len;
+
+ NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
- _ipmr_fill_mroute, &mfc_unres_lock);
+ _ipmr_fill_mroute, &mfc_unres_lock, &filter);
}
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
@@ -2710,6 +2736,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
return true;
}
+static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2718,6 +2769,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int e = 0, s_e;
struct mr_table *mrt;
+ if (cb->strict_check) {
+ int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
s_t = cb->args[0];
s_e = cb->args[1];
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..3e614cc824f7 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,81 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
}
EXPORT_SYMBOL(mr_fill_mroute);
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+ const struct mr_mfc *c,
+ const struct net_device *dev)
+{
+ int ct;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ if (vif->dev == dev)
+ return true;
+ }
+ }
+ return false;
+}
+
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags),
+ spinlock_t *lock, struct fib_dump_filter *filter)
+{
+ unsigned int e = 0, s_e = cb->args[1];
+ unsigned int flags = NLM_F_MULTI;
+ struct mr_mfc *mfc;
+ int err;
+
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ if (e < s_e)
+ goto next_entry;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0)
+ goto out;
+next_entry:
+ e++;
+ }
+
+ spin_lock_bh(lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry2;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0) {
+ spin_unlock_bh(lock);
+ goto out;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(lock);
+ err = 0;
+ e = 0;
+
+out:
+ cb->args[1] = e;
+ return err;
+}
+EXPORT_SYMBOL(mr_table_dump);
+
int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct mr_table *(*iter)(struct net *net,
struct mr_table *mrt),
@@ -275,53 +350,35 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct sk_buff *skb,
u32 portid, u32 seq, struct mr_mfc *c,
int cmd, int flags),
- spinlock_t *lock)
+ spinlock_t *lock, struct fib_dump_filter *filter)
{
- unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+ unsigned int t = 0, s_t = cb->args[0];
struct net *net = sock_net(skb->sk);
struct mr_table *mrt;
- struct mr_mfc *mfc;
+ int err;
+
+ /* multicast does not track protocol or have route type other
+ * than RTN_MULTICAST
+ */
+ if (filter->filter_set) {
+ if (filter->protocol || filter->flags ||
+ (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+ return skb->len;
+ }
rcu_read_lock();
for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
if (t < s_t)
goto next_table;
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
- if (e < s_e)
- goto next_entry;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0)
- goto done;
-next_entry:
- e++;
- }
- e = 0;
- s_e = 0;
-
- spin_lock_bh(lock);
- list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
- if (e < s_e)
- goto next_entry2;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0) {
- spin_unlock_bh(lock);
- goto done;
- }
-next_entry2:
- e++;
- }
- spin_unlock_bh(lock);
- e = 0;
- s_e = 0;
+
+ err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
+ if (err < 0)
+ break;
next_table:
t++;
}
-done:
rcu_read_unlock();
- cb->args[1] = e;
cb->args[0] = t;
return skb->len;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..6d218f5a2e71 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -5,8 +5,8 @@
#include <net/net_namespace.h>
#include <net/tcp.h>
-int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
- u32 *metrics)
+static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
+ int fc_mx_len, u32 *metrics)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
return 0;
}
-EXPORT_SYMBOL_GPL(ip_metrics_convert);
+
+struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+ int fc_mx_len)
+{
+ struct dst_metrics *fib_metrics;
+ int err;
+
+ if (!fc_mx)
+ return (struct dst_metrics *)&dst_default_metrics;
+
+ fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
+ if (unlikely(!fib_metrics))
+ return ERR_PTR(-ENOMEM);
+
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ if (!err) {
+ refcount_set(&fib_metrics->refcnt, 1);
+ } else {
+ kfree(fib_metrics);
+ fib_metrics = ERR_PTR(err);
+ }
+
+ return fib_metrics;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 12843c9ef142..0b10d8812828 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
- bool dev_match;
int ret __maybe_unused;
if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
@@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
return false;
}
- dev_match = false;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
-
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- }
- }
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
- return dev_match || flags & XT_RPFILTER_LOOSE;
+ return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 6115bf1ff6f0..78a67f961d86 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -264,7 +264,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
return nf_nat_inet_fn(priv, skb, state);
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
static unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index ad3aeff152ed..a9d5e013e555 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -104,12 +104,26 @@ static int masq_device_event(struct notifier_block *this,
return NOTIFY_DONE;
}
+static int inet_cmp(struct nf_conn *ct, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct nf_conntrack_tuple *tuple;
+
+ if (!device_cmp(ct, (void *)(long)dev->ifindex))
+ return 0;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ return ifa->ifa_address == tuple->dst.u3.ip;
+}
+
static int masq_inet_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct netdev_notifier_info info;
+ struct net *net = dev_net(idev->dev);
/* The masq_dev_notifier will catch the case of the device going
* down. So if the inetdev is dead and being destroyed we have
@@ -119,8 +133,10 @@ static int masq_inet_event(struct notifier_block *this,
if (idev->dead)
return NOTIFY_DONE;
- netdev_notifier_info_init(&info, idev->dev);
- return masq_device_event(this, event, &info);
+ if (event == NETDEV_DOWN)
+ nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+
+ return NOTIFY_DONE;
}
static struct notifier_block masq_dev_notifier = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index ac110c1d55b5..a0aa13bcabda 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -60,6 +60,7 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
MODULE_ALIAS("ip_nat_snmp_basic");
+MODULE_ALIAS_NFCT_HELPER("snmp_trap");
#define SNMP_PORT 161
#define SNMP_TRAP_PORT 162
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index e50976e3c213..94eb25bc8d7e 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
.flowi4_iif = LOOPBACK_IFINDEX,
};
const struct net_device *oif;
- struct net_device *found;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- int i;
-#endif
+ const struct net_device *found;
/*
* Do not set flowi4_oif, it restricts results (for example, asking
@@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (!oif) {
found = FIB_RES_DEV(res);
- goto ok;
- }
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (i = 0; i < res.fi->fib_nhs; i++) {
- struct fib_nh *nh = &res.fi->fib_nh[i];
+ } else {
+ if (!fib_info_nh_uses_dev(res.fi, oif))
+ return;
- if (nh->nh_dev == oif) {
- found = nh->nh_dev;
- goto ok;
- }
+ found = oif;
}
- return;
-#else
- found = FIB_RES_DEV(res);
- if (found != oif)
- return;
-#endif
-ok:
+
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
*dest = found->ifindex;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 33df4d76db2d..8ca3eb06ba04 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
tos |= RTO_ONLINK;
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b678466da451..c0a9d26c06ce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1001,21 +1001,22 @@ out: kfree_skb(skb);
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
+ u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
if (ip_mtu_locked(dst))
return;
- if (ipv4_mtu(dst) < mtu)
+ if (old_mtu < mtu)
return;
if (mtu < ip_rt_min_pmtu) {
lock = true;
- mtu = ip_rt_min_pmtu;
+ mtu = min(old_mtu, ip_rt_min_pmtu);
}
- if (rt->rt_pmtu == mtu &&
+ if (rt->rt_pmtu == mtu && !lock &&
time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
return;
@@ -1040,17 +1041,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
-
- if (!mark)
- mark = IP4_REPLY_MARK(net, skb->mark);
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, mark, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1132,14 +1131,14 @@ out:
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
@@ -1219,18 +1218,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
src = ip_hdr(skb)->saddr;
else {
struct fib_result res;
- struct flowi4 fl4;
- struct iphdr *iph;
-
- iph = ip_hdr(skb);
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = iph->daddr;
- fl4.saddr = iph->saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = rt->dst.dev->ifindex;
- fl4.flowi4_iif = skb->dev->ifindex;
- fl4.flowi4_mark = skb->mark;
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = rt->dst.dev->ifindex,
+ .flowi4_iif = skb->dev->ifindex,
+ .flowi4_mark = skb->mark,
+ };
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
@@ -1481,12 +1477,9 @@ void rt_del_uncached_list(struct rtable *rt)
static void ipv4_dst_destroy(struct dst_entry *dst)
{
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *)dst;
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
- kfree(p);
-
+ ip_dst_metrics_put(dst);
rt_del_uncached_list(rt);
}
@@ -1533,11 +1526,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
- if (fi->fib_metrics != &dst_default_metrics) {
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- refcount_inc(&fi->fib_metrics->refcnt);
- }
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
@@ -2785,7 +2775,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct rtable *rt = NULL;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi4 fl4;
+ struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
kuid_t uid;
@@ -2825,7 +2815,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (!skb)
return -ENOBUFS;
- memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
fl4.flowi4_tos = rtm->rtm_tos;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c3387dfd725b..606f868d9f3f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
ts <<= TSBITS;
ts |= options;
}
- return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
+ return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b92f422f2fa8..891ed2f91467 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255;
+static u32 u32_max_div_HZ = UINT_MAX / HZ;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_probe_interval",
.data = &init_net.ipv4.sysctl_tcp_probe_interval,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &u32_max_div_HZ,
},
{
.procname = "igmp_link_local_mcast_reports",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 10c6246396cc..9e6bc4d6daa7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -262,7 +262,7 @@
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
@@ -1295,7 +1295,7 @@ new_segment:
copy = size_goal;
/* All packets are restored as if they have
- * already been sent. skb_mstamp isn't set to
+ * already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
@@ -1753,6 +1753,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
struct tcp_sock *tp;
+ int inq;
int ret;
if (address & (PAGE_SIZE - 1) || address != zc->address)
@@ -1773,12 +1774,15 @@ static int tcp_zerocopy_receive(struct sock *sk,
tp = tcp_sk(sk);
seq = tp->copied_seq;
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
+ inq = tcp_inq(sk);
+ zc->length = min_t(u32, zc->length, inq);
zc->length &= ~(PAGE_SIZE - 1);
-
- zap_page_range(vma, address, zc->length);
-
- zc->recv_skip_hint = 0;
+ if (zc->length) {
+ zap_page_range(vma, address, zc->length);
+ zc->recv_skip_hint = 0;
+ } else {
+ zc->recv_skip_hint = inq;
+ }
ret = 0;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
@@ -1801,8 +1805,17 @@ static int tcp_zerocopy_receive(struct sock *sk,
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset)
+ if (frags->size != PAGE_SIZE || frags->page_offset) {
+ int remaining = zc->recv_skip_hint;
+
+ while (remaining && (frags->size != PAGE_SIZE ||
+ frags->page_offset)) {
+ remaining -= frags->size;
+ frags++;
+ }
+ zc->recv_skip_hint -= remaining;
break;
+ }
ret = vm_insert_page(vma, address + length,
skb_frag_page(frags));
if (ret)
@@ -2403,16 +2416,10 @@ adjudge_to_death:
sock_hold(sk);
sock_orphan(sk);
- /* It is the last release_sock in its life. It will remove backlog. */
- release_sock(sk);
-
-
- /* Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
local_bh_disable();
bh_lock_sock(sk);
- WARN_ON(sock_owned_by_user(sk));
+ /* remove backlog if any, without releasing ownership. */
+ __release_sock(sk);
percpu_counter_inc(sk->sk_prot->orphan_count);
@@ -2481,6 +2488,7 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+ release_sock(sk);
sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);
@@ -2595,6 +2603,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->compressed_ack = 0;
tp->bytes_sent = 0;
tp->bytes_retrans = 0;
+ tp->duplicate_sack[0].start_seq = 0;
+ tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
tp->reord_seen = 0;
@@ -3101,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long rate;
u32 now;
u64 rate64;
bool slow;
- u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
@@ -3114,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_pacing_rate = rate64;
rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
@@ -3244,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ unsigned long rate;
u64 rate64;
- u32 rate;
stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
@@ -3264,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->total_retrans, TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
rate64 = tcp_compute_delivery_rate(tp);
@@ -3894,8 +3904,8 @@ void __init tcp_init(void)
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
- init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 02ff2dde9609..9277abdd822a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+static const int bbr_pacing_margin_percent = 1;
+
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
@@ -208,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
unsigned int mss = tcp_sk(sk)->mss_cache;
- if (!tcp_needs_internal_pacing(sk))
- mss = tcp_mss_to_mtu(sk, mss);
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
- rate *= USEC_PER_SEC;
+ rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
return rate >> BW_SCALE;
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
@@ -257,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
@@ -279,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
@@ -368,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
return cwnd;
}
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 now_ns, edt_ns, interval_us;
+ u32 interval_delivered, inflight_at_edt;
+
+ now_ns = tp->tcp_clock_cache;
+ edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+ inflight_at_edt = inflight_now;
+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+ if (interval_delivered >= inflight_at_edt)
+ return 0;
+ return inflight_at_edt - interval_delivered;
+}
+
/* An optimization in BBR to reduce losses: On the first round of recovery, we
* follow the packet conservation principle: send P packets per P packets acked.
* After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -459,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
if (bbr->pacing_gain == BBR_UNIT)
return is_full_length; /* just use wall clock time */
- inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
bw = bbr_max_bw(sk);
/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -487,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
bbr->cycle_mstamp = tp->delivered_mstamp;
- bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
- bbr_pacing_gain[bbr->cycle_idx];
}
/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -506,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_STARTUP;
- bbr->pacing_gain = bbr_high_gain;
- bbr->cwnd_gain = bbr_high_gain;
}
static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -515,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_PROBE_BW;
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = bbr_cwnd_gain;
bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
}
@@ -734,13 +762,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
- bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
- bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
tcp_sk(sk)->snd_ssthresh =
bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
- tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
@@ -797,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = BBR_UNIT;
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
bbr->probe_rtt_done_stamp = 0;
}
@@ -826,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
bbr->idle_restart = 0;
}
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = (bbr->lt_use_bw ?
+ BBR_UNIT :
+ bbr_pacing_gain[bbr->cycle_idx]);
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
@@ -833,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
bbr_update_min_rtt(sk, rs);
+ bbr_update_gains(sk);
}
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..3b45fe530f91
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,669 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+ struct msghdr *msg, int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ int i, ret, copied = 0;
+ struct sk_msg *msg_rx;
+
+ msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+ struct sk_msg, list);
+
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ ret = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (ret != copy) {
+ msg_rx->sg.start = i;
+ return -EFAULT;
+ }
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = list_next_entry(msg_rx, list);
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ list_del(&msg_rx->list);
+ if (msg_rx->skb)
+ consume_skb(msg_rx->skb);
+ kfree(msg_rx);
+ }
+ msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+ struct sk_msg, list);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (skb_queue_empty(&sk->sk_receive_queue))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ u32 size, copied = 0;
+ struct sk_msg *tmp;
+ int i, ret = 0;
+
+ tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ tmp->sg.start = msg->sg.start;
+ i = msg->sg.start;
+ do {
+ sge = sk_msg_elem(msg, i);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ if (!sk_wmem_schedule(sk, size)) {
+ if (!copied)
+ ret = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ sk_msg_xfer(tmp, msg, i, size);
+ copied += size;
+ if (sge->length)
+ get_page(sk_msg_page(tmp, i));
+ sk_msg_iter_var_next(i);
+ tmp->sg.end = i;
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes)
+ break;
+ }
+ } while (i != msg->sg.end);
+
+ if (!ret) {
+ msg->sg.start = i;
+ msg->sg.size -= apply_bytes;
+ sk_psock_queue_msg(psock, tmp);
+ sk->sk_data_ready(sk);
+ } else {
+ sk_msg_free(sk, tmp);
+ kfree(tmp);
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+ int flags, bool uncharge)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ struct page *page;
+ int size, ret = 0;
+ u32 off;
+
+ while (1) {
+ sge = sk_msg_elem(msg, msg->sg.start);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ off = sge->offset;
+ page = sg_page(sge);
+
+ tcp_rate_check_app_limited(sk);
+retry:
+ ret = do_tcp_sendpages(sk, page, off, size, flags);
+ if (ret <= 0)
+ return ret;
+ if (apply)
+ apply_bytes -= ret;
+ msg->sg.size -= ret;
+ sge->offset += ret;
+ sge->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ if (ret != size) {
+ size -= ret;
+ off += ret;
+ goto retry;
+ }
+ if (!sge->length) {
+ put_page(page);
+ sk_msg_iter_next(msg, start);
+ sg_init_table(sge, 1);
+ if (msg->sg.start == msg->sg.end)
+ break;
+ }
+ if (apply && !apply_bytes)
+ break;
+ }
+
+ return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+ u32 apply_bytes, int flags, bool uncharge)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+ release_sock(sk);
+ return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, int flags)
+{
+ bool ingress = sk_msg_to_ingress(msg);
+ struct sk_psock *psock = sk_psock_get(sk);
+ int ret;
+
+ if (unlikely(!psock)) {
+ sk_msg_free(sk, msg);
+ return 0;
+ }
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+ tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, int *copied, int flags)
+{
+ bool cork = false, enospc = msg->sg.start == msg->sg.end;
+ struct sock *sk_redir;
+ u32 tosend;
+ int ret;
+
+more_data:
+ if (psock->eval == __SK_NONE)
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+ if (msg->cork_bytes &&
+ msg->cork_bytes > msg->sg.size && !enospc) {
+ psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+ if (!psock->cork) {
+ psock->cork = kzalloc(sizeof(*psock->cork),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!psock->cork)
+ return -ENOMEM;
+ }
+ memcpy(psock->cork, msg, sizeof(*msg));
+ return 0;
+ }
+
+ tosend = msg->sg.size;
+ if (psock->apply_bytes && psock->apply_bytes < tosend)
+ tosend = psock->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+ if (unlikely(ret)) {
+ *copied -= sk_msg_free(sk, msg);
+ break;
+ }
+ sk_msg_apply_bytes(psock, tosend);
+ break;
+ case __SK_REDIRECT:
+ sk_redir = psock->sk_redir;
+ sk_msg_apply_bytes(psock, tosend);
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+ sk_msg_return(sk, msg, tosend);
+ release_sock(sk);
+ ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+ lock_sock(sk);
+ if (unlikely(ret < 0)) {
+ int free = sk_msg_free_nocharge(sk, msg);
+
+ if (!cork)
+ *copied -= free;
+ }
+ if (cork) {
+ sk_msg_free(sk, msg);
+ kfree(msg);
+ msg = NULL;
+ ret = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free_partial(sk, msg, tosend);
+ sk_msg_apply_bytes(psock, tosend);
+ *copied -= tosend;
+ return -EACCES;
+ }
+
+ if (likely(!ret)) {
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (msg &&
+ msg->sg.data[msg->sg.start].page_link &&
+ msg->sg.data[msg->sg.start].length)
+ goto more_data;
+ }
+ return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_msg tmp, *msg_tx = NULL;
+ int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+ int copied = 0, err = 0;
+ struct sk_psock *psock;
+ long timeo;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendmsg(sk, msg, size);
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ while (msg_data_left(msg)) {
+ bool enospc = false;
+ u32 copy, osize;
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+ if (psock->cork) {
+ msg_tx = psock->cork;
+ } else {
+ msg_tx = &tmp;
+ sk_msg_init(msg_tx);
+ }
+
+ osize = msg_tx->sg.size;
+ err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = msg_tx->sg.size - osize;
+ }
+
+ err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ copy);
+ if (err < 0) {
+ sk_msg_trim(sk, msg_tx, osize);
+ goto out_err;
+ }
+
+ copied += copy;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err) {
+ if (msg_tx && msg_tx != psock->cork)
+ sk_msg_free(sk, msg_tx);
+ goto out_err;
+ }
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct sk_msg tmp, *msg = NULL;
+ int err = 0, copied = 0;
+ struct sk_psock *psock;
+ bool enospc = false;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendpage(sk, page, offset, size, flags);
+
+ lock_sock(sk);
+ if (psock->cork) {
+ msg = psock->cork;
+ } else {
+ msg = &tmp;
+ sk_msg_init(msg);
+ }
+
+ /* Catch case where ring is full and sendpage is stalled. */
+ if (unlikely(sk_msg_full(msg)))
+ goto out_err;
+
+ sk_msg_page_add(msg, page, size, offset);
+ sk_mem_charge(sk, size);
+ copied = size;
+ if (sk_msg_full(msg))
+ enospc = true;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ sk_psock_cork_free(psock);
+ __sk_psock_purge_ingress_msg(psock);
+ while ((link = sk_psock_link_pop(psock))) {
+ sk_psock_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+static void tcp_bpf_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ release_sock(sk);
+ saved_close(sk, timeout);
+}
+
+enum {
+ TCP_BPF_IPV4,
+ TCP_BPF_IPV6,
+ TCP_BPF_NUM_PROTS,
+};
+
+enum {
+ TCP_BPF_BASE,
+ TCP_BPF_TX,
+ TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+ struct proto *base)
+{
+ prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
+ prot[TCP_BPF_BASE].close = tcp_bpf_close;
+ prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
+ prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
+
+ prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+ prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(ops != tcpv6_prot_saved)) {
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+ smp_store_release(&tcpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
+ }
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+ return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
+ * or added requiring sk_prot hook updates. We keep original saved
+ * hooks in this case.
+ */
+ sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+ /* In order to avoid retpoline, we make assumptions when we call
+ * into ops if e.g. a psock is not present. Make sure they are
+ * indeed valid assumptions.
+ */
+ return ops->recvmsg == tcp_recvmsg &&
+ ops->sendmsg == tcp_sendmsg &&
+ ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
+}
+
+void tcp_bpf_reinit(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ tcp_bpf_reinit_sk_prot(sk, psock);
+ rcu_read_unlock();
+}
+
+int tcp_bpf_init(struct sock *sk)
+{
+ struct proto *ops = READ_ONCE(sk->sk_prot);
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock || psock->sk_proto ||
+ tcp_bpf_assert_proto_ops(ops))) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ tcp_bpf_update_sk_prot(sk, psock);
+ rcu_read_unlock();
+ return 0;
+}
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 06fbe102a425..37eebd910396 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
return;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+ u32 now_us = tp->tcp_mstamp;
if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
ca->last_ack = now_us;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ca61e2a659e7..cd4814f7e962 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -44,6 +44,7 @@
#include <linux/mm.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
+#include "tcp_dctcp.h"
#define DCTCP_MAX_ALPHA 1024U
@@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk)
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
-/* Minimal DCTP CE state machine:
- *
- * S: 0 <- last pkt was non-CE
- * 1 <- last pkt was CE
- */
-
-static void dctcp_ce_state_0_to_1(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!ca->ce_state) {
- /* State has changed from CE=0 to CE=1, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 1;
-
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
-}
-
-static void dctcp_ce_state_1_to_0(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (ca->ce_state) {
- /* State has changed from CE=1 to CE=0, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 0;
-
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-}
-
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state)
static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
+ struct dctcp *ca = inet_csk_ca(sk);
+
switch (ev) {
case CA_EVENT_ECN_IS_CE:
- dctcp_ce_state_0_to_1(sk);
- break;
case CA_EVENT_ECN_NO_CE:
- dctcp_ce_state_1_to_0(sk);
+ dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
break;
default:
/* Don't care for the rest. */
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 000000000000..d69a77cbd0c7
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
+#ifndef _TCP_DCTCP_H
+#define _TCP_DCTCP_H
+
+static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (ce_state == 1)
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ else
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+ u32 *prior_rcv_nxt, u32 *ce_state)
+{
+ u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+ if (*ce_state != new_ce_state) {
+ /* CE state has changed, force an immediate ACK to
+ * reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ dctcp_ece_ack_cwr(sk, *ce_state);
+ __tcp_send_ack(sk, *prior_rcv_nxt);
+ }
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+ *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+ *ce_state = new_ce_state;
+ dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+#endif
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4cf2f7bb2802..2868ef28ce52 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
}
}
-/* 3. Tuning rcvbuf, when connection enters established state. */
-static void tcp_fixup_rcvbuf(struct sock *sk)
-{
- u32 mss = tcp_sk(sk)->advmss;
- int rcvmem;
-
- rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
- tcp_default_init_rwnd(mss);
-
- /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
- * Allow enough cushion so that sender is not limited by our window
- */
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
- rcvmem <<= 2;
-
- if (sk->sk_rcvbuf < rcvmem)
- sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
-}
-
-/* 4. Try to fixup all. It is made immediately after connection enters
+/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
void tcp_init_buffer_space(struct sock *sk)
@@ -454,12 +435,10 @@ void tcp_init_buffer_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
- tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
@@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk)
tp->snd_cwnd_stamp = tcp_jiffies32;
}
-/* 5. Recalculate window clamp after socket hit its memory bounds. */
+/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1305,7 +1284,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
@@ -1580,7 +1559,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);
@@ -3000,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
*/
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX, tcp_rtx_queue_head(sk));
}
}
@@ -3103,7 +3082,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else if (!(sacked & TCPCB_SACKED_ACKED)) {
- last_ackt = skb->skb_mstamp;
+ last_ackt = tcp_skb_timestamp_us(skb);
WARN_ON_ONCE(last_ackt == 0);
if (!first_ackt)
first_ackt = last_ackt;
@@ -3121,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3215,7 +3194,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+ tcp_skb_timestamp_us(skb))) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3275,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
} else {
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- when, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ when, TCP_RTO_MAX, NULL);
}
}
@@ -4199,6 +4179,17 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+{
+ /* When the ACK path fails or drops most ACKs, the sender would
+ * timeout and spuriously retransmit the same segment repeatedly.
+ * The receiver remembers and reflects via DSACKs. Leverage the
+ * DSACK state and change the txhash to re-route speculatively.
+ */
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
+ sk_rethink_txhash(sk);
+}
+
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4211,6 +4202,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_spurious_retrans(sk, skb);
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
@@ -4755,6 +4747,7 @@ queue_and_out:
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -6009,11 +6002,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (th->fin)
goto discard;
/* It is possible that we process SYN packets from backlog,
- * so we need to make sure to disable BH right there.
+ * so we need to make sure to disable BH and RCU right there.
*/
+ rcu_read_lock();
local_bh_disable();
acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
+ rcu_read_unlock();
if (!acceptable)
return 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 44c09eddbb78..de47038afdf0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
BUG_ON(!skb);
tcp_mstamp_refresh(tp);
- delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
remaining = icsk->icsk_rto -
usecs_to_jiffies(delta_us);
@@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- ireq_opt_deref(ireq));
+ rcu_dereference(ireq->ireq_opt));
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -2549,7 +2551,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05..9c34b97d365d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,21 @@
#include <trace/events/tcp.h>
+/* Refresh clocks of a TCP socket,
+ * ensuring monotonically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_ns();
+
+ if (val > tp->tcp_clock_cache)
+ tp->tcp_clock_cache = val;
+
+ val = div_u64(val, NSEC_PER_USEC);
+ if (val > tp->tcp_mstamp)
+ tp->tcp_mstamp = val;
+}
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -179,21 +194,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-
-u32 tcp_default_init_rwnd(u32 mss)
-{
- /* Initial receive window should be twice of TCP_INIT_CWND to
- * enable proper sending of new unsent data during fast recovery
- * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
- * limit when mss is larger than 1460.
- */
- u32 init_rwnd = TCP_INIT_CWND * 2;
-
- if (mss > 1460)
- init_rwnd = max((1460 * init_rwnd) / mss, 2U);
- return init_rwnd;
-}
-
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
@@ -228,7 +228,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
- (*rcv_wnd) = space;
+ (*rcv_wnd) = min_t(u32, space, U16_MAX);
+
+ if (init_rcv_wnd)
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
(*rcv_wscale) = 0;
if (wscale_ok) {
@@ -241,11 +244,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
(*rcv_wscale)++;
}
}
-
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
-
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
@@ -977,28 +975,28 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+ u64 prior_wstamp)
{
- u64 len_ns;
- u32 rate;
+ struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_needs_internal_pacing(sk))
- return;
- rate = sk->sk_pacing_rate;
- if (!rate || rate == ~0U)
- return;
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
+ unsigned long rate = sk->sk_pacing_rate;
- len_ns = (u64)skb->len * NSEC_PER_SEC;
- do_div(len_ns, rate);
- hrtimer_start(&tcp_sk(sk)->pacing_timer,
- ktime_add_ns(ktime_get(), len_ns),
- HRTIMER_MODE_ABS_PINNED_SOFT);
- sock_hold(sk);
-}
+ /* Original sch_fq does not pace first 10 MSS
+ * Note that tp->data_segs_out overflows after 2^32 packets,
+ * this is a minor annoyance.
+ */
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
- skb->skb_mstamp = tp->tcp_mstamp;
+ /* take into account OS jitter */
+ len_ns -= min_t(u64, len_ns / 2, credit);
+ tp->tcp_wstamp_ns += len_ns;
+ }
+ }
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
@@ -1025,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
+ u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1045,7 +1044,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
}
- skb->skb_mstamp = tp->tcp_mstamp;
+
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
@@ -1137,7 +1140,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
tp->bytes_sent += skb->len - tcp_header_size;
- tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1149,8 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
- /* Our usage of tstamp should remain private */
- skb->tstamp = 0;
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1163,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = net_xmit_eval(err);
}
if (!err && oskb) {
- tcp_update_skb_after_send(tp, oskb);
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1698,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
{
u32 bytes, segs;
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
@@ -1966,7 +1968,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+ age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
goto send_now;
@@ -2172,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
-static bool tcp_pacing_check(const struct sock *sk)
+static bool tcp_pacing_check(struct sock *sk)
{
- return tcp_needs_internal_pacing(sk) &&
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_needs_internal_pacing(sk))
+ return false;
+
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+ return false;
+
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
+ hrtimer_start(&tp->pacing_timer,
+ ns_to_ktime(tp->tcp_wstamp_ns),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
+ }
+ return true;
}
/* TCP Small Queues :
@@ -2192,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk)
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
- unsigned int limit;
+ unsigned long limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(u32, limit,
+ limit = max_t(unsigned long,
+ 2 * skb->truesize,
+ sk->sk_pacing_rate >> sk->sk_pacing_shift);
+ limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
@@ -2304,18 +2321,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ goto repair; /* Skip network transmission */
+ }
+
if (tcp_pacing_check(sk))
break;
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
- /* "skb_mstamp" is used as a start point for the retransmit timer */
- tcp_update_skb_after_send(tp, skb);
- goto repair; /* Skip network transmission */
- }
-
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
if (push_one == 2)
@@ -2437,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX, NULL);
return true;
}
@@ -2887,7 +2905,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
} tcp_skb_tsorted_restore(skb);
if (!err) {
- tcp_update_skb_after_send(tp, skb);
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
@@ -3002,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX,
+ skb);
}
}
@@ -3205,10 +3224,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp = cookie_init_timestamp(req);
+ skb->skb_mstamp_ns = cookie_init_timestamp(req);
else
#endif
- skb->skb_mstamp = tcp_clock_us();
+ skb->skb_mstamp_ns = tcp_clock_ns();
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
@@ -3424,7 +3443,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp = syn_data->skb_mstamp;
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -3734,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
icsk->icsk_probes_out = 1;
probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ tcp_probe0_when(sk, probe_max),
+ TCP_RTO_MAX,
+ NULL);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 4dff40dad4dc..baed2186c7c6 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
* bandwidth estimate.
*/
if (!tp->packets_out) {
- tp->first_tx_mstamp = skb->skb_mstamp;
- tp->delivered_mstamp = skb->skb_mstamp;
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
}
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
@@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(
- skb->skb_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = skb->skb_mstamp;
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c81aadff769b..fdb715bdd2d1 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
return tp->rack.rtt_us + reo_wnd -
- tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
@@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+ if (!tcp_rack_sent_after(tp->rack.mstamp,
+ tcp_skb_timestamp_us(skb),
tp->rack.end_seq, scb->end_seq))
break;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7fdf222a0bdf..676020663ce8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
*/
start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
- skb->skb_mstamp = tp->tcp_mstamp;
+ skb->skb_mstamp_ns = tp->tcp_clock_cache;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index a5995bb2eaca..95df7f7f6328 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -6,7 +6,7 @@
*
*/
-#include<linux/module.h>
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
@@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
return NULL;
}
-static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
-{
- struct tcp_ulp_ops *e;
-
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
- if (e->uid == ulp)
- return e;
- }
-
- return NULL;
-}
-
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
{
const struct tcp_ulp_ops *ulp = NULL;
@@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
return ulp;
}
-static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
-{
- const struct tcp_ulp_ops *ulp;
-
- rcu_read_lock();
- ulp = tcp_ulp_find_id(uid);
- if (!ulp || !try_module_get(ulp->owner))
- ulp = NULL;
- rcu_read_unlock();
- return ulp;
-}
-
/* Attach new upper layer protocol to the list
* of available protocols.
*/
@@ -123,6 +99,10 @@ void tcp_cleanup_ulp(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ /* No sock_owned_by_me() check here as at the time the
+ * stack calls this function, the socket is dead and
+ * about to be destroyed.
+ */
if (!icsk->icsk_ulp_ops)
return;
@@ -133,54 +113,35 @@ void tcp_cleanup_ulp(struct sock *sk)
icsk->icsk_ulp_ops = NULL;
}
-/* Change upper layer protocol for socket */
-int tcp_set_ulp(struct sock *sk, const char *name)
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_ulp_ops *ulp_ops;
- int err = 0;
+ int err;
+ err = -EEXIST;
if (icsk->icsk_ulp_ops)
- return -EEXIST;
-
- ulp_ops = __tcp_ulp_find_autoload(name);
- if (!ulp_ops)
- return -ENOENT;
-
- if (!ulp_ops->user_visible) {
- module_put(ulp_ops->owner);
- return -ENOENT;
- }
+ goto out_err;
err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
+ if (err)
+ goto out_err;
icsk->icsk_ulp_ops = ulp_ops;
return 0;
+out_err:
+ module_put(ulp_ops->owner);
+ return err;
}
-int tcp_set_ulp_id(struct sock *sk, int ulp)
+int tcp_set_ulp(struct sock *sk, const char *name)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_ulp_ops *ulp_ops;
- int err;
- if (icsk->icsk_ulp_ops)
- return -EEXIST;
+ sock_owned_by_me(sk);
- ulp_ops = __tcp_ulp_lookup(ulp);
+ ulp_ops = __tcp_ulp_find_autoload(name);
if (!ulp_ops)
return -ENOENT;
- err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
-
- icsk->icsk_ulp_ops = ulp_ops;
- return 0;
+ return __tcp_set_ulp(sk, ulp_ops);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d69dd6fa7e8..1976fddb9e00 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -81,7 +81,7 @@
#include <linux/uaccess.h>
#include <asm/ioctls.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/types.h>
@@ -609,8 +609,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, 0,
- udptable, NULL);
+ iph->saddr, uh->source, skb->dev->ifindex,
+ inet_sdif(skb), udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
@@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
@@ -1627,7 +1627,7 @@ busy_check:
*err = error;
return NULL;
}
-EXPORT_SYMBOL_GPL(__skb_recv_udp);
+EXPORT_SYMBOL(__skb_recv_udp);
/*
* This should be easy, if there is something there we
@@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
static_branch_enable(&udp_encap_needed_key);
@@ -2120,8 +2120,24 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
/* Note, we are only interested in != 0 or == 0, thus the
* force to int.
*/
- return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
+ if (err)
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+ /* If SW calculated the value, we know it's bad */
+ if (skb->csum_complete_sw)
+ return 1;
+
+ /* HW says the value is bad. Let's validate that.
+ * skb->csum is no longer the full packet checksum,
+ * so don't treat it as such.
+ */
+ skb_checksum_complete_unset(skb);
+ }
+
+ return 0;
}
/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d9ad986c7b2c..5cbb9be05295 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -42,6 +42,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
rcu_read_lock();
if (req->sdiag_family == AF_INET)
+ /* src and dst are swapped for historical reasons */
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0c0522b79b43..802f2bc00d69 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
{
struct udphdr *uh = udp_gro_udphdr(skb);
- if (unlikely(!uh))
+ if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index bcfc00e88756..f8de2482a529 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 3d36644890bb..1ad2c2c4e250 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header = skb->transport_header;
}
ip_hdr(skb)->tot_len = htons(skb->len + ihl);
- if (!xo || !(xo->flags & XFRM_GRO))
- skb_reset_transport_header(skb);
+ skb_reset_transport_header(skb);
return 0;
}