aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/net/ipv4
diff options
context:
space:
mode:
author: Pablo Neira Ayuso <pablo@netfilter.org> 2016-09-25 23:23:57 +0200
committer: Pablo Neira Ayuso <pablo@netfilter.org> 2016-09-25 23:34:19 +0200
commit: f20fbc0717f9f007c94b2641134b19228d0ce9ed (patch)
tree: 1404248ebbec552a3fb7928b75322b65d74de1bd /net/ipv4
parent: netfilter: nf_log: get rid of XT_LOG_* macros (diff)
parent: Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (diff)
download: wireguard-linux-f20fbc0717f9f007c94b2641134b19228d0ce9ed.tar.xz
download: wireguard-linux-f20fbc0717f9f007c94b2641134b19228d0ce9ed.zip
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Conflicts:
  net/netfilter/core.c
  net/netfilter/nf_tables_netdev.c

Resolve two conflicts before pull request for David's net-next tree:

1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant ip_hdr assignment") from the net tree and commit ddc8b6027ad0 ("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()").

2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and Aaron Conole's patches to replace list_head with single linked list.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig18
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_rules.c3
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c10
-rw-r--r--net/ipv4/gre_offload.c6
-rw-r--r--net/ipv4/inet_diag.c49
-rw-r--r--net/ipv4/ip_gre.c23
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c13
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv4/ip_tunnel.c76
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ip_vti.c15
-rw-r--r--net/ipv4/ipip.c35
-rw-r--r--net/ipv4/ipmr.c7
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c34
-rw-r--r--net/ipv4/tcp.c61
-rw-r--r--net/ipv4/tcp_bbr.c896
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c495
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_output.c106
-rw-r--r--net/ipv4/tcp_rate.c186
-rw-r--r--net/ipv4/tcp_timer.c1
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c6
-rw-r--r--net/ipv4/udp_diag.c14
-rw-r--r--net/ipv4/udp_offload.c6
-rw-r--r--net/ipv4/xfrm4_policy.c4
40 files changed, 1779 insertions, 395 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e94b47be0019..1effc986739e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, encap;
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
@@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
- tot_len = skb_shinfo(skb)->gso_size +
- SKB_GSO_CB(skb)->data_offset +
- skb->head - (unsigned char *)iph;
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 415e117967c7..062a67ca9a21 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table {
};
static int __devinet_sysctl_register(struct net *net, char *dev_name,
- struct ipv4_devconf *p)
+ int ifindex, struct ipv4_devconf *p)
{
int i;
struct devinet_sysctl_table *t;
@@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
goto free;
p->sysctl = t;
+
+ inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
@@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev)
if (err)
return err;
err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
- &idev->cnf);
+ idev->dev->ifindex, &idev->cnf);
if (err)
neigh_sysctl_unregister(idev->arp_parms);
return err;
@@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net)
}
#ifdef CONFIG_SYSCTL
- err = __devinet_sysctl_register(net, "all", all);
+ err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
if (err < 0)
goto err_reg_all;
- err = __devinet_sysctl_register(net, "default", dflt);
+ err = __devinet_sysctl_register(net, "default",
+ NETCONFA_IFINDEX_DEFAULT, dflt);
if (err < 0)
goto err_reg_dflt;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 317c31939732..4e56a4c20a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -503,6 +503,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
+ cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
struct in_ifaddr *ifa;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
@@ -1021,7 +1022,7 @@ no_promotions:
* First of all, we scan fib_info list searching
* for stray nexthop entries, then ignite fib_flush.
*/
- if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ if (fib_sync_down_addr(dev, ifa->ifa_local))
fib_flush(dev_net(dev));
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..770bebed6b28 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
};
int err;
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
#ifdef CONFIG_IP_ROUTE_CLASSID
if (arg.rule)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8066ccc48a17..388d3e21629b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
fi->fib_type = cfg->fc_type;
+ fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
change_nexthops(fi) {
@@ -1337,18 +1338,21 @@ nla_put_failure:
* referring to it.
* - device went down -> we must shutdown all nexthops going via it.
*/
-int fib_sync_down_addr(struct net *net, __be32 local)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct net *net = dev_net(dev);
+ int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
return 0;
hlist_for_each_entry(fi, head, fib_lhash) {
- if (!net_eq(fi->fib_net, net))
+ if (!net_eq(fi->fib_net, net) ||
+ fi->fib_tb_id != tb_id)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..241f27bbd7ad 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1081,7 +1081,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
- unsigned int nlflags = 0;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1126,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
@@ -1151,6 +1153,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
@@ -1191,7 +1194,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
goto succeeded;
}
@@ -1203,7 +1206,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
- nlflags = NLM_F_APPEND;
+ nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
@@ -1211,6 +1214,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..96e0efecefa6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo;
+ bool need_csum, ufo, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
gre_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abfbe492ebfe..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void)
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns)
+ struct user_namespace *user_ns,
+ bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ goto errout;
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
@@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
- user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+ portid, seq, nlmsg_flags, unlh, net_admin);
}
static int inet_twsk_diag_fill(struct sock *sk,
@@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
+ struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
@@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
inet_diag_msg_common_fill(r, sk);
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+ r->idiag_retrans = reqsk->num_retrans;
BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
offsetof(struct sock, sk_cookie));
@@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = 0;
r->idiag_inode = 0;
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark))
+ return -EMSGSIZE;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
if (sk->sk_state == TCP_NEW_SYN_RECV)
return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+ const struct nlattr *bc,
+ bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+ net_admin);
}
static void twsk_build_assert(void)
@@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
cb->args[3] > 0)
goto next_listen;
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r,
+ bc, net_admin) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -948,7 +965,7 @@ skip_listen_ht:
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh);
+ cb->nlh, net_admin);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
}
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
__be64 tun_id;
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
- tun_id = key_to_tunnel_id(tpi->key);
+ tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ tunnel_id_to_key32(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4b351af3e67b..d6feabb03516 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ struct net_device *dev = skb->dev;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
@@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 65569274efb8..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
@@ -490,7 +498,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
skb_copy_secmark(to, from);
@@ -1574,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
}
oif = arg->bound_dev_if;
- if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
- oif = skb->skb_iif;
+ oif = oif ? : skb->skb_iif;
flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..af4919792b6a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->ttl = val;
break;
case IP_TOS:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ u32 headroom = sizeof(struct iphdr);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ const struct iphdr *inner_iph;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ __be16 df = 0;
+ u8 tos, ttl;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto tx_error;
+ key = &tun_info->key;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ tos = key->tos;
+ if (tos == 1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+ init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+ RT_TOS(tos), tunnel->parms.link);
+ if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_error;
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = key->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+ if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ df = htons(IP_DF);
+ else if (skb->protocol == htons(ETH_P_IP))
+ df = inner_iph->frag_off & htons(IP_DF);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (headroom > dev->needed_headroom)
+ dev->needed_headroom = headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+ key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+tx_error:
+ dev->stats.tx_errors++;
+ goto kfree;
+tx_dropped:
+ dev->stats.tx_dropped++;
+kfree:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..777bc1883870 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_scrub_packet(skb, xnet);
- skb_clear_hash(skb);
+ skb_clear_hash_if_not_l4(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cc701fa70b12..5d7944f394d9 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
u32 orig_mark = skb->mark;
int ret;
@@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(tunnel->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..c9392589c415 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ if (tunnel->collect_md) {
+ tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+ if (!tun_dst)
+ return 0;
+ }
+ return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
- ip_tunnel_xmit(skb, dev, tiph, ipproto);
+ if (tunnel->collect_md)
+ ip_md_tunnel_xmit(skb, dev, ipproto);
+ else
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm *parms, bool *collect_md)
{
memset(parms, 0, sizeof(*parms));
parms->iph.version = 4;
parms->iph.protocol = IPPROTO_IPIP;
parms->iph.ihl = 5;
+ *collect_md = false;
if (!data)
return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ *collect_md = true;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
if (ipip_netlink_encap_parms(data, &ipencap)) {
- struct ip_tunnel *t = netdev_priv(dev);
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &t->collect_md);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ bool collect_md;
if (ipip_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &collect_md);
+ if (collect_md)
+ return -EINVAL;
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
tunnel->encap.flags))
goto nla_put_failure;
+ if (tunnel->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 26253328d227..a87bcd2d4a94 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
+ unsigned long lastuse;
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
@@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
- nla_put_u64_64bit(skb, RTA_EXPIRES,
- jiffies_to_clock_t(c->mfc_un.res.lastuse),
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
return -EMSGSIZE;
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 2375b0a8be46..30493beb611a 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
__be32 saddr, daddr;
u_int8_t tos;
const struct iphdr *iph;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv,
tos = iph->tos;
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
}
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index c24f41c816b3..2c2553b9026c 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
.eval = nft_reject_ipv4_eval,
.init = nft_reject_init,
.dump = nft_reject_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 438f50c1a676..90a85c955872 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, &fl4);
- if (err < 0)
- goto done;
- }
-
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3e992783c1d0..654a9af20136 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
- u32 delta = 0;
+ u32 new, delta = 0;
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, p_id) - segs;
+ /* Do not use atomic_add_return() as it makes UBSAN unhappy */
+ do {
+ old = (u32)atomic_read(p_id);
+ new = old + delta + segs;
+ } while (atomic_cmpxchg(p_id, old, new) != old);
+
+ return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -1831,7 +1837,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
+ fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2018,7 +2024,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
return ERR_PTR(-EINVAL);
if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl4->saddr) &&
+ !(dev_out->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl4->daddr))
@@ -2148,7 +2156,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- int master_idx;
int orig_oif;
int err = -ENETUNREACH;
@@ -2158,9 +2165,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
orig_oif = fl4->flowi4_oif;
- master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
- if (master_idx)
- fl4->flowi4_oif = master_idx;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2244,10 +2248,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
-
- rth = l3mdev_get_rtable(dev_out, fl4);
- if (rth)
- goto out;
}
if (!fl4->daddr) {
@@ -2265,8 +2265,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif &&
- !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+ if (fl4->flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2302,7 +2301,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
else
fl4->saddr = fl4->daddr;
}
- dev_out = net->loopback_dev;
+
+ /* L3 master device is the loopback for that domain */
+ dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2577,9 +2578,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
- if (netif_index_is_l3_master(net, fl4.flowi4_oif))
- fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
-
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 77311a92275c..f253e5019d22 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -380,14 +380,14 @@ void tcp_init_sock(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- __skb_queue_head_init(&tp->out_of_order_queue);
+ tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- tp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
*/
tp->snd_cwnd = TCP_INIT_CWND;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
@@ -1014,23 +1017,40 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
+
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
}
EXPORT_SYMBOL(tcp_sendpage);
-static inline int select_size(const struct sock *sk, bool sg)
+/* Pick how many payload bytes should live in the linear part (skb->head).
+ *
+ * Only the first skb in the write queue gets a linear area: very small
+ * frames are not worth a page frag. Every later skb gets none, because
+ * keeping payload out of skb->head allows better SACK shifting in
+ * tcp_shift_skb_data(), reducing sack/rack overhead, since the write
+ * queue then holds fewer skbs.
+ * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
+ * It also speeds up tso_fragment(), since it won't fall back to
+ * tcp_fragment().
+ */
+static int linear_payload_sz(bool first_skb)
+{
+	return first_skb ? SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER) : 0;
+}
+
+static int select_size(const struct sock *sk, bool sg, bool first_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
if (sk_can_gso(sk)) {
- /* Small frames wont use a full page:
- * Payload will immediately follow tcp header.
- */
- tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ tmp = linear_payload_sz(first_skb);
} else {
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
@@ -1101,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
@@ -1161,6 +1183,8 @@ restart:
}
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+ bool first_skb;
+
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1172,10 +1196,11 @@ new_segment:
process_backlog = false;
goto restart;
}
+ first_skb = skb_queue_empty(&sk->sk_write_queue);
skb = sk_stream_alloc_skb(sk,
- select_size(sk, sg),
+ select_size(sk, sg, first_skb),
sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ first_skb);
if (!skb)
goto wait_for_memory;
@@ -2243,7 +2268,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0;
@@ -2687,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp;
+ u32 now = tcp_time_stamp, intv;
unsigned int start;
int notsent_bytes;
u64 rate64;
@@ -2777,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
+
+ info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+ rate = READ_ONCE(tp->rate_delivered);
+ intv = READ_ONCE(tp->rate_interval_us);
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ put_unaligned(rate64, &info->tcpi_delivery_rate);
+ }
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3244,11 +3278,12 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
- unsigned long limit;
int max_rshare, max_wshare, cnt;
+ unsigned long limit;
unsigned int i;
- sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..0ea66c2c9344
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,896 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur unnecessary
+ * high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ *
+ * Bandwidth values in this file are therefore fixed-point numbers: a stored
+ * value of BW_UNIT stands for exactly 1 packet per microsecond.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE) /* 1.0 pkt/uSec in BW_SCALE fixed-point */
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE) /* a gain of 1.0 in BBR_SCALE fixed-point */
+
+/* BBR has the following modes for deciding how fast to send.
+ * The current mode is stored in the 3-bit "mode" bitfield of struct bbr, so
+ * every enumerator value has to stay below 8.
+ */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+};
+
+/* BBR congestion control block.
+ * State for one socket; the functions below obtain it with inet_csk_ca(sk).
+ * Each of the two bitfield groups adds up to exactly 32 bits, so do not
+ * reorder or resize individual fields without rebalancing the "unused" bits.
+ */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window; ~0U means
+ * that no valid RTT sample has been seen yet
+ */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round; a new
+ * round starts once rs->prior_delivered
+ * is no longer before this value
+ */
+ struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ restore_cwnd:1, /* decided to revert cwnd to old value */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ tso_segs_goal:7, /* segments we want in each skb we send
+ * (clamped to 0x7F to fit the 7 bits)
+ */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:5, /* spare bits padding this group to 32 */
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp
+ * (its stamp_jiffies member, i.e. jiffies)
+ */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate,
+ * in BBR_UNIT fixed-point
+ */
+ cwnd_gain:10, /* current gain for setting cwnd, in BBR_UNIT
+ * fixed-point
+ */
+ full_bw_cnt:3, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ unused_b:6; /* spare bits padding this group to 32 */
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle; the cycle
+ * index is advanced with "& (CYCLE_LEN - 1)", so this
+ * has to remain a power of two
+ */
+
+/* Window length of bw filter (in rounds), i.e. CYCLE_LEN + 2 = 10 rounds: */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec). It is shifted right by 3
+ * (bits -> bytes) before being compared against sk->sk_pacing_rate:
+ */
+static const int bbr_min_tso_rate = 1200000;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would. With BBR_UNIT == 256 this evaluates to 739,
+ * i.e. roughly 2.89 in BBR_UNIT fixed-point:
+ */
+static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw.
+ * The table has exactly CYCLE_LEN entries, each in BBR_UNIT fixed-point:
+ */
+static const int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N (= bbr_cycle_rand) phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio >= 50/256 (~20%, expressed in BBR_UNIT fixed-point),
+ * interval is "lossy" and we may be policed:
+ */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If the bw of 2 intervals differs by <= 1/8 of the earlier interval's bw,
+ * their bw is "consistent":
+ */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent". The
+ * value is stored in bytes/sec (4000 bits / 8) because it is compared with
+ * the result of bbr_rate_bytes_per_sec():
+ */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Tell whether STARTUP is estimated to have filled the pipe, i.e. whether
+ * full_bw_cnt has climbed to the bbr_full_bw_cnt threshold.
+ */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *ca = inet_csk_ca(sk);
+
+	if (ca->full_bw_cnt < bbr_full_bw_cnt)
+		return false;
+	return true;
+}
+
+/* Fetch the current value of the windowed-max bandwidth filter.
+ * The result is in pkts/uS << BW_SCALE.
+ */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 max_bw = minmax_get(&ca->bw);
+
+	return max_bw;
+}
+
+/* Bandwidth estimate for the path, in pkts/uS << BW_SCALE: the long-term
+ * rate lt_bw while lt_use_bw is set, the windowed max otherwise.
+ */
+static u32 bbr_bw(const struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (ca->lt_use_bw)
+		return ca->lt_bw;
+	return bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ *
+ * sk:   socket whose mss_cache is fed to tcp_mss_to_mtu() for the packet size
+ * rate: rate in pkts/uS << BW_SCALE
+ * gain: multiplier in BBR_UNIT fixed-point (BBR_UNIT means 1.0)
+ *
+ * Do not reorder the statements below: each shift is placed to make room
+ * for the multiplication that follows it.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); /* scale by pkt size */
+ rate *= gain; /* apply the gain (still scaled by BBR_UNIT) */
+ rate >>= BBR_SCALE; /* remove the gain's fixed-point scaling */
+ rate *= USEC_PER_SEC; /* per-usec -> per-sec */
+ return rate >> BW_SCALE; /* remove the bandwidth fixed-point scaling */
+}
+
+/* Set the socket pacing rate from a bandwidth estimate scaled by a gain.
+ *
+ * To push the network toward shorter queues while keeping utilization high
+ * and latency low, the average pacing rate is meant to sit slightly (~1%)
+ * below the estimated bandwidth; this is an important part of the design.
+ * In this implementation the margin comes implicitly from leaving link-layer
+ * headers out of the packet size used to compute the rate.
+ *
+ * The result is capped at sk->sk_max_pacing_rate. While in BBR_STARTUP the
+ * pacing rate is only ever raised, never lowered.
+ */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u64 bytes_per_sec = bbr_rate_bytes_per_sec(sk, bw, gain);
+
+	bytes_per_sec = min_t(u64, bytes_per_sec, sk->sk_max_pacing_rate);
+
+	/* STARTUP never lowers an already established pacing rate. */
+	if (ca->mode == BBR_STARTUP && bytes_per_sec <= sk->sk_pacing_rate)
+		return;
+
+	sk->sk_pacing_rate = bytes_per_sec;
+}
+
+/* Report how many segments we would like in each skb we send; 0 means
+ * "use the default".
+ */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 goal = ca->tso_segs_goal;
+
+	return goal;
+}
+
+/* Recompute tso_segs_goal from tcp_tso_autosize(), asking for at least one
+ * segment when the pacing rate is below bbr_min_tso_rate (converted from
+ * bits to bytes) and at least two otherwise. The result is clamped to 0x7F
+ * so that it fits the 7-bit tso_segs_goal field.
+ */
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 min_segs = 2;
+	u32 segs;
+
+	if (sk->sk_pacing_rate < (bbr_min_tso_rate >> 3))
+		min_segs = 1;
+
+	segs = tcp_tso_autosize(sk, tp->mss_cache, min_segs);
+	ca->tso_segs_goal = min(segs, 0x7FU);
+}
+
+/* Remember the "last known good" cwnd in prior_cwnd so that it can be put
+ * back after losses or after PROBE_RTT.
+ */
+static void bbr_save_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	bool cwnd_was_cut = ca->prev_ca_state >= TCP_CA_Recovery ||
+			    ca->mode == BBR_PROBE_RTT;
+
+	if (cwnd_was_cut) {
+		/* Loss recovery or BBR_PROBE_RTT have temporarily cut cwnd,
+		 * so never let the saved value shrink.
+		 */
+		ca->prior_cwnd = max(ca->prior_cwnd, tp->snd_cwnd);
+	} else {
+		/* The current cwnd is good enough to save as is. */
+		ca->prior_cwnd = tp->snd_cwnd;
+	}
+}
+
+/* React to congestion-control events. Only CA_EVENT_TX_START on an
+ * app-limited connection is acted upon: it marks an idle restart.
+ */
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (event != CA_EVENT_TX_START || !tp->app_limited)
+		return;
+
+	ca->idle_restart = 1;
+
+	/* We are restarting from idle and are app-limited, so no extra speed
+	 * is needed: pace at the estimated bw to avoid pointless buffer
+	 * overflows.
+	 */
+	if (ca->mode == BBR_PROBE_BW)
+		bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ *
+ * sk:   socket whose BBR state supplies min_rtt_us and tso_segs_goal
+ * bw:   bandwidth in pkts/uS << BW_SCALE
+ * gain: multiplier in BBR_UNIT fixed-point
+ *
+ * Returns the target cwnd in packets (always even), or TCP_INIT_CWND while
+ * no RTT sample is available.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd */
+
+ w = (u64)bw * bbr->min_rtt_us; /* BDP in pkts << BW_SCALE, 64-bit product */
+
+ /* Apply a gain to the given value, then remove the BW_SCALE shift.
+ * Adding BW_UNIT - 1 before dividing rounds the result up.
+ */
+ cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr->tso_segs_goal;
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ return cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * sk:       socket being processed
+ * rs:       rate sample for this ACK; only rs->losses is read here
+ * acked:    number of packets newly acked
+ * new_cwnd: output, always written: the cwnd the caller should continue from
+ *
+ * Returns true when packet conservation is in force (the caller then uses
+ * *new_cwnd as is), false otherwise. As a side effect this updates
+ * prev_ca_state, packet_conservation, restore_cwnd and, when recovery
+ * starts, next_rtt_delivered.
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tp->snd_cwnd;
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1); /* signed max: floor at 1 */
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ bbr->restore_cwnd = 1;
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state; /* remembered for the next ACK's comparison */
+
+ if (bbr->restore_cwnd) {
+ /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->restore_cwnd = 0; /* one-shot request: clear once applied */
+ }
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Move cwnd toward the target computed from bw and gain: slow-start upward
+ * when the bw estimate is growing or packet loss has pulled us below the
+ * target, and snap down to the target when we are above it. Nothing is done
+ * for an ACK that acked no packets.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+			 u32 acked, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 new_cwnd = 0;
+	u32 target;
+
+	if (acked == 0)
+		return;
+
+	/* When packet conservation is in force its cwnd is used untouched. */
+	if (!bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &new_cwnd)) {
+		target = bbr_target_cwnd(sk, bw, gain);
+
+		if (bbr_full_bw_reached(sk)) {
+			/* Only cut cwnd once we have filled the pipe. */
+			new_cwnd = min(new_cwnd + acked, target);
+		} else if (new_cwnd < target ||
+			   tp->delivered < TCP_INIT_CWND) {
+			/* Below target: slow start toward it. */
+			new_cwnd += acked;
+		}
+		new_cwnd = max(new_cwnd, bbr_cwnd_min_target);
+	}
+
+	/* Apply the global cap. */
+	tp->snd_cwnd = min(new_cwnd, tp->snd_cwnd_clamp);
+
+	/* In PROBE_RTT, drain the queue and refresh min_rtt. */
+	if (ca->mode == BBR_PROBE_RTT)
+		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* Decide whether the current gain-cycle phase should end, based on elapsed
+ * time and/or on having hit the phase's in-flight target.
+ */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 prior_inflight, max_bw;
+	bool phase_elapsed;
+
+	/* Has this phase lasted longer than min_rtt? */
+	phase_elapsed = skb_mstamp_us_delta(&tp->delivered_mstamp,
+					    &ca->cycle_mstamp) >
+			ca->min_rtt_us;
+
+	/* A pacing_gain of 1.0 paces at the estimated bw to try to fully use
+	 * the pipe without increasing the queue: wall clock time decides.
+	 */
+	if (ca->pacing_gain == BBR_UNIT)
+		return phase_elapsed;
+
+	prior_inflight = rs->prior_in_flight;	/* in flight before this ACK */
+	max_bw = bbr_max_bw(sk);
+
+	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+	 * probing didn't find more bw. Once inflight falls to match BDP we
+	 * estimate the queue is drained; persisting would underutilize the
+	 * pipe.
+	 */
+	if (ca->pacing_gain < BBR_UNIT)
+		return phase_elapsed ||
+		       prior_inflight <= bbr_target_cwnd(sk, max_bw, BBR_UNIT);
+
+	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+	 * small (e.g. on a LAN). We do not persist if packets are lost, since
+	 * a path with small buffers may not hold that much.
+	 */
+	if (!phase_elapsed)
+		return false;
+	if (rs->losses)		/* perhaps pacing_gain*BDP won't fit */
+		return true;
+	return prior_inflight >= bbr_target_cwnd(sk, max_bw, ca->pacing_gain);
+}
+
+/* Step to the next phase of the pacing-gain cycle (wrapping around after
+ * CYCLE_LEN phases), stamp its start time and load its pacing gain.
+ */
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 next_idx = (ca->cycle_idx + 1) & (CYCLE_LEN - 1);
+
+	ca->cycle_mstamp = tp->delivered_mstamp;
+	ca->cycle_idx = next_idx;
+	ca->pacing_gain = bbr_pacing_gain[next_idx];
+}
+
+/* Gain cycling: rotate the pacing gain so that flows converge to a fair
+ * share of the available bw. Cycling only runs in BBR_PROBE_BW and is
+ * suspended while the long-term bw estimate is in use.
+ */
+static void bbr_update_cycle_phase(struct sock *sk,
+				   const struct rate_sample *rs)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (ca->mode != BBR_PROBE_BW || ca->lt_use_bw)
+		return;
+
+	if (bbr_is_next_cycle_phase(sk, rs))
+		bbr_advance_cycle_phase(sk);
+}
+
+/* Enter BBR_STARTUP: both the pacing gain and the cwnd gain are set to
+ * bbr_high_gain.
+ */
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+
+	ca->cwnd_gain = bbr_high_gain;
+	ca->pacing_gain = bbr_high_gain;
+	ca->mode = BBR_STARTUP;
+}
+
+/* Enter BBR_PROBE_BW with the steady-state cwnd gain and start gain cycling
+ * from a randomized position. The pacing gain written here is a placeholder:
+ * bbr_advance_cycle_phase() replaces it with the chosen phase's gain.
+ */
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 start_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+
+	ca->mode = BBR_PROBE_BW;
+	ca->cwnd_gain = bbr_cwnd_gain;
+	ca->pacing_gain = BBR_UNIT;
+	ca->cycle_idx = start_idx;
+
+	/* Flip to the next phase of the gain cycle. */
+	bbr_advance_cycle_phase(sk);
+}
+
+/* Pick the mode to (re)enter: PROBE_BW once the pipe is estimated to be
+ * full, STARTUP as long as it is not.
+ */
+static void bbr_reset_mode(struct sock *sk)
+{
+	if (bbr_full_bw_reached(sk))
+		bbr_reset_probe_bw_mode(sk);
+	else
+		bbr_reset_startup_mode(sk);
+}
+
+/* Open a fresh long-term sampling interval: zero its round counter and
+ * snapshot the connection's delivered and lost counters together with the
+ * jiffies stamp of the latest delivery.
+ */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+
+	ca->lt_rtt_cnt = 0;
+	ca->lt_last_lost = tp->lost;
+	ca->lt_last_delivered = tp->delivered;
+	ca->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+}
+
+/* Throw away all long-term bandwidth state: stop sampling, stop using
+ * lt_bw, forget its value and start a new sampling interval.
+ */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+
+	ca->lt_is_sampling = false;
+	ca->lt_use_bw = 0;
+	ca->lt_bw = 0;
+
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed.
+ *
+ * sk: socket being processed
+ * bw: average delivery rate of the interval that just ended,
+ *     in pkts/uS << BW_SCALE
+ *
+ * If a previous interval's rate (lt_bw) exists and the new rate is
+ * "consistent" with it, either within a bbr_lt_bw_ratio fraction of lt_bw
+ * or within bbr_lt_bw_diff bytes/sec, we conclude we are being policed:
+ * lt_bw becomes the average of the two rates and lt_use_bw is set.
+ * Otherwise the new rate is remembered and another interval is started.
+ */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 diff;
+
+	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
+		/* Is new bw close to the lt_bw from the previous interval? */
+		diff = abs(bw - bbr->lt_bw);
+		/* Compare in 64 bits: both products can exceed 32 bits for
+		 * large bandwidth values, and a wrapped product could make
+		 * two very different rates look "consistent".
+		 */
+		if (((u64)diff * BBR_UNIT <=
+		     (u64)bbr_lt_bw_ratio * bbr->lt_bw) ||
+		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+		     bbr_lt_bw_diff)) {
+			/* All criteria are met; estimate we're policed. */
+			/* avg 2 intvls; 64-bit sum so that it cannot wrap */
+			bbr->lt_bw = ((u64)bw + bbr->lt_bw) >> 1;
+			bbr->lt_use_bw = 1;
+			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
+			bbr->lt_rtt_cnt = 0;
+			return;
+		}
+	}
+	bbr->lt_bw = bw;
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ *
+ * sk: socket being processed
+ * rs: rate sample for the current ACK; rs->losses and rs->is_app_limited
+ *     are read here
+ *
+ * Called from bbr_update_bw() after round_start has been updated for this
+ * ACK. The early returns below are ordered on purpose; each one leaves the
+ * sampling state either untouched or fully reset.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ s32 t; /* interval length: first in jiffies, then converted to usecs */
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait.
+ * The threshold is in BBR_UNIT fixed-point, hence the shift of "lost".
+ */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+ if (t < 1)
+ return; /* interval is less than one jiffy, so wait */
+ t = jiffies_to_usecs(t);
+ /* Interval long enough for jiffies_to_usecs() to return a bogus 0?
+ * Because t is signed, a converted value that does not fit in an s32
+ * is caught by this test as well.
+ */
+ if (t < 1) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ bw = (u64)delivered * BW_UNIT; /* scale first to keep precision */
+ do_div(bw, t); /* bw is now in pkts/uS << BW_SCALE */
+ bbr_lt_bw_interval_done(sk, bw); /* note: bw is passed as a u32 here */
+}
+
+/* Turn one rate sample into a bandwidth sample and offer it to the max-bw
+ * filter. On the way, detect the start of a new packet-timed round trip and
+ * run the long-term bandwidth sampler on the same rate sample.
+ */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 sample;
+
+	bbr->round_start = 0;
+	if (rs->delivered < 0 || rs->interval_us <= 0)
+		return; /* nothing measurable in this sample */
+
+	/* A round trip has elapsed once the sample's prior_delivered is no
+	 * longer before the delivered count recorded at the last round start.
+	 */
+	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		bbr->packet_conservation = 0;
+		bbr->round_start = 1;
+		bbr->rtt_cnt++;
+		bbr->next_rtt_delivered = tp->delivered;
+	}
+
+	bbr_lt_bw_sampling(sk, rs);
+
+	/* The sample is delivered packets per usec of interval; this is a
+	 * lower bound on the bottleneck bandwidth. The ratio is far below 1
+	 * for most connections, so delivered is scaled by BW_UNIT before the
+	 * division.
+	 */
+	sample = (u64)rs->delivered * BW_UNIT;
+	do_div(sample, rs->interval_us);
+
+	/* An application-limited sample tends to reflect how little the
+	 * application wrote rather than what the network can carry, and
+	 * letting it in could needlessly drag the estimate down. Ignore such
+	 * a sample unless it is at least as high as the current max, so that
+	 * we keep sending at the last measured network rate for as long as
+	 * the app-limited phase lasts.
+	 */
+	if (rs->is_app_limited && sample < bbr_max_bw(sk))
+		return;
+
+	/* Incorporate the sample into the windowed max bw filter. */
+	minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, sample);
+}
+
+/* Decide whether STARTUP has filled the pipe by watching the delivery rate:
+ * the pipe is estimated full when the max bw has failed to grow by at least
+ * bbr_full_bw_thresh (25%) over bbr_full_bw_cnt (3) non-app-limited rounds.
+ * Three rounds are needed because: 1: rwin autotuning grows the rwin, 2: we
+ * fill the higher rwin, 3: we get higher delivery rate samples. Transient
+ * cross-traffic or radio noise may also go away in that time. CUBIC Hystart
+ * has a similar design goal, but relies on delay and inter-ACK spacing
+ * rather than bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 growth_thresh;
+
+	if (bbr_full_bw_reached(sk))
+		return;
+	/* Only evaluate once per round, and never on app-limited samples. */
+	if (!bbr->round_start || rs->is_app_limited)
+		return;
+
+	growth_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+	if (bbr_max_bw(sk) < growth_thresh) {
+		/* bw did not grow enough this round */
+		bbr->full_bw_cnt++;
+	} else {
+		/* still growing: record the new baseline and start over */
+		bbr->full_bw = bbr_max_bw(sk);
+		bbr->full_bw_cnt = 0;
+	}
+}
+
+/* Once STARTUP has probably filled the pipe, switch to DRAIN; once in DRAIN
+ * and the packets in flight are down to the target cwnd, move on to PROBE_BW.
+ */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		bbr->cwnd_gain = bbr_high_gain;		/* maintain cwnd */
+		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
+		bbr->mode = BBR_DRAIN;			/* drain queue we created */
+	}
+
+	/* Checked right away as well: in-flight may already be small. */
+	if (bbr->mode != BBR_DRAIN)
+		return;
+
+	if (tcp_packets_in_flight(tcp_sk(sk)) <=
+	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+		bbr_reset_probe_bw_mode(sk);	/* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_time_stamp,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ /* Accept a valid (non-negative) RTT sample if it does not exceed the
+ * current minimum, or unconditionally once the window has expired.
+ */
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ }
+
+ /* Enter PROBE_RTT when the filter expired, unless the mode is disabled
+ * (bbr_probe_rtt_mode_ms <= 0), idle_restart is set, or we are already
+ * in PROBE_RTT.
+ */
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ /* Flight size is low enough: arm the timer and start the
+ * round that must both complete before leaving PROBE_RTT.
+ */
+ bbr->probe_rtt_done_stamp = tcp_time_stamp +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ /* Leave once a full round has been seen and the timer
+ * has passed; min_rtt_stamp is refreshed on exit.
+ */
+ if (bbr->probe_rtt_round_done &&
+ after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ bbr->restore_cwnd = 1; /* snap to prior_cwnd */
+ bbr_reset_mode(sk);
+ }
+ }
+ }
+ bbr->idle_restart = 0;
+}
+
+/* Update BBR's model of the path from one rate sample. The helpers run in a
+ * fixed order: bbr_update_bw() goes first because it recomputes
+ * bbr->round_start, which bbr_check_full_bw_reached() and
+ * bbr_update_min_rtt() read afterwards.
+ */
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+/* The .cong_control hook: refresh the model from the rate sample, then derive
+ * the pacing rate, the TSO segment goal and the cwnd from the resulting
+ * bandwidth estimate and the current gains.
+ */
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *bbr;
+	u32 est_bw;
+
+	bbr_update_model(sk, rs);
+
+	bbr = inet_csk_ca(sk);
+	est_bw = bbr_bw(sk);
+
+	bbr_set_pacing_rate(sk, est_bw, bbr->pacing_gain);
+	bbr_set_tso_segs_goal(sk);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, est_bw, bbr->cwnd_gain);
+}
+
+/* The .init hook: set up BBR's per-connection state, give the socket an
+ * initial pacing rate, and finish by resetting long-term bw sampling and
+ * entering startup mode.
+ */
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->prior_cwnd = 0;
+ bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp); /* seed from tcp_min_rtt() */
+ bbr->min_rtt_stamp = tcp_time_stamp;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+ bw = (u64)tp->snd_cwnd * BW_UNIT;
+ /* Divisor is srtt_us >> 3, or USEC_PER_MSEC when that is zero. */
+ do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+ sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
+ bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+ bbr->restore_cwnd = 0;
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp.v64 = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+}
+
+/* The .sndbuf_expand hook: returns the send-buffer provisioning factor. */
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+	const u32 cwnd_multiple = 3;
+
+	return cwnd_multiple;
+}
+
+/* The .undo_cwnd hook: report the current cwnd unchanged. In theory BBR has
+ * no need to undo the cwnd, because it does not always reduce cwnd on losses
+ * (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->snd_cwnd;
+}
+
+/* The .ssthresh hook, called as loss recovery begins: remember the cwnd so it
+ * is available when we exit or undo recovery.
+ */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+	const u32 unused_ssthresh = TCP_INFINITE_SSTHRESH;
+
+	bbr_save_cwnd(sk);
+	return unused_ssthresh;	/* BBR does not use ssthresh */
+}
+
+/* The .get_info hook: when the caller asked for INET_DIAG_BBRINFO or
+ * INET_DIAG_VEGASINFO, fill info->bbr, set *attr to INET_DIAG_BBRINFO and
+ * return the number of bytes filled in; otherwise return 0 and touch nothing.
+ */
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+			   union tcp_cc_info *info)
+{
+	struct tcp_sock *tp;
+	struct bbr *bbr;
+	u64 rate;
+
+	if (!(ext & (1 << (INET_DIAG_BBRINFO - 1))) &&
+	    !(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	tp = tcp_sk(sk);
+	bbr = inet_csk_ca(sk);
+
+	/* Convert the bw estimate using the MSS and USEC_PER_SEC, dropping
+	 * the BW_SCALE fixed-point shift (presumably yielding bytes per
+	 * second -- confirm against bbr_bw()'s units).
+	 */
+	rate = bbr_bw(sk);
+	rate = rate * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+
+	memset(&info->bbr, 0, sizeof(info->bbr));
+	/* The 64-bit rate is exported as two 32-bit halves. */
+	info->bbr.bbr_bw_lo = (u32)rate;
+	info->bbr.bbr_bw_hi = (u32)(rate >> 32);
+	info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+	info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+	info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+	*attr = INET_DIAG_BBRINFO;
+
+	return sizeof(info->bbr);
+}
+
+/* The .set_state hook. Only the transition to TCP_CA_Loss is acted upon: the
+ * full-bw baseline is cleared and the long-term bw sampler is run with a
+ * synthetic rate sample that reports one loss.
+ */
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+	struct rate_sample loss_rs = { .losses = 1 };
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (new_state != TCP_CA_Loss)
+		return;
+
+	bbr->round_start = 1;	/* treat RTO like end of a round */
+	bbr->full_bw = 0;
+	bbr->prev_ca_state = TCP_CA_Loss;
+	bbr_lt_bw_sampling(sk, &loss_rs);
+}
+
+/* BBR's congestion control operations table. BBR supplies .cong_control
+ * (bbr_main) and does not set .cong_avoid.
+ */
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .tso_segs_goal = bbr_tso_segs_goal,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+/* Module init: verify at build time that struct bbr fits within
+ * ICSK_CA_PRIV_SIZE, then register BBR as a TCP congestion control.
+ * Returns the result of tcp_register_congestion_control().
+ */
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+/* Module exit: unregister the BBR congestion control operations. */
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+/* Module entry/exit hooks, followed by author, license and description. */
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 03725b294286..35b280361cb2 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
-struct minmax {
+struct cdg_minmax {
union {
struct {
s32 min;
@@ -74,10 +74,10 @@ enum cdg_state {
};
struct cdg {
- struct minmax rtt;
- struct minmax rtt_prev;
- struct minmax *gradients;
- struct minmax gsum;
+ struct cdg_minmax rtt;
+ struct cdg_minmax rtt_prev;
+ struct cdg_minmax *gradients;
+ struct cdg_minmax gsum;
bool gfilled;
u8 tail;
u8 state;
@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct minmax *gradients;
+ struct cdg_minmax *gradients;
switch (ev) {
case CA_EVENT_CWND_RESTART:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4e72bc..1294af4e0127 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
int ret = 0;
/* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid) {
+ if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 54d9f9b0120f..4e777a3243f9 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -150,6 +150,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
tp->segs_in = 0;
tcp_segs_in(tp, skb);
__skb_pull(skb, tcp_hdrlen(skb));
+ sk_forced_mem_schedule(sk, skb->truesize);
skb_set_owner_r(skb, sk);
TCP_SKB_CB(skb)->seq++;
@@ -226,6 +227,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_fastopen_add_skb(child, skb);
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+ tp->rcv_wup = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f3a9f3c2c8d8..8c6ad2d319d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
- sndmem = 2 * nr_segs * per_mss;
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
+/* Add the packets of an skb being marked lost to the running tp->lost total.
+ * The skb is counted in exactly two situations:
+ * a) it does not carry TCPCB_LOST yet (so it was not retransmitted as
+ *    lost either), i.e. this is its first loss;
+ * b) it carries both TCPCB_LOST and TCPCB_SACKED_RETRANS, i.e. we now
+ *    think the retransmission was lost as well.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	const __u8 flags = TCP_SKB_CB(skb)->sacked;
+	const bool first_loss = !(flags & TCPCB_LOST);
+
+	if (first_loss || (flags & TCPCB_SACKED_RETRANS))
+		tp->lost += tcp_skb_pcount(skb);
+}
+
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
+ tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct rate_sample *rate;
int flag;
};
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
- if (found_dup_sack)
+ if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
+ tp->delivered++; /* A spurious retransmission is delivered */
+ }
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+ mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ is_reneg);
+ if (mark_lost)
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2503,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2915,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
- const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
- struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = {
- .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
- .ts = now,
- };
- u32 elapsed;
-
- /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
- if (unlikely(rttm.rtt <= m[0].rtt))
- m[0] = m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[1].rtt)
- m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[2].rtt)
- m[2] = rttm;
-
- elapsed = now - m[0].ts;
- if (unlikely(elapsed > wlen)) {
- /* Passed entire window without a new min so make 2nd choice
- * the new min & 3rd choice the new 2nd. So forth and so on.
- */
- m[0] = m[1];
- m[1] = m[2];
- m[2] = rttm;
- if (now - m[0].ts > wlen) {
- m[0] = m[1];
- m[1] = rttm;
- if (now - m[0].ts > wlen)
- m[0] = rttm;
- }
- } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
- /* Passed a quarter of the window without a new min so
- * take 2nd choice from the 2nd quarter of the window.
- */
- m[2] = m[1] = rttm;
- } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
- /* Passed half the window without a new min so take the 3rd
- * choice from the last half of the window.
- */
- m[2] = rttm;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+ minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+ rtt_us ? : jiffies_to_usecs(1));
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3084,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack,
+ struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct skb_mstamp first_ackt, last_ackt, now;
+ struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3137,7 +3120,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
-
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -3205,16 +3188,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
- seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
- sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+ sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
-
+ sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
@@ -3242,7 +3224,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3333,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
- int flag)
+ int flag, const struct rate_sample *rs)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, rs);
+ return;
+ }
+
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3568,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- u32 prior_delivered = tp->delivered;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
+ sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3605,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ skb_mstamp_get(&now);
+
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3622,6 +3617,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3677,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ &sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3690,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+ tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -4108,7 +4107,7 @@ void tcp_fin(struct sock *sk)
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
@@ -4268,7 +4267,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_empty(&tp->out_of_order_queue)) {
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
return;
}
@@ -4344,10 +4343,13 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
+ bool fin, fragstolen, eaten;
struct sk_buff *skb, *tail;
- bool fragstolen, eaten;
+ struct rb_node *p;
- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ p = rb_first(&tp->out_of_order_queue);
+ while (p) {
+ skb = rb_entry(p, struct sk_buff, rbnode);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
@@ -4357,9 +4359,10 @@ static void tcp_ofo_queue(struct sock *sk)
dsack_high = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &tp->out_of_order_queue);
- __skb_unlink(skb, &tp->out_of_order_queue);
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
tcp_drop(sk, skb);
continue;
@@ -4371,12 +4374,19 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- tcp_fin(sk);
- if (eaten)
+ else
kfree_skb_partial(skb, fragstolen);
+
+ if (unlikely(fin)) {
+ tcp_fin(sk);
+ /* tcp_fin() purges tp->out_of_order_queue,
+ * so we must end this loop right now.
+ */
+ break;
+ }
}
}
@@ -4403,8 +4413,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node **p, *q, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
+ bool fragstolen;
tcp_ecn_check_ce(tp, skb);
@@ -4419,88 +4431,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ tp->rcv_nxt, seq, end_seq);
- skb1 = skb_peek_tail(&tp->out_of_order_queue);
- if (!skb1) {
+ p = &tp->out_of_order_queue.rb_node;
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
+ tp->selective_acks[0].start_seq = seq;
+ tp->selective_acks[0].end_seq = end_seq;
}
- __skb_queue_head(&tp->out_of_order_queue, skb);
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+ tp->ooo_last_skb = skb;
goto end;
}
- seq = TCP_SKB_CB(skb)->seq;
- end_seq = TCP_SKB_CB(skb)->end_seq;
-
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- bool fragstolen;
-
- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
- } else {
- tcp_grow_window(sk, skb);
- kfree_skb_partial(skb, fragstolen);
- skb = NULL;
- }
-
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
-
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
- goto end;
- }
-
- /* Find place to insert this segment. */
- while (1) {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
- skb1 = NULL;
- break;
- }
- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
- }
-
- /* Do skb overlap to previous one? */
- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
- skb = NULL;
- tcp_dsack_set(sk, seq, end_seq);
- goto add_sack;
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ goto add_sack;
+ }
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+ parent = &tp->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
+
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+ p = &parent->rb_left;
+ continue;
}
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(sk, seq,
- TCP_SKB_CB(skb1)->end_seq);
- } else {
- if (skb_queue_is_first(&tp->out_of_order_queue,
- skb1))
- skb1 = NULL;
- else
- skb1 = skb_queue_prev(
- &tp->out_of_order_queue,
- skb1);
+ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &tp->out_of_order_queue);
+ tcp_dsack_extend(sk,
+ TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ goto coalesce_done;
}
+ p = &parent->rb_right;
}
- if (!skb1)
- __skb_queue_head(&tp->out_of_order_queue, skb);
- else
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
- /* And clean segments covered by new one as whole. */
- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((q = rb_next(&skb->rbnode)) != NULL) {
+ skb1 = rb_entry(q, struct sk_buff, rbnode);
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
@@ -4509,12 +4525,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
end_seq);
break;
}
- __skb_unlink(skb1, &tp->out_of_order_queue);
+ rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
}
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!q)
+ tp->ooo_last_skb = skb;
add_sack:
if (tcp_is_sack(tp))
@@ -4651,13 +4670,13 @@ queue_and_out:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (skb_queue_empty(&tp->out_of_order_queue))
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pingpong = 0;
}
@@ -4711,48 +4730,76 @@ drop:
tcp_data_queue_ofo(sk, skb);
}
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ if (list)
+ return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
- struct sk_buff_head *list)
+ struct sk_buff_head *list,
+ struct rb_root *root)
{
- struct sk_buff *next = NULL;
+ struct sk_buff *next = tcp_skb_next(skb, list);
- if (!skb_queue_is_last(list, skb))
- next = skb_queue_next(list, skb);
+ if (list)
+ __skb_unlink(skb, list);
+ else
+ rb_erase(&skb->rbnode, root);
- __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
*
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
- struct sk_buff *head, struct sk_buff *tail,
- u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
- struct sk_buff *skb, *n;
+ struct sk_buff *skb = head, *n;
+ struct sk_buff_head tmp;
bool end_of_skbs;
/* First, check that queue is collapsible and find
- * the point where collapsing can be useful. */
- skb = head;
+ * the point where collapsing can be useful.
+ */
restart:
- end_of_skbs = true;
- skb_queue_walk_from_safe(list, skb, n) {
- if (skb == tail)
- break;
+ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+ n = tcp_skb_next(skb, list);
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb)
break;
goto restart;
@@ -4770,13 +4817,10 @@ restart:
break;
}
- if (!skb_queue_is_last(list, skb)) {
- struct sk_buff *next = skb_queue_next(list, skb);
- if (next != tail &&
- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
- end_of_skbs = false;
- break;
- }
+ if (n && n != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+ end_of_skbs = false;
+ break;
}
/* Decided to skip this, advance start seq. */
@@ -4786,17 +4830,22 @@ restart:
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
+ __skb_queue_head_init(&tmp);
+
while (before(start, end)) {
int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
- return;
+ break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_queue_before(list, skb, nskb);
+ if (list)
+ __skb_queue_before(list, skb, nskb);
+ else
+ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -4814,14 +4863,17 @@ restart:
start += size;
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
- return;
+ goto end;
}
}
}
+end:
+ skb_queue_walk_safe(&tmp, skb, n)
+ tcp_rbtree_insert(root, skb);
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4830,43 +4882,43 @@ restart:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
- struct sk_buff *head;
+ struct sk_buff *skb, *head;
+ struct rb_node *p;
u32 start, end;
- if (!skb)
+ p = rb_first(&tp->out_of_order_queue);
+ skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+ if (!skb) {
+ p = rb_last(&tp->out_of_order_queue);
+ /* Note: It is possible that p is NULL here. We do not
+ * use rb_entry_safe(), as ooo_last_skb is valid only
+ * if rbtree is not empty.
+ */
+ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
return;
-
+ }
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
- head = skb;
-
- for (;;) {
- struct sk_buff *next = NULL;
- if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
- next = skb_queue_next(&tp->out_of_order_queue, skb);
- skb = next;
+ for (head = skb;;) {
+ skb = tcp_skb_next(skb, NULL);
- /* Segment is terminated when we see gap or when
- * we are at the end of all the queue. */
+ /* Range is terminated when we see a gap or when
+ * we are at the queue end.
+ */
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, &tp->out_of_order_queue,
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
- head = skb;
- if (!skb)
- break;
- /* Start new segment */
+ goto new_range;
+ }
+
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
- } else {
- if (before(TCP_SKB_CB(skb)->seq, start))
- start = TCP_SKB_CB(skb)->seq;
- if (after(TCP_SKB_CB(skb)->end_seq, end))
- end = TCP_SKB_CB(skb)->end_seq;
- }
}
}
@@ -4883,20 +4935,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ struct rb_node *node, *prev;
- if (skb_queue_empty(&tp->out_of_order_queue))
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
- while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
- tcp_drop(sk, skb);
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
+ tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
- }
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
@@ -4930,7 +4986,7 @@ static int tcp_prune_queue(struct sock *sk)
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
- tcp_collapse(sk, &sk->sk_receive_queue,
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
@@ -5035,7 +5091,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
- (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
@@ -5894,7 +5950,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -5936,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
- tcp_update_pacing_rate(sk);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13b05adf9d3e..7ac37c314312 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1844,7 +1844,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_write_queue_purge(sk);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4b95ec4ed2c8..6234ebaa7db1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- newtp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -487,8 +487,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
tcp_init_xmit_timers(newsk);
- __skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c5964962d0c..bc68da38ea86 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
- /* GSO partial only requires splitting the frame into an MSS
- * multiple and possibly a remainder. So update the mss now.
- */
- if (features & NETIF_F_GSO_PARTIAL)
- mss = skb->len - (skb->len % mss);
-
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
+ /* GSO partial and frag_list segmentation only require splitting
+ * the frame into an MSS multiple and possibly a remainder; both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
delta = htonl(oldlen + (thlen + mss));
skb = segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8b45794eb6b2..7c777089a4d6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
- TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
+ }
}
/*
* One tasklet per cpu tries to send more skbs.
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
+ tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
@@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
@@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
u32 bytes, segs;
@@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
- segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+ segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+ return tso_segs ? :
+ tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -2020,6 +2046,39 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned int limit;
+
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit <<= factor;
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2057,7 +2116,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
- max_segs = tcp_tso_autosize(sk, mss_now);
+ max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -2106,29 +2165,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- /* TCP Small Queues :
- * Control number of packets in qdisc/devices to two packets / or ~1 ms.
- * This allows for :
- * - better RTT estimation and ACK scheduling
- * - faster recovery
- * - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
- */
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- /* It is possible TX completion already happened
- * before we set TSQ_THROTTLED, so we must
- * test again the condition.
- */
- smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
- break;
- }
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2605,7 +2643,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
- min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
+ sk->sk_sndbuf))
return -EAGAIN;
if (skb_still_in_host_queue(sk, skb))
@@ -2774,7 +2813,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked;
int segs;
@@ -2828,10 +2867,13 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
+ if (tcp_small_queue_check(sk, skb, 1))
+ return;
+
if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -3568,6 +3610,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk)))
+ tcp_sk(sk)->total_retrans++;
}
return res;
}
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..9be1581a5a08
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ tp->delivered_mstamp = skb->skb_mstamp;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (!scb->tx.delivered_mstamp.v64)
+ return;
+
+ if (!rs->prior_delivered ||
+ after(scb->tx.delivered, rs->prior_delivered)) {
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = skb_mstamp_us_delta(
+ &skb->skb_mstamp,
+ &scb->tx.first_tx_mstamp);
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ struct skb_mstamp *now, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = *now;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available. */
+ if (!rs->prior_mstamp.v64) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmitted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections that suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d84930b2dd95..f712b411f6ed 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
+ icsk->icsk_retransmits++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 028eb046ea40..9c5fc973267f 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -76,7 +76,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 058c31286ce1..7d96dc2d3d08 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flow_flags,
faddr, saddr, dport, inet->inet_sport);
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, fl4);
- if (err < 0)
- goto out;
- }
-
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a9f6e535caa..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct nlattr *bc, bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
return inet_sk_diag_fill(sk, NULL, skb, req,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
err = inet_sk_diag_fill(sk, NULL, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
int num, s_num, slot, s_slot;
@@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
@@ -186,8 +188,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
sk = __udp4_lib_lookup(net,
- req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[3], req->id.idiag_dport,
+ req->id.idiag_src[3], req->id.idiag_sport,
req->id.idiag_if, tbl, NULL);
else
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b6ff36..f9333c963607 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo;
+ bool remcsum, need_csum, offload_csum, ufo, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b644a23c3db0..6a7ff6957535 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
- fl4->flowi4_oif = oif;
+ fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
if (saddr)
fl4->saddr = saddr->a4;
@@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
int oif = 0;
if (skb_dst(skb))
- oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;