aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c22
-rw-r--r--net/ipv4/devinet.c8
-rw-r--r--net/ipv4/fib_frontend.c23
-rw-r--r--net/ipv4/fib_notifier.c99
-rw-r--r--net/ipv4/fib_rules.c44
-rw-r--r--net/ipv4/fib_semantics.c11
-rw-r--r--net/ipv4/fib_trie.c5
-rw-r--r--net/ipv4/gre_offload.c14
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/igmp.c6
-rw-r--r--net/ipv4/inet_hashtables.c27
-rw-r--r--net/ipv4/inetpeer.c428
-rw-r--r--net/ipv4/ip_options.c9
-rw-r--r--net/ipv4/ip_output.c80
-rw-r--r--net/ipv4/ip_sockglue.c19
-rw-r--r--net/ipv4/ip_vti.c31
-rw-r--r--net/ipv4/ipmr.c8
-rw-r--r--net/ipv4/proc.c9
-rw-r--r--net/ipv4/raw.c18
-rw-r--r--net/ipv4/raw_diag.c4
-rw-r--r--net/ipv4/route.c11
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c3
-rw-r--r--net/ipv4/tcp.c223
-rw-r--r--net/ipv4/tcp_bic.c14
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_cubic.c13
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_illinois.c11
-rw-r--r--net/ipv4/tcp_input.c309
-rw-r--r--net/ipv4/tcp_ipv4.c80
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/tcp_nv.c13
-rw-r--r--net/ipv4/tcp_output.c19
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_recovery.c2
-rw-r--r--net/ipv4/tcp_scalable.c16
-rw-r--r--net/ipv4/tcp_timer.c12
-rw-r--r--net/ipv4/tcp_veno.c11
-rw-r--r--net/ipv4/tcp_westwood.c31
-rw-r--r--net/ipv4/tcp_yeah.c11
-rw-r--r--net/ipv4/udp.c79
-rw-r--r--net/ipv4/udp_diag.c10
-rw-r--r--net/ipv4/udp_offload.c64
-rw-r--r--net/ipv4/udp_tunnel.c25
-rw-r--r--net/ipv4/xfrm4_policy.c11
48 files changed, 491 insertions, 1384 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2e548eca3489..d678820e4306 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -944,6 +944,8 @@ const struct proto_ops inet_stream_ops = {
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
+ .sendmsg_locked = tcp_sendmsg_locked,
+ .sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
@@ -1219,10 +1221,9 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, gso_partial, encap;
+ bool fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
- unsigned int offset = 0;
struct iphdr *iph;
int proto, tot_len;
int nhoff;
@@ -1257,7 +1258,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
if (!skb->encapsulation || encap) {
- udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
/* fixed ID is invalid if DF bit is not set */
@@ -1277,13 +1277,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
- if (udpfrag) {
- iph->frag_off = htons(offset >> 3);
- if (skb->next)
- iph->frag_off |= htons(IP_MF);
- offset += skb->len - nhoff - ihl;
- tot_len = skb->len - nhoff;
- } else if (skb_is_gso(skb)) {
+ if (skb_is_gso(skb)) {
if (!fixedid) {
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
@@ -1778,6 +1772,11 @@ static const struct net_offload ipip_offload = {
},
};
+static int __init ipip_offload_init(void)
+{
+ return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+}
+
static int __init ipv4_offload_init(void)
{
/*
@@ -1787,9 +1786,10 @@ static int __init ipv4_offload_init(void)
pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
if (tcpv4_offload_init() < 0)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+ if (ipip_offload_init() < 0)
+ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
dev_add_offload(&ip_packet_offload);
- inet_add_offload(&ipip_offload, IPPROTO_IPIP);
return 0;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 38d9af9b917c..d7adc0616599 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2491,9 +2491,9 @@ void __init devinet_init(void)
rtnl_af_register(&inet_af_ops);
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
- inet_netconf_dump_devconf, NULL);
+ inet_netconf_dump_devconf, 0);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 044d2a159a3c..37819ab4cc74 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1247,22 +1247,28 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- net->ipv4.fib_seq = 0;
+ err = fib4_notifier_init(net);
+ if (err)
+ return err;
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
- if (!net->ipv4.fib_table_hash)
- return -ENOMEM;
+ if (!net->ipv4.fib_table_hash) {
+ err = -ENOMEM;
+ goto err_table_hash_alloc;
+ }
err = fib4_rules_init(net);
if (err < 0)
- goto fail;
+ goto err_rules_init;
return 0;
-fail:
+err_rules_init:
kfree(net->ipv4.fib_table_hash);
+err_table_hash_alloc:
+ fib4_notifier_exit(net);
return err;
}
@@ -1292,6 +1298,7 @@ static void ip_fib_net_exit(struct net *net)
#endif
rtnl_unlock();
kfree(net->ipv4.fib_table_hash);
+ fib4_notifier_exit(net);
}
static int __net_init fib_net_init(struct net *net)
@@ -1341,7 +1348,7 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
}
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index e0714d975947..5d7afb145562 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -1,86 +1,71 @@
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/socket.h>
#include <linux/kernel.h>
#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
-
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
- enum fib_event_type event_type,
- struct fib_notifier_info *info)
+int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
{
- info->net = net;
- return nb->notifier_call(nb, event_type, info);
+ info->family = AF_INET;
+ return call_fib_notifier(nb, net, event_type, info);
}
-int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
- struct fib_notifier_info *info)
+int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
{
+ ASSERT_RTNL();
+
+ info->family = AF_INET;
net->ipv4.fib_seq++;
- info->net = net;
- return atomic_notifier_call_chain(&fib_chain, event_type, info);
+ return call_fib_notifiers(net, event_type, info);
}
-static unsigned int fib_seq_sum(void)
+static unsigned int fib4_seq_read(struct net *net)
{
- unsigned int fib_seq = 0;
- struct net *net;
-
- rtnl_lock();
- for_each_net(net)
- fib_seq += net->ipv4.fib_seq;
- rtnl_unlock();
+ ASSERT_RTNL();
- return fib_seq;
+ return net->ipv4.fib_seq + fib4_rules_seq_read(net);
}
-static bool fib_dump_is_consistent(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb),
- unsigned int fib_seq)
+static int fib4_dump(struct net *net, struct notifier_block *nb)
{
- atomic_notifier_chain_register(&fib_chain, nb);
- if (fib_seq == fib_seq_sum())
- return true;
- atomic_notifier_chain_unregister(&fib_chain, nb);
- if (cb)
- cb(nb);
- return false;
+ int err;
+
+ err = fib4_rules_dump(net, nb);
+ if (err)
+ return err;
+
+ fib_notify(net, nb);
+
+ return 0;
}
-#define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb))
-{
- int retries = 0;
+static const struct fib_notifier_ops fib4_notifier_ops_template = {
+ .family = AF_INET,
+ .fib_seq_read = fib4_seq_read,
+ .fib_dump = fib4_dump,
+};
- do {
- unsigned int fib_seq = fib_seq_sum();
- struct net *net;
+int __net_init fib4_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
- /* Mutex semantics guarantee that every change done to
- * FIB tries before we read the change sequence counter
- * is now visible to us.
- */
- rcu_read_lock();
- for_each_net_rcu(net) {
- fib_rules_notify(net, nb);
- fib_notify(net, nb);
- }
- rcu_read_unlock();
+ net->ipv4.fib_seq = 0;
- if (fib_dump_is_consistent(nb, cb, fib_seq))
- return 0;
- } while (++retries < FIB_DUMP_MAX_RETRIES);
+ ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.notifier_ops = ops;
- return -EBUSY;
+ return 0;
}
-EXPORT_SYMBOL(register_fib_notifier);
-int unregister_fib_notifier(struct notifier_block *nb)
+void __net_exit fib4_notifier_exit(struct net *net)
{
- return atomic_notifier_chain_unregister(&fib_chain, nb);
+ fib_notifier_ops_unregister(net->ipv4.notifier_ops);
}
-EXPORT_SYMBOL(unregister_fib_notifier);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 778ecf977eb2..35d646a62ad4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -68,6 +68,16 @@ bool fib4_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib4_rule_default);
+int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return fib_rules_dump(net, nb, AF_INET);
+}
+
+unsigned int fib4_rules_seq_read(struct net *net)
+{
+ return fib_rules_seq_read(net, AF_INET);
+}
+
int __fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
@@ -185,38 +195,6 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
-static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
- enum fib_event_type event_type,
- struct fib_rule *rule)
-{
- struct fib_rule_notifier_info info = {
- .rule = rule,
- };
-
- return call_fib_notifier(nb, net, event_type, &info.info);
-}
-
-static int call_fib_rule_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib_rule *rule)
-{
- struct fib_rule_notifier_info info = {
- .rule = rule,
- };
-
- return call_fib_notifiers(net, event_type, &info.info);
-}
-
-/* Called with rcu_read_lock() */
-void fib_rules_notify(struct net *net, struct notifier_block *nb)
-{
- struct fib_rules_ops *ops = net->ipv4.rules_ops;
- struct fib_rule *rule;
-
- list_for_each_entry_rcu(rule, &ops->rules_list, list)
- call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule);
-}
-
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
FRA_GENERIC_POLICY,
[FRA_FLOW] = { .type = NLA_U32 },
@@ -273,7 +251,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule);
err = 0;
errout:
@@ -295,7 +272,6 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule);
errout:
return err;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ec3a9ce281a6..d521caf57385 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -44,6 +44,7 @@
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
#include "fib_lookup.h"
@@ -1344,6 +1345,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
rtm->rtm_flags |= RTNH_F_DEAD;
}
+ if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD)
+ rtm->rtm_flags |= RTNH_F_OFFLOAD;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
@@ -1451,14 +1454,14 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
fib_nh->nh_flags & RTNH_F_LINKDOWN)
break;
- return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
- &info.info);
+ return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ &info.info);
case FIB_EVENT_NH_DEL:
if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
(fib_nh->nh_flags & RTNH_F_DEAD))
- return call_fib_notifiers(dev_net(fib_nh->nh_dev),
- event_type, &info.info);
+ return call_fib4_notifiers(dev_net(fib_nh->nh_dev),
+ event_type, &info.info);
default:
break;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 64668c69dda6..1a6ffb0dab9c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -81,6 +81,7 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/fib_notifier.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
@@ -97,7 +98,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
.type = type,
.tb_id = tb_id,
};
- return call_fib_notifier(nb, net, event_type, &info.info);
+ return call_fib4_notifier(nb, net, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -113,7 +114,7 @@ static int call_fib_entry_notifiers(struct net *net,
.type = type,
.tb_id = tb_id,
};
- return call_fib_notifiers(net, event_type, &info.info);
+ return call_fib4_notifiers(net, event_type, &info.info);
}
#define MAX_STAT_DEPTH 32
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index d5cac99170b1..416bb304a281 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo, gso_partial;
+ bool need_csum, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -47,20 +47,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
skb->encap_hdr_csum = need_csum;
- ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
-
features &= skb->dev->hw_enc_features;
- /* The only checksum offload we care about from here on out is the
- * outer one so strip the existing checksum feature flags based
- * on the fact that we will be computing our checksum in software.
- */
- if (ufo) {
- features &= ~NETIF_F_CSUM_MASK;
- if (!need_csum)
- features |= NETIF_F_HW_CSUM;
- }
-
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c2be26b98b5f..681e33998e03 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
+ if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
return;
/* Needed by both icmp_global_allow and icmp_xmit_lock */
@@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
+ if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 498706b072fb..9f86b5133605 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2549,7 +2549,8 @@ done:
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+ int dif, int sdif)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
@@ -2564,7 +2565,8 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
rcu_read_lock();
for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
- pmc->multi.imr_ifindex == dif)
+ (pmc->multi.imr_ifindex == dif ||
+ (sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
ret = inet->mc_all;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2e3389d614d1..597bb4cfe805 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
- const int dif, bool exact_dif)
+ const int dif, const int sdif, bool exact_dif)
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
@@ -185,9 +185,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
score += 4;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -208,7 +212,7 @@ struct sock *__inet_lookup_listener(struct net *net,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
- const int dif)
+ const int dif, const int sdif)
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -218,7 +222,8 @@ struct sock *__inet_lookup_listener(struct net *net,
u32 phash = 0;
sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
+ score = compute_score(sk, net, hnum, daddr,
+ dif, sdif, exact_dif);
if (score > hiscore) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -268,7 +273,7 @@ struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
- const int dif)
+ const int dif, const int sdif)
{
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
@@ -286,11 +291,12 @@ begin:
if (sk->sk_hash != hash)
continue;
if (likely(INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports, dif, sdif))) {
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
if (unlikely(!INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports,
+ dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -321,9 +327,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
+ struct net *net = sock_net(sk);
+ int sdif = l3mdev_master_ifindex_by_index(net, dif);
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
- struct net *net = sock_net(sk);
unsigned int hash = inet_ehashfn(net, daddr, lport,
saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
@@ -339,7 +346,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
continue;
if (likely(INET_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index c5a117cc6619..337ad41bb80a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -33,7 +33,7 @@
* also be removed if the pool is overloaded i.e. if the total amount of
* entries is greater-or-equal than the threshold.
*
- * Node pool is organised as an AVL tree.
+ * Node pool is organised as an RB tree.
* Such an implementation has been chosen not just for fun. It's a way to
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
* amount of long living nodes in a single hash slot would significantly delay
@@ -45,7 +45,7 @@
* AND reference count being 0.
* 3. Global variable peer_total is modified under the pool lock.
* 4. struct inet_peer fields modification:
- * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * rb_node: pool lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
@@ -53,30 +53,15 @@
static struct kmem_cache *peer_cachep __read_mostly;
-static LIST_HEAD(gc_list);
-static const int gc_delay = 60 * HZ;
-static struct delayed_work gc_work;
-static DEFINE_SPINLOCK(gc_lock);
-
-#define node_height(x) x->avl_height
-
-#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
-#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
-static const struct inet_peer peer_fake_node = {
- .avl_left = peer_avl_empty_rcu,
- .avl_right = peer_avl_empty_rcu,
- .avl_height = 0
-};
-
void inet_peer_base_init(struct inet_peer_base *bp)
{
- bp->root = peer_avl_empty_rcu;
+ bp->rb_root = RB_ROOT;
seqlock_init(&bp->lock);
bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);
-#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+#define PEER_MAX_GC 32
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
@@ -84,53 +69,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
-static void inetpeer_gc_worker(struct work_struct *work)
-{
- struct inet_peer *p, *n, *c;
- struct list_head list;
-
- spin_lock_bh(&gc_lock);
- list_replace_init(&gc_list, &list);
- spin_unlock_bh(&gc_lock);
-
- if (list_empty(&list))
- return;
-
- list_for_each_entry_safe(p, n, &list, gc_list) {
-
- if (need_resched())
- cond_resched();
-
- c = rcu_dereference_protected(p->avl_left, 1);
- if (c != peer_avl_empty) {
- list_add_tail(&c->gc_list, &list);
- p->avl_left = peer_avl_empty_rcu;
- }
-
- c = rcu_dereference_protected(p->avl_right, 1);
- if (c != peer_avl_empty) {
- list_add_tail(&c->gc_list, &list);
- p->avl_right = peer_avl_empty_rcu;
- }
-
- n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
-
- if (refcount_read(&p->refcnt) == 1) {
- list_del(&p->gc_list);
- kmem_cache_free(peer_cachep, p);
- }
- }
-
- if (list_empty(&list))
- return;
-
- spin_lock_bh(&gc_lock);
- list_splice(&list, &gc_list);
- spin_unlock_bh(&gc_lock);
-
- schedule_delayed_work(&gc_work, gc_delay);
-}
-
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
@@ -153,225 +91,62 @@ void __init inet_initpeers(void)
sizeof(struct inet_peer),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
-
- INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
-#define rcu_deref_locked(X, BASE) \
- rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
-
-/*
- * Called with local BH disabled and the pool lock held.
- */
-#define lookup(_daddr, _stack, _base) \
-({ \
- struct inet_peer *u; \
- struct inet_peer __rcu **v; \
- \
- stackptr = _stack; \
- *stackptr++ = &_base->root; \
- for (u = rcu_deref_locked(_base->root, _base); \
- u != peer_avl_empty;) { \
- int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \
- if (cmp == 0) \
- break; \
- if (cmp == -1) \
- v = &u->avl_left; \
- else \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = rcu_deref_locked(*v, _base); \
- } \
- u; \
-})
-
-/*
- * Called with rcu_read_lock()
- * Because we hold no lock against a writer, its quite possible we fall
- * in an endless loop.
- * But every pointer we follow is guaranteed to be valid thanks to RCU.
- * We exit from this function if number of links exceeds PEER_MAXDEPTH
- */
-static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
- struct inet_peer_base *base)
+/* Called with rcu_read_lock() or base->lock held */
+static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base,
+ unsigned int seq,
+ struct inet_peer *gc_stack[],
+ unsigned int *gc_cnt,
+ struct rb_node **parent_p,
+ struct rb_node ***pp_p)
{
- struct inet_peer *u = rcu_dereference(base->root);
- int count = 0;
+ struct rb_node **pp, *parent;
+ struct inet_peer *p;
+
+ pp = &base->rb_root.rb_node;
+ parent = NULL;
+ while (*pp) {
+ int cmp;
- while (u != peer_avl_empty) {
- int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
+ parent = rcu_dereference_raw(*pp);
+ p = rb_entry(parent, struct inet_peer, rb_node);
+ cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
- /* Before taking a reference, check if this entry was
- * deleted (refcnt=0)
- */
- if (!refcount_inc_not_zero(&u->refcnt)) {
- u = NULL;
- }
- return u;
+ if (!refcount_inc_not_zero(&p->refcnt))
+ break;
+ return p;
+ }
+ if (gc_stack) {
+ if (*gc_cnt < PEER_MAX_GC)
+ gc_stack[(*gc_cnt)++] = p;
+ } else if (unlikely(read_seqretry(&base->lock, seq))) {
+ break;
}
if (cmp == -1)
- u = rcu_dereference(u->avl_left);
+ pp = &(*pp)->rb_left;
else
- u = rcu_dereference(u->avl_right);
- if (unlikely(++count == PEER_MAXDEPTH))
- break;
+ pp = &(*pp)->rb_right;
}
+ *parent_p = parent;
+ *pp_p = pp;
return NULL;
}
-/* Called with local BH disabled and the pool lock held. */
-#define lookup_rightempty(start, base) \
-({ \
- struct inet_peer *u; \
- struct inet_peer __rcu **v; \
- *stackptr++ = &start->avl_left; \
- v = &start->avl_left; \
- for (u = rcu_deref_locked(*v, base); \
- u->avl_right != peer_avl_empty_rcu;) { \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = rcu_deref_locked(*v, base); \
- } \
- u; \
-})
-
-/* Called with local BH disabled and the pool lock held.
- * Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas.
- */
-static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
- struct inet_peer __rcu ***stackend,
- struct inet_peer_base *base)
-{
- struct inet_peer __rcu **nodep;
- struct inet_peer *node, *l, *r;
- int lh, rh;
-
- while (stackend > stack) {
- nodep = *--stackend;
- node = rcu_deref_locked(*nodep, base);
- l = rcu_deref_locked(node->avl_left, base);
- r = rcu_deref_locked(node->avl_right, base);
- lh = node_height(l);
- rh = node_height(r);
- if (lh > rh + 1) { /* l: RH+2 */
- struct inet_peer *ll, *lr, *lrl, *lrr;
- int lrh;
- ll = rcu_deref_locked(l->avl_left, base);
- lr = rcu_deref_locked(l->avl_right, base);
- lrh = node_height(lr);
- if (lrh <= node_height(ll)) { /* ll: RH+1 */
- RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
- RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
- node->avl_height = lrh + 1; /* RH+1 or RH+2 */
- RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
- RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
- l->avl_height = node->avl_height + 1;
- RCU_INIT_POINTER(*nodep, l);
- } else { /* ll: RH, lr: RH+1 */
- lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
- lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
- RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
- RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
- node->avl_height = rh + 1; /* node: RH+1 */
- RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
- RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
- l->avl_height = rh + 1; /* l: RH+1 */
- RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
- RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
- lr->avl_height = rh + 2;
- RCU_INIT_POINTER(*nodep, lr);
- }
- } else if (rh > lh + 1) { /* r: LH+2 */
- struct inet_peer *rr, *rl, *rlr, *rll;
- int rlh;
- rr = rcu_deref_locked(r->avl_right, base);
- rl = rcu_deref_locked(r->avl_left, base);
- rlh = node_height(rl);
- if (rlh <= node_height(rr)) { /* rr: LH+1 */
- RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
- RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
- node->avl_height = rlh + 1; /* LH+1 or LH+2 */
- RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
- RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
- r->avl_height = node->avl_height + 1;
- RCU_INIT_POINTER(*nodep, r);
- } else { /* rr: RH, rl: RH+1 */
- rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
- rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
- RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
- RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
- node->avl_height = lh + 1; /* node: LH+1 */
- RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
- RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
- r->avl_height = lh + 1; /* r: LH+1 */
- RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
- RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
- rl->avl_height = lh + 2;
- RCU_INIT_POINTER(*nodep, rl);
- }
- } else {
- node->avl_height = (lh > rh ? lh : rh) + 1;
- }
- }
-}
-
-/* Called with local BH disabled and the pool lock held. */
-#define link_to_pool(n, base) \
-do { \
- n->avl_height = 1; \
- n->avl_left = peer_avl_empty_rcu; \
- n->avl_right = peer_avl_empty_rcu; \
- /* lockless readers can catch us now */ \
- rcu_assign_pointer(**--stackptr, n); \
- peer_avl_rebalance(stack, stackptr, base); \
-} while (0)
-
static void inetpeer_free_rcu(struct rcu_head *head)
{
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}
-static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
- struct inet_peer __rcu **stack[PEER_MAXDEPTH])
-{
- struct inet_peer __rcu ***stackptr, ***delp;
-
- if (lookup(&p->daddr, stack, base) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty_rcu) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p, base);
- BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->daddr > x->daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- RCU_INIT_POINTER(*delp[0], t);
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr, base);
- base->total--;
- call_rcu(&p->rcu, inetpeer_free_rcu);
-}
-
/* perform garbage collect on all items stacked during a lookup */
-static int inet_peer_gc(struct inet_peer_base *base,
- struct inet_peer __rcu **stack[PEER_MAXDEPTH],
- struct inet_peer __rcu ***stackptr)
+static void inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer *gc_stack[],
+ unsigned int gc_cnt)
{
- struct inet_peer *p, *gchead = NULL;
+ struct inet_peer *p;
__u32 delta, ttl;
- int cnt = 0;
+ int i;
if (base->total >= inet_peer_threshold)
ttl = 0; /* be aggressive */
@@ -379,43 +154,38 @@ static int inet_peer_gc(struct inet_peer_base *base,
ttl = inet_peer_maxttl
- (inet_peer_maxttl - inet_peer_minttl) / HZ *
base->total / inet_peer_threshold * HZ;
- stackptr--; /* last stack slot is peer_avl_empty */
- while (stackptr > stack) {
- stackptr--;
- p = rcu_deref_locked(**stackptr, base);
- if (refcount_read(&p->refcnt) == 1) {
- smp_rmb();
- delta = (__u32)jiffies - p->dtime;
- if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) {
- p->gc_next = gchead;
- gchead = p;
- }
- }
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
+ delta = (__u32)jiffies - p->dtime;
+ if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
+ gc_stack[i] = NULL;
}
- while ((p = gchead) != NULL) {
- gchead = p->gc_next;
- cnt++;
- unlink_from_pool(p, base, stack);
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
+ if (p) {
+ rb_erase(&p->rb_node, &base->rb_root);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
+ }
}
- return cnt;
}
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
const struct inetpeer_addr *daddr,
int create)
{
- struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer *p;
- unsigned int sequence;
- int invalidated, gccnt = 0;
+ struct inet_peer *p, *gc_stack[PEER_MAX_GC];
+ struct rb_node **pp, *parent;
+ unsigned int gc_cnt, seq;
+ int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
rcu_read_lock();
- sequence = read_seqbegin(&base->lock);
- p = lookup_rcu(daddr, base);
- invalidated = read_seqretry(&base->lock, sequence);
+ seq = read_seqbegin(&base->lock);
+ p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
+ invalidated = read_seqretry(&base->lock, seq);
rcu_read_unlock();
if (p)
@@ -428,36 +198,31 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
+ parent = NULL;
write_seqlock_bh(&base->lock);
-relookup:
- p = lookup(daddr, stack, base);
- if (p != peer_avl_empty) {
- refcount_inc(&p->refcnt);
- write_sequnlock_bh(&base->lock);
- return p;
- }
- if (!gccnt) {
- gccnt = inet_peer_gc(base, stack, stackptr);
- if (gccnt && create)
- goto relookup;
- }
- p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
- if (p) {
- p->daddr = *daddr;
- refcount_set(&p->refcnt, 2);
- atomic_set(&p->rid, 0);
- p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
- p->rate_tokens = 0;
- /* 60*HZ is arbitrary, but chosen enough high so that the first
- * calculation of tokens is at its maximum.
- */
- p->rate_last = jiffies - 60*HZ;
- INIT_LIST_HEAD(&p->gc_list);
- /* Link the node. */
- link_to_pool(p, base);
- base->total++;
+ gc_cnt = 0;
+ p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
+ if (!p && create) {
+ p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (p) {
+ p->daddr = *daddr;
+ refcount_set(&p->refcnt, 2);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+
+ rb_link_node(&p->rb_node, parent, pp);
+ rb_insert_color(&p->rb_node, &base->rb_root);
+ base->total++;
+ }
}
+ if (gc_cnt)
+ inet_peer_gc(base, gc_stack, gc_cnt);
write_sequnlock_bh(&base->lock);
return p;
@@ -467,8 +232,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
p->dtime = (__u32)jiffies;
- smp_mb__before_atomic();
- refcount_dec(&p->refcnt);
+
+ if (refcount_dec_and_test(&p->refcnt))
+ call_rcu(&p->rcu, inetpeer_free_rcu);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
@@ -513,30 +279,16 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
-static void inetpeer_inval_rcu(struct rcu_head *head)
-{
- struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
-
- spin_lock_bh(&gc_lock);
- list_add_tail(&p->gc_list, &gc_list);
- spin_unlock_bh(&gc_lock);
-
- schedule_delayed_work(&gc_work, gc_delay);
-}
-
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- struct inet_peer *root;
-
- write_seqlock_bh(&base->lock);
+ struct inet_peer *p, *n;
- root = rcu_deref_locked(base->root, base);
- if (root != peer_avl_empty) {
- base->root = peer_avl_empty_rcu;
- base->total = 0;
- call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
+ rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) {
+ inet_putpeer(p);
+ cond_resched();
}
- write_sequnlock_bh(&base->lock);
+ base->rb_root = RB_ROOT;
+ base->total = 0;
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 93157f2f4758..525ae88d1e58 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -86,8 +86,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
* NOTE: dopt cannot point to skb.
*/
-int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
- const struct ip_options *sopt)
+int __ip_options_echo(struct net *net, struct ip_options *dopt,
+ struct sk_buff *skb, const struct ip_options *sopt)
{
unsigned char *sptr, *dptr;
int soffset, doffset;
@@ -140,7 +140,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
__be32 addr;
memcpy(&addr, dptr+soffset-1, 4);
- if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
+ if (inet_addr_type(net, addr) != RTN_UNICAST) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -174,9 +174,6 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
doffset -= 4;
}
if (doffset > 3) {
- __be32 daddr = fib_compute_spec_dst(skb);
-
- memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
dptr[1] = doffset+3;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e153c40c2436..73b0b15245b6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -853,61 +853,6 @@ csum_page(struct page *page, int offset, int copy)
return csum;
}
-static inline int ip_ufo_append_data(struct sock *sk,
- struct sk_buff_head *queue,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int maxfraglen, unsigned int flags)
-{
- struct sk_buff *skb;
- int err;
-
- /* There is support for UDP fragmentation offload by network
- * device, so create one single skb packet containing complete
- * udp datagram
- */
- skb = skb_peek_tail(queue);
- if (!skb) {
- skb = sock_alloc_send_skb(sk,
- hh_len + fragheaderlen + transhdrlen + 20,
- (flags & MSG_DONTWAIT), &err);
-
- if (!skb)
- return err;
-
- /* reserve space for Hardware header */
- skb_reserve(skb, hh_len);
-
- /* create space for UDP/IP header */
- skb_put(skb, fragheaderlen + transhdrlen);
-
- /* initialize network header pointer */
- skb_reset_network_header(skb);
-
- /* initialize protocol header pointer */
- skb->transport_header = skb->network_header + fragheaderlen;
-
- skb->csum = 0;
-
- if (flags & MSG_CONFIRM)
- skb_set_dst_pending_confirm(skb, 1);
-
- __skb_queue_tail(queue, skb);
- } else if (skb_is_gso(skb)) {
- goto append;
- }
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- /* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-
-append:
- return skb_append_datato_frags(sk, skb, getfrag, from,
- (length - transhdrlen));
-}
-
static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
struct sk_buff_head *queue,
@@ -965,19 +910,6 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if ((skb && skb_is_gso(skb)) ||
- (((length + (skb ? skb->len : fragheaderlen)) > mtu) &&
- (skb_queue_len(queue) <= 1) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
- (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) {
- err = ip_ufo_append_data(sk, queue, getfrag, from, length,
- hh_len, fragheaderlen, transhdrlen,
- maxfraglen, flags);
- if (err)
- goto error;
- return 0;
- }
/* So, what's going on in the loop below?
*
@@ -1288,16 +1220,6 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (!skb)
return -EINVAL;
- if ((size + skb->len > mtu) &&
- (skb_queue_len(&sk->sk_write_queue) == 1) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO)) {
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- return -EOPNOTSUPP;
-
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- }
cork->length += size;
while (size > 0) {
@@ -1603,7 +1525,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
int err;
int oif;
- if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
+ if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
return;
ipc.addr = daddr;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ecc4b4a2413e..e558e4f9597b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -80,7 +80,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
}
-static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg,
+ struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
@@ -88,7 +89,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
if (IPCB(skb)->opt.optlen == 0)
return;
- if (ip_options_echo(opt, skb)) {
+ if (ip_options_echo(net, opt, skb)) {
msg->msg_flags |= MSG_CTRUNC;
return;
}
@@ -204,7 +205,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
}
if (flags & IP_CMSG_RETOPTS) {
- ip_cmsg_recv_retopts(msg, skb);
+ ip_cmsg_recv_retopts(sock_net(sk), msg, skb);
flags &= ~IP_CMSG_RETOPTS;
if (!flags)
@@ -1206,6 +1207,7 @@ e_inval:
void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
ipv6_sk_rxinfo(sk);
@@ -1219,7 +1221,7 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* (e.g., process binds socket to eth0 for Tx which is
* redirected to loopback in the rtable/dst).
*/
- if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave)
pktinfo->ipi_ifindex = inet_iif(skb);
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
@@ -1227,14 +1229,7 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- /* We need to keep the dst for __ip_options_echo()
- * We could restrict the test to opt.ts_needtime || opt.srr,
- * but the following is good enough as IP options are not often used.
- */
- if (unlikely(IPCB(skb)->opt.optlen))
- skb_dst_force(skb);
- else
- skb_dst_drop(skb);
+ skb_dst_drop(skb);
}
int ip_setsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 0192c255e508..5ed63d250950 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -584,33 +584,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
-static bool is_vti_tunnel(const struct net_device *dev)
-{
- return dev->netdev_ops == &vti_netdev_ops;
-}
-
-static int vti_device_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct ip_tunnel *tunnel = netdev_priv(dev);
-
- if (!is_vti_tunnel(dev))
- return NOTIFY_DONE;
-
- switch (event) {
- case NETDEV_DOWN:
- if (!net_eq(tunnel->net, dev_net(dev)))
- xfrm_garbage_collect(tunnel->net);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block vti_notifier_block __read_mostly = {
- .notifier_call = vti_device_event,
-};
-
static int __init vti_init(void)
{
const char *msg;
@@ -618,8 +591,6 @@ static int __init vti_init(void)
pr_info("IPv4 over IPsec tunneling driver\n");
- register_netdevice_notifier(&vti_notifier_block);
-
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
@@ -652,7 +623,6 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
- unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
}
@@ -664,7 +634,6 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
- unregister_netdevice_notifier(&vti_notifier_block);
}
module_init(vti_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 06863ea3fc5b..c9b3e6e069ae 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3114,14 +3114,14 @@ int __init ip_mr_init(void)
}
#endif
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
- ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL);
+ ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
- ipmr_rtm_route, NULL, NULL);
+ ipmr_rtm_route, NULL, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
- ipmr_rtm_route, NULL, NULL);
+ ipmr_rtm_route, NULL, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
- NULL, ipmr_rtm_dumplink, NULL);
+ NULL, ipmr_rtm_dumplink, 0);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 43eb6567b3a0..b6d3fe03feb3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
- SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
- SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
- SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
- SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
- SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
- SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
- SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
@@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
- SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
- SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index b0bb5d0a30bd..33b70bfd1122 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -122,7 +122,8 @@ void raw_unhash_sk(struct sock *sk)
EXPORT_SYMBOL_GPL(raw_unhash_sk);
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
- unsigned short num, __be32 raddr, __be32 laddr, int dif)
+ unsigned short num, __be32 raddr, __be32 laddr,
+ int dif, int sdif)
{
sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
@@ -130,7 +131,8 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -171,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
*/
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
+ int sdif = inet_sdif(skb);
struct sock *sk;
struct hlist_head *head;
int delivered = 0;
@@ -184,13 +187,13 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
net = dev_net(skb->dev);
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ skb->dev->ifindex, sdif);
while (sk) {
delivered = 1;
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
- skb->dev->ifindex)) {
+ skb->dev->ifindex, sdif)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
@@ -199,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
}
sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ skb->dev->ifindex, sdif);
}
out:
read_unlock(&raw_v4_hashinfo.lock);
@@ -297,12 +300,15 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_lock(&raw_v4_hashinfo.lock);
raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
if (raw_sk) {
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+
iph = (const struct iphdr *)skb->data;
net = dev_net(skb->dev);
while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
iph->daddr, iph->saddr,
- skb->dev->ifindex)) != NULL) {
+ dif, sdif)) != NULL) {
raw_err(raw_sk, skb, info);
raw_sk = sk_next(raw_sk);
iph = (const struct iphdr *)skb->data;
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index e1a51ca68d23..c200065ef9a5 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -46,13 +46,13 @@ static struct sock *raw_lookup(struct net *net, struct sock *from,
sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
r->id.idiag_dst[0],
r->id.idiag_src[0],
- r->id.idiag_if);
+ r->id.idiag_if, 0);
#if IS_ENABLED(CONFIG_IPV6)
else
sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
(const struct in6_addr *)r->id.idiag_src,
(const struct in6_addr *)r->id.idiag_dst,
- r->id.idiag_if);
+ r->id.idiag_if, 0);
#endif
return sk;
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7effa62beed3..d400c0543106 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2236,7 +2236,7 @@ add:
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->rt_iif = orig_oif ? : 0;
+ rth->rt_iif = orig_oif;
if (res->table)
rth->rt_table_id = res->table->tb_id;
@@ -2439,6 +2439,12 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
/* L3 master device is the loopback for that domain */
dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
net->loopback_dev;
+
+ /* make sure orig_oif points to fib result device even
+ * though packet rx/tx happens over loopback or l3mdev
+ */
+ orig_oif = FIB_RES_OIF(*res);
+
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
@@ -3068,7 +3074,8 @@ int __init ip_rt_init(void)
xfrm_init();
xfrm4_init();
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 03ad8778c395..b1bb1b3a1082 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
- ireq->opt = tcp_v4_save_options(skb);
+ ireq->opt = tcp_v4_save_options(sock_net(sk), skb);
if (security_inet_conn_request(sk, skb, req)) {
reqsk_free(req);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9bf809726066..0d3c038d7b04 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+/* obsolete */
+static int sysctl_tcp_low_latency __read_mostly;
+
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71ce33decd97..71b25567e787 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
return period;
}
+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+ u32 rate = READ_ONCE(tp->rate_delivered);
+ u32 intv = READ_ONCE(tp->rate_interval_us);
+ u64 rate64 = 0;
+
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ }
+ return rate64;
+}
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -400,7 +413,6 @@ void tcp_init_sock(struct sock *sk)
tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
@@ -1034,23 +1046,29 @@ out_err:
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
- ssize_t res;
-
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!sk_check_csum_caps(sk))
return sock_no_sendpage(sk->sk_socket, page, offset, size,
flags);
- lock_sock(sk);
-
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
- res = do_tcp_sendpages(sk, page, offset, size, flags);
+ return do_tcp_sendpages(sk, page, offset, size, flags);
+}
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendpage_locked(sk, page, offset, size, flags);
release_sock(sk);
- return res;
+
+ return ret;
}
EXPORT_SYMBOL(tcp_sendpage);
@@ -1144,9 +1162,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return err;
}
-int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
@@ -1155,9 +1174,27 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
bool sg;
long timeo;
- lock_sock(sk);
-
flags = msg->msg_flags;
+
+ if (flags & MSG_ZEROCOPY && size) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+ uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+
+ /* skb may be freed in main loop, keep extra ref on uarg */
+ sock_zerocopy_get(uarg);
+ if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+ uarg->zerocopy = 0;
+ }
+
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
@@ -1281,7 +1318,7 @@ new_segment:
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1319,6 +1356,13 @@ new_segment:
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
+ } else {
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ if (err == -EMSGSIZE || err == -EEXIST)
+ goto new_segment;
+ if (err < 0)
+ goto do_error;
+ copy = err;
}
if (!copied)
@@ -1365,7 +1409,7 @@ out:
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- release_sock(sk);
+ sock_zerocopy_put(uarg);
return copied + copied_syn;
do_fault:
@@ -1382,6 +1426,7 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
+ sock_zerocopy_put_abort(uarg);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -1389,9 +1434,19 @@ out_err:
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
- release_sock(sk);
return err;
}
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendmsg_locked(sk, msg, size);
+ release_sock(sk);
+
+ return ret;
+}
EXPORT_SYMBOL(tcp_sendmsg);
/*
@@ -1525,20 +1580,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
-static void tcp_prequeue_process(struct sock *sk)
-{
- struct sk_buff *skb;
- struct tcp_sock *tp = tcp_sk(sk);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb);
-
- /* Clear memory counter. */
- tp->ucopy.memory = 0;
-}
-
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1671,7 +1712,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int err;
int target; /* Read at least this many bytes */
long timeo;
- struct task_struct *user_recv = NULL;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
@@ -1806,51 +1846,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_cleanup_rbuf(sk, copied);
- if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
- /* Install new reader */
- if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
- user_recv = current;
- tp->ucopy.task = user_recv;
- tp->ucopy.msg = msg;
- }
-
- tp->ucopy.len = len;
-
- WARN_ON(tp->copied_seq != tp->rcv_nxt &&
- !(flags & (MSG_PEEK | MSG_TRUNC)));
-
- /* Ugly... If prequeue is not empty, we have to
- * process it before releasing socket, otherwise
- * order will be broken at second iteration.
- * More elegant solution is required!!!
- *
- * Look: we have the following (pseudo)queues:
- *
- * 1. packets in flight
- * 2. backlog
- * 3. prequeue
- * 4. receive_queue
- *
- * Each queue can be processed only if the next ones
- * are empty. At this point we have empty receive_queue.
- * But prequeue _can_ be not empty after 2nd iteration,
- * when we jumped to start of loop because backlog
- * processing added something to receive_queue.
- * We cannot release_sock(), because backlog contains
- * packets arrived _after_ prequeued ones.
- *
- * Shortly, algorithm is clear --- to process all
- * the queues in order. We could make it more directly,
- * requeueing packets from backlog to prequeue, if
- * is not empty. It is more elegant, but eats cycles,
- * unfortunately.
- */
- if (!skb_queue_empty(&tp->ucopy.prequeue))
- goto do_prequeue;
-
- /* __ Set realtime policy in scheduler __ */
- }
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1859,31 +1854,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
sk_wait_data(sk, &timeo, last);
}
- if (user_recv) {
- int chunk;
-
- /* __ Restore normal policy in scheduler __ */
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
- len -= chunk;
- copied += chunk;
- }
-
- if (tp->rcv_nxt == tp->copied_seq &&
- !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
- tcp_prequeue_process(sk);
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
- }
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1934,10 +1904,8 @@ do_prequeue:
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
tp->urg_data = 0;
- tcp_fast_path_check(sk);
- }
if (used + offset < skb->len)
continue;
@@ -1955,25 +1923,6 @@ skip_copy:
break;
} while (len > 0);
- if (user_recv) {
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- int chunk;
-
- tp->ucopy.len = copied > 0 ? len : 0;
-
- tcp_prequeue_process(sk);
-
- if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
-
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- }
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
@@ -2823,7 +2772,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now, intv;
+ u32 now;
u64 rate64;
bool slow;
u32 rate;
@@ -2922,13 +2871,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
- rate = READ_ONCE(tp->rate_delivered);
- intv = READ_ONCE(tp->rate_interval_us);
- if (rate && intv) {
- rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
- do_div(rate64, intv);
+ rate64 = tcp_compute_delivery_rate(tp);
+ if (rate64)
info->tcpi_delivery_rate = rate64;
- }
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2938,8 +2883,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ u64 rate64;
+ u32 rate;
- stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
+ 3 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -2954,6 +2903,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->data_segs_out, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
tp->total_retrans, TCP_NLA_PAD);
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+ rate64 = tcp_compute_delivery_rate(tp);
+ nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+ nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
return stats;
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 609965f0e298..fc3614377413 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -49,7 +49,6 @@ MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wma
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 epoch_start; /* beginning of an epoch */
@@ -72,7 +71,6 @@ static void bictcp_init(struct sock *sk)
struct bictcp *ca = inet_csk_ca(sk);
bictcp_reset(ca);
- ca->loss_cwnd = 0;
if (initial_ssthresh)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
@@ -172,22 +170,12 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
else
ca->last_max_cwnd = tp->snd_cwnd;
- ca->loss_cwnd = tp->snd_cwnd;
-
if (tp->snd_cwnd <= low_window)
return max(tp->snd_cwnd >> 1U, 2U);
else
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct bictcp *ca = inet_csk_ca(sk);
-
- return max(tp->snd_cwnd, ca->loss_cwnd);
-}
-
static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss)
@@ -214,7 +202,7 @@ static struct tcp_congestion_ops bictcp __read_mostly = {
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "bic",
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 50a0f3e51d5b..66ac69f7bd19 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -85,7 +85,6 @@ struct cdg {
u8 state;
u8 delack;
u32 rtt_seq;
- u32 undo_cwnd;
u32 shadow_wnd;
u16 backoff_cnt;
u16 sample_cnt;
@@ -330,8 +329,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- ca->undo_cwnd = tp->snd_cwnd;
-
if (ca->state == CDG_BACKOFF)
return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
@@ -344,13 +341,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
return max(2U, tp->snd_cwnd >> 1);
}
-static u32 tcp_cdg_undo_cwnd(struct sock *sk)
-{
- struct cdg *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
-}
-
static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
@@ -403,7 +393,7 @@ struct tcp_congestion_ops tcp_cdg __read_mostly = {
.cong_avoid = tcp_cdg_cong_avoid,
.cwnd_event = tcp_cdg_cwnd_event,
.pkts_acked = tcp_cdg_acked,
- .undo_cwnd = tcp_cdg_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.ssthresh = tcp_cdg_ssthresh,
.release = tcp_cdg_release,
.init = tcp_cdg_init,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index fde983f6376b..c2b174469645 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -456,7 +456,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ return max(tp->snd_cwnd, tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 57ae5b5ae643..78bfadfcf342 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -83,7 +83,6 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 bic_origin_point;/* origin point of bic function */
@@ -142,7 +141,6 @@ static void bictcp_init(struct sock *sk)
struct bictcp *ca = inet_csk_ca(sk);
bictcp_reset(ca);
- ca->loss_cwnd = 0;
if (hystart)
bictcp_hystart_reset(sk);
@@ -366,18 +364,9 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
else
ca->last_max_cwnd = tp->snd_cwnd;
- ca->loss_cwnd = tp->snd_cwnd;
-
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct sock *sk)
-{
- struct bictcp *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss) {
@@ -470,7 +459,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6d9879e93648..d1c33c91eadc 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,7 +94,6 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
- u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -153,22 +152,14 @@ static u32 hstcp_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
-static u32 hstcp_cwnd_undo(struct sock *sk)
-{
- const struct hstcp *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
- .undo_cwnd = hstcp_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 3eb78cde6ff0..082d479462fa 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -66,7 +66,6 @@ static inline void htcp_reset(struct htcp *ca)
static u32 htcp_cwnd_undo(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
if (ca->undo_last_cong) {
@@ -76,7 +75,7 @@ static u32 htcp_cwnd_undo(struct sock *sk)
ca->undo_last_cong = 0;
}
- return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
+ return tcp_reno_undo_cwnd(sk);
}
static inline void measure_rtt(struct sock *sk, u32 srtt)
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 60352ff4f5a8..7c843578f233 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,7 +48,6 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
u32 beta; /* Muliplicative decrease */
- u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -297,18 +296,10 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
-static u32 tcp_illinois_cwnd_undo(struct sock *sk)
-{
- const struct illinois *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -336,7 +327,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
- .undo_cwnd = tcp_illinois_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53de1424c13c..d73903fe8c83 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -103,7 +103,6 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
@@ -1952,6 +1951,7 @@ void tcp_enter_loss(struct sock *sk)
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
@@ -3372,12 +3372,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
- /* Note, it is the only place, where
- * fast path is recovered for sending TCP.
- */
- tp->pred_flags = 0;
- tcp_fast_path_check(sk);
-
if (tcp_send_head(sk))
tcp_slow_start_after_idle_check(sk);
@@ -3559,6 +3553,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ack_ev_flags = 0;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
@@ -3599,42 +3594,26 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
- /* Window is constant, pure forward advance.
- * No more checks are required.
- * Note, we use the fact that SND.UNA>=SND.WL2.
- */
- tcp_update_wl(tp, ack_seq);
- tcp_snd_una_update(tp, ack);
- flag |= FLAG_WIN_UPDATE;
-
- tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
- } else {
- u32 ack_ev_flags = CA_ACK_SLOWPATH;
-
- if (ack_seq != TCP_SKB_CB(skb)->end_seq)
- flag |= FLAG_DATA;
- else
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
- flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
- if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_state);
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
- flag |= FLAG_ECE;
- ack_ev_flags |= CA_ACK_ECE;
- }
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ flag |= FLAG_ECE;
+ ack_ev_flags = CA_ACK_ECE;
+ }
- if (flag & FLAG_WIN_UPDATE)
- ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_in_ack_event(sk, ack_ev_flags);
- }
+ tcp_in_ack_event(sk, ack_ev_flags);
/* We passed data and got it acked, remove any soft error
* log. Something worked...
@@ -4402,8 +4381,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
return;
}
- /* Disable header prediction. */
- tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4592,8 +4569,8 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool fragstolen = false;
- int eaten = -1;
+ bool fragstolen;
+ int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
@@ -4615,32 +4592,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto out_of_window;
/* Ok. In sequence. In window. */
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
- sock_owned_by_user(sk) && !tp->urg_data) {
- int chunk = min_t(unsigned int, skb->len,
- tp->ucopy.len);
-
- __set_current_state(TASK_RUNNING);
-
- if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- eaten = (chunk == skb->len);
- tcp_rcv_space_adjust(sk);
- }
- }
-
- if (eaten <= 0) {
queue_and_out:
- if (eaten < 0) {
- if (skb_queue_len(&sk->sk_receive_queue) == 0)
- sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
- }
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- }
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
@@ -4660,8 +4618,6 @@ queue_and_out:
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
- tcp_fast_path_check(sk);
-
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
@@ -4987,7 +4943,6 @@ static int tcp_prune_queue(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
- tp->pred_flags = 0;
return -1;
}
@@ -5159,9 +5114,6 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
-
- /* Disable header prediction. */
- tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
@@ -5190,26 +5142,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
}
}
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int err;
-
- if (skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
- else
- err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
-
- if (!err) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
- }
-
- return err;
-}
-
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
@@ -5340,201 +5272,29 @@ discard:
/*
* TCP receive function for the ESTABLISHED state.
- *
- * It is split into a fast path and a slow path. The fast path is
- * disabled when:
- * - A zero window was announced from us - zero window probing
- * is only handled properly in the slow path.
- * - Out of order segments arrived.
- * - Urgent data is expected.
- * - There is no buffer space left
- * - Unexpected TCP flags/window values/header lengths are received
- * (detected by checking the TCP header against pred_flags)
- * - Data is sent in both directions. Fast path only supports pure senders
- * or pure receivers (this means either the sequence number or the ack
- * value must stay constant)
- * - Unexpected TCP option.
- *
- * When these conditions are not satisfied it drops into a standard
- * receive procedure patterned after RFC793 to handle all cases.
- * The first three cases are guaranteed by proper pred_flags setting,
- * the rest is checked inline. Fast processing is turned on in
- * tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
+ unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
- /*
- * Header prediction.
- * The code loosely follows the one in the famous
- * "30 instruction TCP receive" Van Jacobson mail.
- *
- * Van's trick is to deposit buffers into socket queue
- * on a device interrupt, to call tcp_recv function
- * on the receive process context and checksum and copy
- * the buffer to user space. smart...
- *
- * Our current scheme is not silly either but we take the
- * extra cost of the net_bh soft interrupt processing...
- * We do checksum and copy also but from device to kernel.
- */
tp->rx_opt.saw_tstamp = 0;
- /* pred_flags is 0xS?10 << 16 + snd_wnd
- * if header_prediction is to be made
- * 'S' will always be tp->tcp_header_len >> 2
- * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
- * turn it off (when there are holes in the receive
- * space for instance)
- * PSH flag is ignored.
- */
-
- if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
- !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
- int tcp_header_len = tp->tcp_header_len;
-
- /* Timestamp header prediction: tcp_header_len
- * is automatically equal to th->doff*4 due to pred_flags
- * match.
- */
-
- /* Check timestamp */
- if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
- /* No? Slow path! */
- if (!tcp_parse_aligned_timestamp(tp, th))
- goto slow_path;
-
- /* If PAWS failed, check it more carefully in slow path */
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
- goto slow_path;
-
- /* DO NOT update ts_recent here, if checksum fails
- * and timestamp was corrupted part, it will result
- * in a hung connection since we will drop all
- * future packets due to the PAWS test.
- */
- }
-
- if (len <= tcp_header_len) {
- /* Bulk data transfer: sender */
- if (len == tcp_header_len) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- /* We know that such packets are checksummed
- * on entry.
- */
- tcp_ack(sk, skb, 0);
- __kfree_skb(skb);
- tcp_data_snd_check(sk);
- return;
- } else { /* Header too small */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
- goto discard;
- }
- } else {
- int eaten = 0;
- bool fragstolen = false;
-
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len &&
- sock_owned_by_user(sk)) {
- __set_current_state(TASK_RUNNING);
-
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) +
- TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- __skb_pull(skb, tcp_header_len);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHPHITSTOUSER);
- eaten = 1;
- }
- }
- if (!eaten) {
- if (tcp_checksum_complete(skb))
- goto csum_error;
-
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
-
- /* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
- }
-
- tcp_event_data_recv(sk, skb);
-
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
- /* Well, only one small jumplet in fast path... */
- tcp_ack(sk, skb, FLAG_DATA);
- tcp_data_snd_check(sk);
- if (!inet_csk_ack_scheduled(sk))
- goto no_ack;
- }
-
- __tcp_ack_snd_check(sk, 0);
-no_ack:
- if (eaten)
- kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk);
- return;
- }
- }
-
-slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
- /*
- * Standard slow path.
- */
-
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
-step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
+ if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5587,12 +5347,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5721,7 +5475,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_ack(sk, skb, FLAG_SLOWPATH);
+ tcp_ack(sk, skb, 0);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
@@ -5957,8 +5711,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
/* step 5: check the ACK field */
- acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT |
+
+ acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
@@ -6026,7 +5780,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
- tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9252c7df809..5af8b809dfbc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -85,8 +85,6 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_low_latency __read_mostly;
-
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
@@ -385,7 +383,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
th->dest, iph->saddr, ntohs(th->source),
- inet_iif(icmp_skb));
+ inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
@@ -661,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb));
+ ntohs(th->source), inet_iif(skb),
+ tcp_v4_sdif(skb));
/* don't send rst if it can't find key */
if (!sk1)
goto out;
@@ -1269,7 +1268,7 @@ static void tcp_v4_init_req(struct request_sock *req,
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->opt = tcp_v4_save_options(skb);
+ ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb);
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@@ -1458,7 +1457,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_rx_dst = NULL;
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ tcp_rcv_established(sk, skb, tcp_hdr(skb));
return 0;
}
@@ -1525,7 +1524,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
- skb->skb_iif);
+ skb->skb_iif, inet_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
@@ -1541,61 +1540,6 @@ void tcp_v4_early_demux(struct sk_buff *skb)
}
}
-/* Packet is added to VJ-style prequeue for processing in process
- * context, if a reader task is waiting. Apparently, this exciting
- * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
- * failed somewhere. Latency? Burstiness? Well, at least now we will
- * see, why it failed. 8)8) --ANK
- *
- */
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (sysctl_tcp_low_latency || !tp->ucopy.task)
- return false;
-
- if (skb->len <= tcp_hdrlen(skb) &&
- skb_queue_len(&tp->ucopy.prequeue) == 0)
- return false;
-
- /* Before escaping RCU protected region, we need to take care of skb
- * dst. Prequeue is only enabled for established sockets.
- * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
- * Instead of doing full sk_rx_dst validity here, let's perform
- * an optimistic check.
- */
- if (likely(sk->sk_rx_dst))
- skb_dst_drop(skb);
- else
- skb_dst_force_safe(skb);
-
- __skb_queue_tail(&tp->ucopy.prequeue, skb);
- tp->ucopy.memory += skb->truesize;
- if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
- tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- struct sk_buff *skb1;
-
- BUG_ON(sock_owned_by_user(sk));
- __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
- skb_queue_len(&tp->ucopy.prequeue));
-
- while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb1);
-
- tp->ucopy.memory = 0;
- } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
- wake_up_interruptible_sync_poll(sk_sleep(sk),
- POLLIN | POLLRDNORM | POLLRDBAND);
- if (!inet_csk_ack_scheduled(sk))
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- (3 * tcp_rto_min(sk)) / 4,
- TCP_RTO_MAX);
- }
- return true;
-}
-EXPORT_SYMBOL(tcp_prequeue);
-
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
@@ -1645,6 +1589,7 @@ EXPORT_SYMBOL(tcp_filter);
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ int sdif = inet_sdif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1695,7 +1640,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest, &refcounted);
+ th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1770,8 +1715,7 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
+ ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
}
@@ -1824,7 +1768,8 @@ do_time_wait:
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
- inet_iif(skb));
+ inet_iif(skb),
+ sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
@@ -1936,9 +1881,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
}
#endif
- /* Clean prequeue, it must be empty really */
- __skb_queue_purge(&tp->ucopy.prequeue);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0ff83c1637d8..1537b87c657f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -436,8 +436,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_sock *newtp = tcp_sk(newsk);
/* Now setup tcp_sock */
- newtp->pred_flags = 0;
-
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->segs_in = 1;
@@ -445,7 +443,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
- tcp_prequeue_init(newtp);
INIT_LIST_HEAD(&newtp->tsq_node);
tcp_init_wl(newtp, treq->rcv_isn);
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 6d650ed3cb59..1ff73982e28c 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -86,7 +86,6 @@ struct tcpnv {
* < 0 => less than 1 packet/RTT */
u8 available8;
u16 available16;
- u32 loss_cwnd; /* cwnd at last loss */
u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */
nv_reset:1, /* whether to reset values */
nv_catchup:1; /* whether we are growing because
@@ -121,7 +120,6 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
ca->nv_reset = 0;
- ca->loss_cwnd = 0;
ca->nv_no_cong_cnt = 0;
ca->nv_rtt_cnt = 0;
ca->nv_last_rtt = 0;
@@ -177,19 +175,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcpnv_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- struct tcpnv *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
}
-static u32 tcpnv_undo_cwnd(struct sock *sk)
-{
- struct tcpnv *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static void tcpnv_state(struct sock *sk, u8 new_state)
{
struct tcpnv *ca = inet_csk_ca(sk);
@@ -446,7 +435,7 @@ static struct tcp_congestion_ops tcpnv __read_mostly = {
.ssthresh = tcpnv_recalc_ssthresh,
.cong_avoid = tcpnv_cong_avoid,
.set_state = tcpnv_state,
- .undo_cwnd = tcpnv_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = tcpnv_acked,
.get_info = tcpnv_get_info,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7661a68d498..3e0d19631534 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -295,9 +295,7 @@ static u16 tcp_select_window(struct sock *sk)
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
- /* If we advertise zero window, disable fast path. */
if (new_win == 0) {
- tp->pred_flags = 0;
if (old_win)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPTOZEROWINDOWADV);
@@ -2377,7 +2375,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
u32 timeout, rto_delta_us;
/* Don't do any loss probe on a Fast Open connection before 3WHS
@@ -2398,15 +2395,19 @@ bool tcp_schedule_loss_probe(struct sock *sk)
tcp_send_head(sk))
return false;
- /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ /* Probe timeout is 2*rtt. Add minimum RTO to account
* for delayed ack when there's one outstanding packet. If no RTT
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
- timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
- if (tp->packets_out == 1)
- timeout = max_t(u32, timeout,
- (rtt + (rtt >> 1) + TCP_DELACK_MAX));
- timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+ if (tp->srtt_us) {
+ timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+ if (tp->packets_out == 1)
+ timeout += TCP_RTO_MIN;
+ else
+ timeout += TCP_TIMEOUT_MIN;
+ } else {
+ timeout = TCP_TIMEOUT_INIT;
+ }
/* If the RTO formula yields an earlier time, then use that time. */
rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f6c50af24a64..697f4c67b2e3 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void)
* Note: arguments must match tcp_rcv_established()!
*/
static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
+ unsigned int len = skb->len;
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
@@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
BUG();
}
- p->length = skb->len;
+ p->length = len;
p->snd_nxt = tp->snd_nxt;
p->snd_una = tp->snd_una;
p->snd_cwnd = tp->snd_cwnd;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index fe9a493d0208..449cd914d58e 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -113,7 +113,7 @@ void tcp_rack_mark_lost(struct sock *sk)
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
- timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index f2123075ce6e..addc122f8818 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,10 +15,6 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-struct scalable {
- u32 loss_cwnd;
-};
-
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -36,23 +32,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- struct scalable *ca = inet_csk_ca(sk);
-
- ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
-static u32 tcp_scalable_cwnd_undo(struct sock *sk)
-{
- const struct scalable *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
- .undo_cwnd = tcp_scalable_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e906014890b6..655dd8d7f064 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk)
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
sk_mem_reclaim_partial(sk);
@@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk)
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- struct sk_buff *skb;
-
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb);
-
- tp->ucopy.memory = 0;
- }
-
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 76005d4b8dfc..6fcf482d611b 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,7 +30,6 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
- u32 loss_cwnd; /* cwnd when loss occured */
};
/* There are several situations when we must "re-start" Veno:
@@ -194,7 +193,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
- veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -203,17 +201,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
-static u32 tcp_veno_cwnd_undo(struct sock *sk)
-{
- const struct veno *veno = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
- .undo_cwnd = tcp_veno_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bec9cafbe3f9..e5de84310949 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -154,24 +154,6 @@ static inline void update_rtt_min(struct westwood *w)
}
/*
- * @westwood_fast_bw
- * It is called when we are in fast path. In particular it is called when
- * header prediction is successful. In such case in fact update is
- * straight forward and doesn't need any particular care.
- */
-static inline void westwood_fast_bw(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct westwood *w = inet_csk_ca(sk);
-
- westwood_update_window(sk);
-
- w->bk += tp->snd_una - w->snd_una;
- w->snd_una = tp->snd_una;
- update_rtt_min(w);
-}
-
-/*
* @westwood_acked_count
* This function evaluates cumul_ack for evaluating bk in case of
* delayed or partial acks.
@@ -223,17 +205,12 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
{
- if (ack_flags & CA_ACK_SLOWPATH) {
- struct westwood *w = inet_csk_ca(sk);
-
- westwood_update_window(sk);
- w->bk += westwood_acked_count(sk);
+ struct westwood *w = inet_csk_ca(sk);
- update_rtt_min(w);
- return;
- }
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
- westwood_fast_bw(sk);
+ update_rtt_min(w);
}
static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e6ff99c4bd3b..96e829b2e2fc 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,7 +37,6 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
- u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -220,22 +219,14 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
-static u32 tcp_yeah_cwnd_undo(struct sock *sk)
-{
- const struct yeah *yeah = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
- .undo_cwnd = tcp_yeah_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..cb633884e825 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -380,8 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif,
- bool exact_dif)
+ __be32 daddr, unsigned short hnum,
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -413,10 +413,15 @@ static int compute_score(struct sock *sk, struct net *net,
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
@@ -436,10 +441,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
- struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
@@ -449,7 +455,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -477,8 +483,8 @@ static struct sock *udp4_lib_lookup2(struct net *net,
* harder than this. -DaveM
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
- __be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable, struct sk_buff *skb)
+ __be16 sport, __be32 daddr, __be16 dport, int dif,
+ int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
@@ -496,7 +502,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
@@ -511,7 +517,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
return result;
@@ -521,7 +527,7 @@ begin:
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -554,7 +560,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable, skb);
+ inet_sdif(skb), udptable, skb);
}
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
@@ -576,7 +582,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -587,7 +593,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup);
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
@@ -597,9 +603,10 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport != rmt_port && inet->inet_dport) ||
(inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
return false;
- if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
return false;
return true;
}
@@ -628,8 +635,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable,
- NULL);
+ iph->saddr, uh->source, skb->dev->ifindex, 0,
+ udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
@@ -1176,7 +1183,11 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- if (likely(!skb->_skb_refdst))
+ /* all head states execept sp (dst, sk, nf) are always cleared by
+ * udp_rcv() and we need to preserve secpath, if present, to eventually
+ * process IP_CMSG_PASSSEC at recvmsg() time
+ */
+ if (likely(!skb_sec_path(skb)))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
@@ -1782,13 +1793,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id_once(sk, skb);
}
- /* At recvmsg() time we may access skb->dst or skb->sp depending on
- * the IP options and the cmsg flags, elsewhere can we clear all
- * pending head states while they are hot in the cache
- */
- if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb)))
- skb_release_head_state(skb);
-
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1956,6 +1960,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
unsigned int offset = offsetof(typeof(*sk), sk_node);
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
@@ -1970,7 +1975,7 @@ start_lookup:
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif, hnum))
continue;
if (!first) {
@@ -2160,7 +2165,7 @@ drop:
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(loc_port);
@@ -2174,7 +2179,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
result = NULL;
sk_for_each_rcu(sk, &hslot->head) {
if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
- rmt_port, rmt_addr, dif, hnum)) {
+ rmt_port, rmt_addr, dif, sdif, hnum)) {
if (result)
return NULL;
result = sk;
@@ -2191,7 +2196,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
@@ -2203,7 +2208,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif))
+ loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -2219,6 +2224,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
int ours;
/* validate the packet */
@@ -2244,10 +2250,11 @@ void udp_v4_early_demux(struct sk_buff *skb)
}
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr,
+ dif, sdif);
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr, dif, sdif);
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4515836d2a3a..d0390d844ac8 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -45,7 +45,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
sk = __udp6_lib_lookup(net,
@@ -53,7 +53,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#endif
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_dst[0], req->id.idiag_dport,
req->id.idiag_src[0], req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
@@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_dst[3], req->id.idiag_dport,
req->id.idiag_src[3], req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
else
sk = __udp6_lib_lookup(net,
@@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
}
#endif
else {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0932c85b42af..97658bfc1b58 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo, gso_partial;
+ bool remcsum, need_csum, offload_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -61,8 +61,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
-
need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
@@ -77,7 +75,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* outer one so strip the existing checksum feature flags and
* instead set the flag based on our outer checksum offload value.
*/
- if (remcsum || ufo) {
+ if (remcsum) {
features &= ~NETIF_F_CSUM_MASK;
if (!need_csum || offload_csum)
features |= NETIF_F_HW_CSUM;
@@ -189,66 +187,16 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
-static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
- netdev_features_t features)
+static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
- unsigned int mss;
- __wsum csum;
- struct udphdr *uh;
- struct iphdr *iph;
if (skb->encapsulation &&
(skb_shinfo(skb)->gso_type &
- (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)))
segs = skb_udp_tunnel_segment(skb, features, false);
- goto out;
- }
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto out;
-
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
-
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
- segs = NULL;
- goto out;
- }
-
- /* Do software UFO. Complete and fill in the UDP checksum as
- * HW cannot do checksum of UDP packets sent as multiple
- * IP fragments.
- */
-
- uh = udp_hdr(skb);
- iph = ip_hdr(skb);
-
- uh->check = 0;
- csum = skb_checksum(skb, 0, skb->len, 0);
- uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
-
- /* If there is no outer header we can fake a checksum offload
- * due to the fact that we have already done the checksum in
- * software prior to segmenting the frame.
- */
- if (!skb->encap_hdr_csum)
- features |= NETIF_F_HW_CSUM;
-
- /* Fragment the skb. IP headers of the fragments are updated in
- * inet_gso_segment()
- */
- segs = skb_segment(skb, features);
-out:
return segs;
}
@@ -382,7 +330,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
static const struct net_offload udpv4_offload = {
.callbacks = {
- .gso_segment = udp4_ufo_fragment,
+ .gso_segment = udp4_tunnel_segment,
.gro_receive = udp4_gro_receive,
.gro_complete = udp4_gro_complete,
},
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 58bd39fb14b4..6539ff15e9a3 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -82,7 +82,8 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ if (!dev->netdev_ops->ndo_udp_tunnel_add ||
+ !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
return;
ti.type = type;
@@ -93,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
}
EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ if (!dev->netdev_ops->ndo_udp_tunnel_del ||
+ !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
+
/* Notify netdevs that UDP port started listening */
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
{
@@ -109,6 +128,8 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
for_each_netdev_rcu(net, dev) {
if (!dev->netdev_ops->ndo_udp_tunnel_add)
continue;
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ continue;
dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
}
rcu_read_unlock();
@@ -131,6 +152,8 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
for_each_netdev_rcu(net, dev) {
if (!dev->netdev_ops->ndo_udp_tunnel_del)
continue;
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ continue;
dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
}
rcu_read_unlock();
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 71b4ecc195c7..4aefb149fe0a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -213,14 +213,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
fl4->flowi4_tos = iph->tos;
}
-static inline int xfrm4_garbage_collect(struct dst_ops *ops)
-{
- struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
-
- xfrm_garbage_collect_deferred(net);
- return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
-}
-
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
@@ -259,14 +251,13 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
- .gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = INT_MAX,
+ .gc_thresh = 32768,
};
static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {