aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig11
-rw-r--r--net/ipv4/Makefile7
-rw-r--r--net/ipv4/af_inet.c26
-rw-r--r--net/ipv4/ah4.c7
-rw-r--r--net/ipv4/arp.c8
-rw-r--r--net/ipv4/devinet.c13
-rw-r--r--net/ipv4/esp4.c7
-rw-r--r--net/ipv4/fib_frontend.c4
-rw-r--r--net/ipv4/fib_semantics.c3
-rw-r--r--net/ipv4/fib_trie.c2
-rw-r--r--net/ipv4/gre.c253
-rw-r--r--net/ipv4/gre_demux.c414
-rw-r--r--net/ipv4/gre_offload.c130
-rw-r--r--net/ipv4/icmp.c51
-rw-r--r--net/ipv4/igmp.c79
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/ip_gre.c258
-rw-r--r--net/ipv4/ip_input.c7
-rw-r--r--net/ipv4/ip_tunnel.c177
-rw-r--r--net/ipv4/ip_tunnel_core.c122
-rw-r--r--net/ipv4/ip_vti.c7
-rw-r--r--net/ipv4/ipcomp.c7
-rw-r--r--net/ipv4/ipip.c20
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/netfilter/Kconfig2
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c7
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/ping.c641
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c144
-rw-r--r--net/ipv4/sysctl_net_ipv4.c37
-rw-r--r--net/ipv4/tcp.c344
-rw-r--r--net/ipv4/tcp_input.c517
-rw-r--r--net/ipv4/tcp_ipv4.c78
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_offload.c332
-rw-r--r--net/ipv4/tcp_output.c46
-rw-r--r--net/ipv4/udp.c94
-rw-r--r--net/ipv4/udp_offload.c100
-rw-r--r--net/ipv4/xfrm4_tunnel.c2
42 files changed, 2259 insertions, 1721 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8603ca827104..37cf1a6ea3ad 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -9,10 +9,7 @@ config IP_MULTICAST
intend to participate in the MBONE, a high bandwidth network on top
of the Internet which carries audio and video broadcasts. More
information about the MBONE is on the WWW at
- <http://www.savetz.com/mbone/>. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. For most people, it's
- safe to say N.
+ <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
@@ -223,10 +220,8 @@ config IP_MROUTE
packets that have several destination addresses. It is needed on the
MBONE, a high bandwidth network on top of the Internet which carries
audio and video broadcasts. In order to do that, you would most
- likely run the program mrouted. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. If you haven't heard
- about it, you don't need it.
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
config IP_MROUTE_MULTIPLE_TABLES
bool "IP: multicast policy routing"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 089cb9f36387..4b81e91c80fe 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,10 +8,10 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- datagram.o raw.o udp.o udplite.o \
- arp.o icmp.o devinet.o af_inet.o igmp.o \
+ tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
- inet_fragment.o ping.o
+ inet_fragment.o ping.o ip_tunnel_core.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
@@ -19,6 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o gre_offload.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d01be2a3ae53..b4d0be2b7ce9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1295,6 +1295,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_GRE |
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_MPLS |
0)))
goto out;
@@ -1384,7 +1385,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
goto out_unlock;
id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
for (p = *head; p; p = p->next) {
@@ -1406,6 +1407,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
(iph->tos ^ iph2->tos) |
+ (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) |
((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
@@ -1557,15 +1559,6 @@ static const struct net_protocol tcp_protocol = {
.netns_ok = 1,
};
-static const struct net_offload tcp_offload = {
- .callbacks = {
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- },
-};
-
static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
@@ -1573,13 +1566,6 @@ static const struct net_protocol udp_protocol = {
.netns_ok = 1,
};
-static const struct net_offload udp_offload = {
- .callbacks = {
- .gso_send_check = udp4_ufo_send_check,
- .gso_segment = udp4_ufo_fragment,
- },
-};
-
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
@@ -1679,10 +1665,10 @@ static int __init ipv4_offload_init(void)
/*
* Add offloads
*/
- if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0)
+ if (udpv4_offload_init() < 0)
pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
- if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0)
- pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
dev_add_offload(&ip_packet_offload);
return 0;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 2e7f1948216f..717902669d2f 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -419,12 +419,9 @@ static void ah4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
- atomic_inc(&flow_cache_genid);
- rt_genid_bump(net);
-
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
- } else
+ else
ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 247ec1951c35..4429b013f269 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1234,13 +1234,19 @@ out:
static int arp_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
rt_cache_flush(dev_net(dev));
break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
+ break;
default:
break;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dfc39d4d48b7..34ca6d5a3a4b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -215,6 +215,7 @@ void in_dev_finish_destroy(struct in_device *idev)
WARN_ON(idev->ifa_list);
WARN_ON(idev->mc_list);
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
@@ -771,7 +772,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ci = nla_data(tb[IFA_CACHEINFO]);
if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
err = -EINVAL;
- goto errout;
+ goto errout_free;
}
*pvalid_lft = ci->ifa_valid;
*pprefered_lft = ci->ifa_prefered;
@@ -779,6 +780,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
return ifa;
+errout_free:
+ inet_free_ifa(ifa);
errout:
return ERR_PTR(err);
}
@@ -1333,7 +1336,7 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -1941,7 +1944,7 @@ static void inet_forward_change(struct net *net)
}
}
-static int devinet_conf_proc(ctl_table *ctl, int write,
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -1984,7 +1987,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
return ret;
}
-static int devinet_sysctl_forward(ctl_table *ctl, int write,
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -2027,7 +2030,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
return ret;
}
-static int ipv4_doint_and_flush(ctl_table *ctl, int write,
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 4cfe34d4cc96..ab3d814bc80a 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -502,12 +502,9 @@ static void esp4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
- atomic_inc(&flow_cache_genid);
- rt_genid_bump(net);
-
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
- } else
+ else
ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c7629a209f9d..b3f627ac4ed8 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -961,7 +961,7 @@ static void nl_fib_input(struct sk_buff *skb)
nlmsg_len(nlh) < sizeof(*frn))
return;
- skb = skb_clone(skb, GFP_KERNEL);
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
if (skb == NULL)
return;
nlh = nlmsg_hdr(skb);
@@ -1038,7 +1038,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev;
struct net *net = dev_net(dev);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8f6cb7a87cd6..d5dbca5ecf62 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh)
next = rcu_dereference_protected(fnhe->fnhe_next, 1);
- rt_fibinfo_free(&fnhe->fnhe_rth);
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
kfree(fnhe);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 49616fed9340..108a1e9c9eac 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2133,7 +2133,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
max--;
pointers = 0;
- for (i = 1; i <= max; i++)
+ for (i = 1; i < max; i++)
if (stat->nodesizes[i] != 0) {
seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
deleted file mode 100644
index 7856d1651d05..000000000000
--- a/net/ipv4/gre.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * GRE over IPv4 demultiplexer driver
- *
- * Authors: Dmitry Kozlov (xeb@mail.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <linux/if_tunnel.h>
-#include <linux/spinlock.h>
-#include <net/protocol.h>
-#include <net/gre.h>
-
-
-static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static DEFINE_SPINLOCK(gre_proto_lock);
-
-int gre_add_protocol(const struct gre_protocol *proto, u8 version)
-{
- if (version >= GREPROTO_MAX)
- goto err_out;
-
- spin_lock(&gre_proto_lock);
- if (gre_proto[version])
- goto err_out_unlock;
-
- RCU_INIT_POINTER(gre_proto[version], proto);
- spin_unlock(&gre_proto_lock);
- return 0;
-
-err_out_unlock:
- spin_unlock(&gre_proto_lock);
-err_out:
- return -1;
-}
-EXPORT_SYMBOL_GPL(gre_add_protocol);
-
-int gre_del_protocol(const struct gre_protocol *proto, u8 version)
-{
- if (version >= GREPROTO_MAX)
- goto err_out;
-
- spin_lock(&gre_proto_lock);
- if (rcu_dereference_protected(gre_proto[version],
- lockdep_is_held(&gre_proto_lock)) != proto)
- goto err_out_unlock;
- RCU_INIT_POINTER(gre_proto[version], NULL);
- spin_unlock(&gre_proto_lock);
- synchronize_rcu();
- return 0;
-
-err_out_unlock:
- spin_unlock(&gre_proto_lock);
-err_out:
- return -1;
-}
-EXPORT_SYMBOL_GPL(gre_del_protocol);
-
-static int gre_rcv(struct sk_buff *skb)
-{
- const struct gre_protocol *proto;
- u8 ver;
- int ret;
-
- if (!pskb_may_pull(skb, 12))
- goto drop;
-
- ver = skb->data[1]&0x7f;
- if (ver >= GREPROTO_MAX)
- goto drop;
-
- rcu_read_lock();
- proto = rcu_dereference(gre_proto[ver]);
- if (!proto || !proto->handler)
- goto drop_unlock;
- ret = proto->handler(skb);
- rcu_read_unlock();
- return ret;
-
-drop_unlock:
- rcu_read_unlock();
-drop:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static void gre_err(struct sk_buff *skb, u32 info)
-{
- const struct gre_protocol *proto;
- const struct iphdr *iph = (const struct iphdr *)skb->data;
- u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
-
- if (ver >= GREPROTO_MAX)
- return;
-
- rcu_read_lock();
- proto = rcu_dereference(gre_proto[ver]);
- if (proto && proto->err_handler)
- proto->err_handler(skb, info);
- rcu_read_unlock();
-}
-
-static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- netdev_features_t enc_features;
- int ghl = GRE_HEADER_SECTION;
- struct gre_base_hdr *greh;
- int mac_len = skb->mac_len;
- __be16 protocol = skb->protocol;
- int tnl_hlen;
- bool csum;
-
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE)))
- goto out;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
- goto out;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
-
- if (greh->flags & GRE_KEY)
- ghl += GRE_HEADER_SECTION;
- if (greh->flags & GRE_SEQ)
- ghl += GRE_HEADER_SECTION;
- if (greh->flags & GRE_CSUM) {
- ghl += GRE_HEADER_SECTION;
- csum = true;
- } else
- csum = false;
-
- /* setup inner skb. */
- skb->protocol = greh->protocol;
- skb->encapsulation = 0;
-
- if (unlikely(!pskb_may_pull(skb, ghl)))
- goto out;
- __skb_pull(skb, ghl);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb_inner_network_offset(skb));
- skb->mac_len = skb_inner_network_offset(skb);
-
- /* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
- segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs))
- goto out;
-
- skb = segs;
- tnl_hlen = skb_tnl_header_len(skb);
- do {
- __skb_push(skb, ghl);
- if (csum) {
- __be32 *pcsum;
-
- if (skb_has_shared_frag(skb)) {
- int err;
-
- err = __skb_linearize(skb);
- if (err) {
- kfree_skb_list(segs);
- segs = ERR_PTR(err);
- goto out;
- }
- }
-
- greh = (struct gre_base_hdr *)(skb->data);
- pcsum = (__be32 *)(greh + 1);
- *pcsum = 0;
- *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
- }
- __skb_push(skb, tnl_hlen - ghl);
-
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, mac_len);
- skb->mac_len = mac_len;
- skb->protocol = protocol;
- } while ((skb = skb->next));
-out:
- return segs;
-}
-
-static int gre_gso_send_check(struct sk_buff *skb)
-{
- if (!skb->encapsulation)
- return -EINVAL;
- return 0;
-}
-
-static const struct net_protocol net_gre_protocol = {
- .handler = gre_rcv,
- .err_handler = gre_err,
- .netns_ok = 1,
-};
-
-static const struct net_offload gre_offload = {
- .callbacks = {
- .gso_send_check = gre_gso_send_check,
- .gso_segment = gre_gso_segment,
- },
-};
-
-static int __init gre_init(void)
-{
- pr_info("GRE over IPv4 demultiplexor driver\n");
-
- if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
- pr_err("can't add protocol\n");
- return -EAGAIN;
- }
-
- if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
- pr_err("can't add protocol offload\n");
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
- return -EAGAIN;
- }
-
- return 0;
-}
-
-static void __exit gre_exit(void)
-{
- inet_del_offload(&gre_offload, IPPROTO_GRE);
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-}
-
-module_init(gre_init);
-module_exit(gre_exit);
-
-MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
-MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
-MODULE_LICENSE("GPL");
-
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 000000000000..736c9fc3ef93
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,414 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
+
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(gre_build_header);
+
+struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gre_handle_offloads);
+
+static __sum16 check_checksum(struct sk_buff *skb)
+{
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ csum = csum_fold(skb->csum);
+
+ if (!csum)
+ break;
+ /* Fall through. */
+
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ break;
+ }
+
+ return csum;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ unsigned int ip_hlen = ip_hdrlen(skb);
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (check_checksum(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
+
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
+
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static int gre_cisco_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ int i;
+ bool csum_err = false;
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+ int ret;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+ ret = proto->handler(skb, &tpi);
+ if (ret == PACKET_RCVD) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void gre_cisco_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int i;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+
+ if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
+ goto out;
+
+ }
+out:
+ rcu_read_unlock();
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+
+ if (ver >= GREPROTO_MAX)
+ return;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_cisco_rcv,
+ .err_handler = gre_cisco_err,
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *newp)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[newp->priority];
+
+ return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_register);
+
+int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[del_proto->priority];
+ int ret;
+
+ ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
+
+ if (ret)
+ return ret;
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_unregister);
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ goto err;
+ }
+
+ if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+ pr_info("%s: can't add ipgre handler\n", __func__);
+ goto err_gre;
+ }
+
+ if (gre_offload_init()) {
+ pr_err("can't add protocol offload\n");
+ goto err_gso;
+ }
+
+ return 0;
+err_gso:
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+err_gre:
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+err:
+ return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+ gre_offload_exit();
+
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 000000000000..55e6bfb3a289
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,130 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ if (!skb->encapsulation)
+ return -EINVAL;
+ return 0;
+}
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl = GRE_HEADER_SECTION;
+ struct gre_base_hdr *greh;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ if (greh->flags & GRE_KEY)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_SEQ)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_CSUM) {
+ ghl += GRE_HEADER_SECTION;
+ csum = true;
+ } else
+ csum = false;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ greh = (struct gre_base_hdr *)(skb->data);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ },
+};
+
+int __init gre_offload_init(void)
+{
+ return inet_add_offload(&gre_offload, IPPROTO_GRE);
+}
+
+void __exit gre_offload_exit(void)
+{
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
+}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 76e10b47e053..5f7d11a45871 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -482,7 +482,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
+ struct icmp_bxm *icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
struct flowi4 fl4;
@@ -503,7 +503,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph = ip_hdr(skb_in);
if ((u8 *)iph < skb_in->head ||
- (skb_in->network_header + sizeof(*iph)) > skb_in->tail)
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
goto out;
/*
@@ -557,9 +558,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
+ icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+ if (!icmp_param)
+ return;
+
sk = icmp_xmit_lock(net);
if (sk == NULL)
- return;
+ goto out_free;
/*
* Construct source address and options.
@@ -585,7 +590,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
- if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -593,19 +598,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts.opt;
+ ipc.opt = &icmp_param->replyopts.opt;
ipc.tx_flags = 0;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
- type, code, &icmp_param);
+ type, code, icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -617,19 +622,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param.data_len = skb_in->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = skb_in->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
- icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
+out_free:
+ kfree(icmp_param);
out:;
}
EXPORT_SYMBOL(icmp_send);
@@ -657,7 +664,8 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
}
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
*/
static void icmp_unreach(struct sk_buff *skb)
@@ -939,7 +947,8 @@ error:
void icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
- struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+ int offset = iph->ihl<<2;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
int type = icmp_hdr(skb)->type;
int code = icmp_hdr(skb)->code;
struct net *net = dev_net(skb->dev);
@@ -949,7 +958,7 @@ void icmp_err(struct sk_buff *skb, u32 info)
* triggered by ICMP_ECHOREPLY which sent from kernel.
*/
if (icmph->type != ICMP_ECHOREPLY) {
- ping_err(skb, info);
+ ping_err(skb, offset, info);
return;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d8c232794bcb..cd71190d2962 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -363,7 +363,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
static int igmpv3_sendpack(struct sk_buff *skb)
{
struct igmphdr *pig = igmp_hdr(skb);
- const int igmplen = skb->tail - skb->transport_header;
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
@@ -1217,6 +1217,57 @@ static void igmp_group_added(struct ip_mc_list *im)
* Multicast list managers
*/
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
/*
* A socket has joined a multicast group on device dev.
@@ -1258,6 +1309,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
in_dev->mc_count++;
rcu_assign_pointer(in_dev->mc_list, im);
+ ip_mc_hash_add(in_dev, im);
+
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, im->multiaddr);
#endif
@@ -1314,6 +1367,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
ip = &i->next_rcu) {
if (i->multiaddr == addr) {
if (--i->users == 0) {
+ ip_mc_hash_remove(in_dev, i);
*ip = i->next_rcu;
in_dev->mc_count--;
igmp_group_dropped(i);
@@ -1381,13 +1435,9 @@ void ip_mc_init_dev(struct in_device *in_dev)
{
ASSERT_RTNL();
- in_dev->mc_tomb = NULL;
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_gq_running = 0;
setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
(unsigned long)in_dev);
- in_dev->mr_ifc_count = 0;
- in_dev->mc_count = 0;
setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
(unsigned long)in_dev);
in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
@@ -2321,12 +2371,25 @@ void ip_mc_drop_socket(struct sock *sk)
int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
{
struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
struct ip_sf_list *psf;
int rv = 0;
- for_each_pmc_rcu(in_dev, im) {
- if (im->multiaddr == mc_addr)
- break;
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
}
if (im && proto == IPPROTO_IGMP) {
rv = 1;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index cec539458307..c5313a9c019b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -247,8 +247,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *qp;
-#ifdef CONFIG_SMP
-#endif
unsigned int hash;
read_lock(&f->lock); /* Protects against hash rebuild */
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6af375afeeef..7bd8983dbfcf 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -467,7 +467,7 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
- done =__sk_nulls_del_node_init_rcu(sk);
+ done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2a83591492dd..1f6eab66f7ce 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,103 +121,8 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static __sum16 check_checksum(struct sk_buff *skb)
-{
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
-
- if (!csum)
- break;
- /* Fall through. */
-
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- break;
- }
-
- return csum;
-}
-
-static int ip_gre_calc_hlen(__be16 o_flags)
-{
- int addend = 4;
-
- if (o_flags&TUNNEL_CSUM)
- addend += 4;
- if (o_flags&TUNNEL_KEY)
- addend += 4;
- if (o_flags&TUNNEL_SEQ)
- addend += 4;
- return addend;
-}
-
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err, int *hdr_len)
-{
- unsigned int ip_hlen = ip_hdrlen(skb);
- const struct gre_base_hdr *greh;
- __be32 *options;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- *hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, *hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
-
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (check_checksum(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else
- tpi->key = 0;
-
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else
- tpi->seq = 0;
-
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- *hdr_len += 4;
- if (!pskb_may_pull(skb, *hdr_len))
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-static void ipgre_err(struct sk_buff *skb, u32 info)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -239,26 +144,18 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
- struct tnl_ptk_info tpi;
- int hdr_len;
- bool csum_err = false;
-
- if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return PACKET_RCVD;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return PACKET_RCVD;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -269,138 +166,61 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return PACKET_RCVD;
break;
case ICMP_REDIRECT:
break;
}
- if (tpi.proto == htons(ETH_P_TEB))
+ if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
else
itn = net_generic(net, ipgre_net_id);
iph = (const struct iphdr *)skb->data;
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
- iph->daddr, iph->saddr, tpi.key);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
if (t == NULL)
- return;
+ return PACKET_REJECT;
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, IPPROTO_GRE, 0);
- return;
- }
- if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- IPPROTO_GRE, 0);
- return;
- }
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return PACKET_RCVD;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return PACKET_RCVD;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+ return PACKET_RCVD;
}
-static int ipgre_rcv(struct sk_buff *skb)
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
- struct tnl_ptk_info tpi;
- int hdr_len;
- bool csum_err = false;
-
- if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
- goto drop;
- if (tpi.proto == htons(ETH_P_TEB))
+ if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
else
itn = net_generic(net, ipgre_net_id);
iph = ip_hdr(skb);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
- iph->saddr, iph->daddr, tpi.key);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
- ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
- return 0;
- }
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-drop:
- kfree_skb(skb);
- return 0;
-}
-
-static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
-{
- int err;
-
- if (skb_is_gso(skb)) {
- err = skb_unclone(skb, GFP_ATOMIC);
- if (unlikely(err))
- goto error;
- skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
- return skb;
- } else if (skb->ip_summed == CHECKSUM_PARTIAL &&
- tunnel->parms.o_flags&TUNNEL_CSUM) {
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- } else if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->ip_summed = CHECKSUM_NONE;
-
- return skb;
-
-error:
- kfree_skb(skb);
- return ERR_PTR(err);
-}
-
-static struct sk_buff *gre_build_header(struct sk_buff *skb,
- const struct tnl_ptk_info *tpi,
- int hdr_len)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(tpi->flags);
- greh->protocol = tpi->proto;
-
- if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (tpi->flags&TUNNEL_SEQ) {
- *ptr = tpi->seq;
- ptr--;
- }
- if (tpi->flags&TUNNEL_KEY) {
- *ptr = tpi->key;
- ptr--;
- }
- if (tpi->flags&TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
- *(__sum16 *)ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
- }
+ ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ return PACKET_RCVD;
}
-
- return skb;
+ return PACKET_REJECT;
}
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
@@ -410,11 +230,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
struct ip_tunnel *tunnel = netdev_priv(dev);
struct tnl_ptk_info tpi;
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
-
tpi.flags = tunnel->parms.o_flags;
tpi.proto = proto;
tpi.key = tunnel->parms.o_key;
@@ -423,13 +238,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
- skb = gre_build_header(skb, &tpi, tunnel->hlen);
- if (unlikely(!skb)) {
- dev->stats.tx_dropped++;
- return;
- }
+ gre_build_header(skb, &tpi, tunnel->hlen);
- ip_tunnel_xmit(skb, dev, tnl_params);
+ ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
@@ -438,7 +249,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
- skb = handle_offloads(tunnel, skb);
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
@@ -477,7 +288,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- skb = handle_offloads(tunnel, skb);
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
@@ -503,10 +314,11 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
return -EFAULT;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
- return -EINVAL;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ return -EINVAL;
}
p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
@@ -708,9 +520,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
-static const struct gre_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
+static struct gre_cisco_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+ .priority = 0,
};
static int __net_init ipgre_init_net(struct net *net)
@@ -978,7 +791,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto pnet_tap_faied;
- err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ err = gre_cisco_register(&ipgre_protocol);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
@@ -997,7 +810,7 @@ static int __init ipgre_init(void)
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ gre_cisco_unregister(&ipgre_protocol);
add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied:
@@ -1009,8 +822,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
- pr_info("%s: can't remove protocol\n", __func__);
+ gre_cisco_unregister(&ipgre_protocol);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3da817b89e9b..15e3e683adec 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -190,10 +190,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
- __skb_pull(skb, ip_hdrlen(skb));
-
- /* Point into the IP datagram, just past the header. */
- skb_reset_transport_header(skb);
+ __skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
{
@@ -437,6 +434,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
}
+ skb->transport_header = skb->network_header + iph->ihl*4;
+
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 7fa8f08fa7ae..ca1cb2d5f6e2 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -304,6 +304,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
tunnel = netdev_priv(dev);
tunnel->parms = *parms;
+ tunnel->net = net;
err = register_netdevice(dev);
if (err)
@@ -408,13 +409,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct iphdr *iph = ip_hdr(skb);
int err;
- secpath_reset(skb);
-
- skb->protocol = tpi->proto;
-
- skb->mac_header = skb->network_header;
- __pskb_pull(skb, tunnel->hlen);
- skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
/* Looped back packet, drop it! */
@@ -442,23 +436,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
- /* Warning: All skb pointers will be invalidated! */
- if (tunnel->dev->type == ARPHRD_ETHER) {
- if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
-
- iph = ip_hdr(skb);
- skb->protocol = eth_type_trans(skb, tunnel->dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- }
-
- skb->pkt_type = PACKET_HOST;
- __skb_tunnel_rx(skb, tunnel->dev);
-
- skb_reset_network_header(skb);
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
@@ -477,6 +454,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
+ if (tunnel->net != dev_net(tunnel->dev))
+ skb_scrub_packet(skb);
+
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -486,24 +472,69 @@ drop:
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ struct rtable *rt, __be16 df)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int mtu;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr) - tunnel->hlen;
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!skb_is_gso(skb) &&
+ (df & htons(IP_DF)) && mtu < pkt_size) {
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ return -E2BIG;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < pkt_size) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -E2BIG;
+ }
+ }
+#endif
+ return 0;
+}
+
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
- const struct iphdr *tnl_params)
+ const struct iphdr *tnl_params, const u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *inner_iph;
- struct iphdr *iph;
struct flowi4 fl4;
u8 tos, ttl;
__be16 df;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
- int mtu;
+ int err;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
@@ -561,8 +592,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
- rt = ip_route_output_tunnel(dev_net(dev), &fl4,
- tunnel->parms.iph.protocol,
+ rt = ip_route_output_tunnel(tunnel->net, &fl4,
+ protocol,
dst, tnl_params->saddr,
tunnel->parms.o_key,
RT_TOS(tos),
@@ -571,58 +602,19 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dev->stats.tx_carrier_errors++;
goto tx_error;
}
- tdev = rt->dst.dev;
-
- if (tdev == dev) {
+ if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
- df = tnl_params->frag_off;
-
- if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len
- - sizeof(struct iphdr);
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
-
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- if (skb->protocol == htons(ETH_P_IP)) {
- df |= (inner_iph->frag_off&htons(IP_DF));
-
- if (!skb_is_gso(skb) &&
- (inner_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(inner_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+ ip_rt_put(rt);
+ goto tx_error;
}
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
-
- if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
- mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
- rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
- }
- }
- if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
- mtu < skb->len) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#endif
+ if (tunnel->net != dev_net(dev))
+ skb_scrub_packet(skb);
if (tunnel->err_count > 0) {
if (time_before(jiffies,
@@ -646,8 +638,12 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
- max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
- + rt->dst.header_len;
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP))
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len;
if (max_headroom > dev->needed_headroom) {
dev->needed_headroom = max_headroom;
if (skb_cow_head(skb, dev->needed_headroom)) {
@@ -657,27 +653,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
}
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /* Push down and install the IP header. */
- skb_push(skb, sizeof(struct iphdr));
- skb_reset_network_header(skb);
-
- iph = ip_hdr(skb);
- inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ err = iptunnel_xmit(dev_net(dev), rt, skb,
+ fl4.saddr, fl4.daddr, protocol,
+ ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
- iph->protocol = tnl_params->protocol;
- iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
- iph->daddr = fl4.daddr;
- iph->saddr = fl4.saddr;
- iph->ttl = ttl;
- tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
-
- iptunnel_xmit(skb, dev);
return;
#if IS_ENABLED(CONFIG_IPV6)
@@ -926,6 +906,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
if (ip_tunnel_find(itn, p, dev->type))
return -EEXIST;
+ nt->net = net;
nt->parms = *p;
err = register_netdevice(dev);
if (err)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 000000000000..7167b08977df
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+int iptunnel_xmit(struct net *net, struct rtable *rt,
+ struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df)
+{
+ int pkt_len = skb->len;
+ struct iphdr *iph;
+ int err;
+
+ nf_reset(skb);
+ secpath_reset(skb);
+ skb->rxhash = 0;
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ /* Push down and install the IP header. */
+ __skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = dst;
+ iph->saddr = src;
+ iph->ttl = ttl;
+ tunnel_ip_select_ident(skb,
+ (const struct iphdr *)skb_inner_network_header(skb),
+ &rt->dst);
+
+ err = ip_local_out(skb);
+ if (unlikely(net_xmit_eval(err)))
+ pkt_len = 0;
+ return pkt_len;
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+{
+ if (unlikely(!pskb_may_pull(skb, hdr_len)))
+ return -ENOMEM;
+
+ skb_pull_rcsum(skb, hdr_len);
+
+ if (inner_proto == htons(ETH_P_TEB)) {
+ struct ethhdr *eh = (struct ethhdr *)skb->data;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ return -ENOMEM;
+
+ if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ skb->protocol = eh->h_proto;
+ else
+ skb->protocol = htons(ETH_P_802_2);
+
+ } else {
+ skb->protocol = inner_proto;
+ }
+
+ nf_reset(skb);
+ secpath_reset(skb);
+ if (!skb->l4_rxhash)
+ skb->rxhash = 0;
+ skb_dst_drop(skb);
+ skb->vlan_tci = 0;
+ skb_set_queue_mapping(skb, 0);
+ skb->pkt_type = PACKET_HOST;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_pull_header);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c118f6b576bb..17cc0ffa8c0d 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -606,17 +606,10 @@ static int __net_init vti_fb_tunnel_init(struct net_device *dev)
struct iphdr *iph = &tunnel->parms.iph;
struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
iph->version = 4;
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
dev_hold(dev);
rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
return 0;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 59cb8c769056..826be4cb482a 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -47,12 +47,9 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
- atomic_inc(&flow_cache_genid);
- rt_genid_bump(net);
-
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
- } else
+ else
ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 77bfcce64fe5..51fc2a1dcdd3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -188,8 +188,12 @@ static int ipip_rcv(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct ip_tunnel *tunnel;
- const struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph;
+ if (iptunnel_pull_header(skb, 0, tpi.proto))
+ goto drop;
+
+ iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->saddr, iph->daddr, 0);
if (tunnel) {
@@ -222,7 +226,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
skb->encapsulation = 1;
}
- ip_tunnel_xmit(skb, dev, tiph);
+ ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
return NETDEV_TX_OK;
tx_error:
@@ -240,11 +244,13 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
return -EFAULT;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- return -EINVAL;
- if (p.i_key || p.o_key || p.i_flags || p.o_flags)
- return -EINVAL;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ return -EINVAL;
+ }
+
+ p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
if (p.iph.ttl)
p.iph.frag_off |= htons(IP_DF);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9d9610ae7855..132a09664704 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -980,7 +980,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Copy the IP header */
- skb->network_header = skb->tail;
+ skb_set_network_header(skb, skb->len);
skb_put(skb, ihl);
skb_copy_to_linear_data(skb, pkt->data, ihl);
ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
@@ -1609,7 +1609,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct mr_table *mrt;
struct vif_device *v;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index e7916c193932..4e9028017428 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -111,7 +111,7 @@ config IP_NF_TARGET_REJECT
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ULOG
- tristate "ULOG target support"
+ tristate "ULOG target support (obsolete)"
default m if NETFILTER_ADVANCED=n
---help---
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 5d5d4d1be9c2..30e4de940567 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -108,7 +108,7 @@ static int masq_device_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
- const struct net_device *dev = ptr;
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
if (event == NETDEV_DOWN) {
@@ -129,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this,
void *ptr)
{
struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
- return masq_device_event(this, event, dev);
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return masq_device_event(this, event, &info);
}
static struct notifier_block masq_dev_notifier = {
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 32b0e978c8e0..cbc22158af49 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -331,6 +331,12 @@ static int ulog_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_ulog_info *loginfo = par->targinfo;
+ if (!par->net->xt.ulog_warn_deprecated) {
+ pr_info("ULOG is deprecated and it will be removed soon, "
+ "use NFLOG instead\n");
+ par->net->xt.ulog_warn_deprecated = true;
+ }
+
if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
pr_debug("prefix not null-terminated\n");
return -EINVAL;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 567d84168bd2..0a2e0e3e95ba 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -223,7 +223,7 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
static int log_invalid_proto_min = 0;
static int log_invalid_proto_max = 255;
-static ctl_table ip_ct_sysctl_table[] = {
+static struct ctl_table ip_ct_sysctl_table[] = {
{
.procname = "ip_conntrack_max",
.maxlen = sizeof(int),
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 7d93d62cd5fd..746427c9e719 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -33,7 +33,6 @@
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
-#include <net/ipv6.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -46,8 +45,18 @@
#include <net/inet_common.h>
#include <net/checksum.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
-static struct ping_table ping_table;
+
+struct ping_table ping_table;
+struct pingv6_ops pingv6_ops;
+EXPORT_SYMBOL_GPL(pingv6_ops);
static u16 ping_port_rover;
@@ -58,6 +67,7 @@ static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int ma
pr_debug("hash(%d) = %d\n", num, res);
return res;
}
+EXPORT_SYMBOL_GPL(ping_hash);
static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
struct net *net, unsigned int num)
@@ -65,7 +75,7 @@ static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
}
-static int ping_v4_get_port(struct sock *sk, unsigned short ident)
+int ping_get_port(struct sock *sk, unsigned short ident)
{
struct hlist_nulls_node *node;
struct hlist_nulls_head *hlist;
@@ -103,6 +113,10 @@ next_port:
ping_portaddr_for_each_entry(sk2, node, hlist) {
isk2 = inet_sk(sk2);
+ /* BUG? Why is this reuse and not reuseaddr? ping.c
+ * doesn't turn off SO_REUSEADDR, and it doesn't expect
+ * that other ping processes can steal its packets.
+ */
if ((isk2->inet_num == ident) &&
(sk2 != sk) &&
(!sk2->sk_reuse || !sk->sk_reuse))
@@ -125,17 +139,18 @@ fail:
write_unlock_bh(&ping_table.lock);
return 1;
}
+EXPORT_SYMBOL_GPL(ping_get_port);
-static void ping_v4_hash(struct sock *sk)
+void ping_hash(struct sock *sk)
{
- pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+ pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
BUG(); /* "Please do not press this button again." */
}
-static void ping_v4_unhash(struct sock *sk)
+void ping_unhash(struct sock *sk)
{
struct inet_sock *isk = inet_sk(sk);
- pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
if (sk_hashed(sk)) {
write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
@@ -146,31 +161,61 @@ static void ping_v4_unhash(struct sock *sk)
write_unlock_bh(&ping_table.lock);
}
}
+EXPORT_SYMBOL_GPL(ping_unhash);
-static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr,
- u16 ident, int dif)
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
{
struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
struct sock *sk = NULL;
struct inet_sock *isk;
struct hlist_nulls_node *hnode;
+ int dif = skb->dev->ifindex;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+ (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+ (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+ }
- pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
- (int)ident, &daddr, dif);
read_lock_bh(&ping_table.lock);
ping_portaddr_for_each_entry(sk, hnode, hslot) {
isk = inet_sk(sk);
- pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk,
- (int)isk->inet_num, &isk->inet_rcv_saddr,
- sk->sk_bound_dev_if);
-
pr_debug("iterate\n");
if (isk->inet_num != ident)
continue;
- if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)
- continue;
+
+ if (skb->protocol == htons(ETH_P_IP) &&
+ sk->sk_family == AF_INET) {
+ pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+ (int) isk->inet_num, &isk->inet_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (isk->inet_rcv_saddr &&
+ isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+ continue;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6) &&
+ sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+ (int) isk->inet_num,
+ &inet6_sk(sk)->rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (!ipv6_addr_any(&np->rcv_saddr) &&
+ !ipv6_addr_equal(&np->rcv_saddr,
+ &ipv6_hdr(skb)->daddr))
+ continue;
+#endif
+ }
+
if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
continue;
@@ -200,7 +245,7 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
}
-static int ping_init_sock(struct sock *sk)
+int ping_init_sock(struct sock *sk)
{
struct net *net = sock_net(sk);
kgid_t group = current_egid();
@@ -225,8 +270,9 @@ static int ping_init_sock(struct sock *sk)
return -EACCES;
}
+EXPORT_SYMBOL_GPL(ping_init_sock);
-static void ping_close(struct sock *sk, long timeout)
+void ping_close(struct sock *sk, long timeout)
{
pr_debug("ping_close(sk=%p,sk->num=%u)\n",
inet_sk(sk), inet_sk(sk)->inet_num);
@@ -234,36 +280,122 @@ static void ping_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
+EXPORT_SYMBOL_GPL(ping_close);
+
+/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+ struct sockaddr *uaddr, int addr_len) {
+ struct net *net = sock_net(sk);
+ if (sk->sk_family == AF_INET) {
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int chk_addr_ret;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+ sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+ chk_addr_ret = RTN_LOCAL;
+
+ if ((sysctl_ip_nonlocal_bind == 0 &&
+ isk->freebind == 0 && isk->transparent == 0 &&
+ chk_addr_ret != RTN_LOCAL) ||
+ chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST)
+ return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+ int addr_type, scoped, has_addr;
+ struct net_device *dev = NULL;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+ sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+ scoped = __ipv6_addr_needs_scope_id(addr_type);
+ if ((addr_type != IPV6_ADDR_ANY &&
+ !(addr_type & IPV6_ADDR_UNICAST)) ||
+ (scoped && !addr->sin6_scope_id))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (addr->sin6_scope_id) {
+ dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
+ has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+ scoped);
+ rcu_read_unlock();
+
+ if (!(isk->freebind || isk->transparent || has_addr ||
+ addr_type == IPV6_ADDR_ANY))
+ return -EADDRNOTAVAIL;
+
+ if (scoped)
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+ } else {
+ return -EAFNOSUPPORT;
+ }
+ return 0;
+}
+
+static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+{
+ if (saddr->sa_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) saddr;
+ isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (saddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ np->rcv_saddr = np->saddr = addr->sin6_addr;
+#endif
+ }
+}
+static void ping_clear_saddr(struct sock *sk, int dif)
+{
+ sk->sk_bound_dev_if = dif;
+ if (sk->sk_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ isk->inet_rcv_saddr = isk->inet_saddr = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr));
+ memset(&np->saddr, 0, sizeof(np->saddr));
+#endif
+ }
+}
/*
* We need our own bind because there are no privileged id's == local ports.
* Moreover, we don't allow binding to multi- and broadcast addresses.
*/
-static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *isk = inet_sk(sk);
unsigned short snum;
- int chk_addr_ret;
int err;
+ int dif = sk->sk_bound_dev_if;
- if (addr_len < sizeof(struct sockaddr_in))
- return -EINVAL;
-
- pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
- sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
-
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
- if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
- chk_addr_ret = RTN_LOCAL;
-
- if ((sysctl_ip_nonlocal_bind == 0 &&
- isk->freebind == 0 && isk->transparent == 0 &&
- chk_addr_ret != RTN_LOCAL) ||
- chk_addr_ret == RTN_MULTICAST ||
- chk_addr_ret == RTN_BROADCAST)
- return -EADDRNOTAVAIL;
+ err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+ if (err)
+ return err;
lock_sock(sk);
@@ -272,42 +404,50 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
err = -EADDRINUSE;
- isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
- snum = ntohs(addr->sin_port);
- if (ping_v4_get_port(sk, snum) != 0) {
- isk->inet_saddr = isk->inet_rcv_saddr = 0;
+ ping_set_saddr(sk, uaddr);
+ snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
+ if (ping_get_port(sk, snum) != 0) {
+ ping_clear_saddr(sk, dif);
goto out;
}
- pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n",
+ pr_debug("after bind(): num = %d, dif = %d\n",
(int)isk->inet_num,
- &isk->inet_rcv_saddr,
(int)sk->sk_bound_dev_if);
err = 0;
- if (isk->inet_rcv_saddr)
+ if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) ||
+ (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)))
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
isk->inet_sport = htons(isk->inet_num);
isk->inet_daddr = 0;
isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr));
+#endif
+
sk_dst_reset(sk);
out:
release_sock(sk);
pr_debug("ping_v4_bind -> %d\n", err);
return err;
}
+EXPORT_SYMBOL_GPL(ping_bind);
/*
* Is this a supported type of ICMP message?
*/
-static inline int ping_supported(int type, int code)
+static inline int ping_supported(int family, int type, int code)
{
- if (type == ICMP_ECHO && code == 0)
- return 1;
- return 0;
+ return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
}
/*
@@ -315,30 +455,42 @@ static inline int ping_supported(int type, int code)
* sort of error condition.
*/
-static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-
-void ping_err(struct sk_buff *skb, u32 info)
+void ping_err(struct sk_buff *skb, int offset, u32 info)
{
- struct iphdr *iph = (struct iphdr *)skb->data;
- struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+ int family;
+ struct icmphdr *icmph;
struct inet_sock *inet_sock;
- int type = icmp_hdr(skb)->type;
- int code = icmp_hdr(skb)->code;
+ int type;
+ int code;
struct net *net = dev_net(skb->dev);
struct sock *sk;
int harderr;
int err;
+ if (skb->protocol == htons(ETH_P_IP)) {
+ family = AF_INET;
+ type = icmp_hdr(skb)->type;
+ code = icmp_hdr(skb)->code;
+ icmph = (struct icmphdr *)(skb->data + offset);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ family = AF_INET6;
+ type = icmp6_hdr(skb)->icmp6_type;
+ code = icmp6_hdr(skb)->icmp6_code;
+ icmph = (struct icmphdr *) (skb->data + offset);
+ } else {
+ BUG();
+ }
+
/* We assume the packet has already been checked by icmp_unreach */
- if (!ping_supported(icmph->type, icmph->code))
+ if (!ping_supported(family, icmph->type, icmph->code))
return;
- pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
- code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+ pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+ skb->protocol, type, code, ntohs(icmph->un.echo.id),
+ ntohs(icmph->un.echo.sequence));
- sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
- ntohs(icmph->un.echo.id), skb->dev->ifindex);
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
if (sk == NULL) {
pr_debug("no socket, dropping\n");
return; /* No socket for error */
@@ -349,72 +501,83 @@ void ping_err(struct sk_buff *skb, u32 info)
harderr = 0;
inet_sock = inet_sk(sk);
- switch (type) {
- default:
- case ICMP_TIME_EXCEEDED:
- err = EHOSTUNREACH;
- break;
- case ICMP_SOURCE_QUENCH:
- /* This is not a real error but ping wants to see it.
- * Report it with some fake errno. */
- err = EREMOTEIO;
- break;
- case ICMP_PARAMETERPROB:
- err = EPROTO;
- harderr = 1;
- break;
- case ICMP_DEST_UNREACH:
- if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
- ipv4_sk_update_pmtu(skb, sk, info);
- if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
- err = EMSGSIZE;
- harderr = 1;
- break;
+ if (skb->protocol == htons(ETH_P_IP)) {
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ /* This is not a real error but ping wants to see it.
+ * Report it with some fake errno.
+ */
+ err = EREMOTEIO;
+ break;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
}
- goto out;
- }
- err = EHOSTUNREACH;
- if (code <= NR_ICMP_UNREACH) {
- harderr = icmp_err_convert[code].fatal;
- err = icmp_err_convert[code].errno;
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ /* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
+ err = EREMOTEIO;
+ break;
}
- break;
- case ICMP_REDIRECT:
- /* See ICMP_SOURCE_QUENCH */
- ipv4_sk_redirect(skb, sk);
- err = EREMOTEIO;
- break;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
}
/*
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
- if (!inet_sock->recverr) {
+ if ((family == AF_INET && !inet_sock->recverr) ||
+ (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else {
- ip_icmp_error(sk, skb, err, 0 /* no remote port */,
- info, (u8 *)icmph);
+ if (family == AF_INET) {
+ ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+ info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+ info, (u8 *)icmph);
+#endif
+ }
}
sk->sk_err = err;
sk->sk_error_report(sk);
out:
sock_put(sk);
}
+EXPORT_SYMBOL_GPL(ping_err);
/*
- * Copy and checksum an ICMP Echo packet from user space into a buffer.
+ * Copy and checksum an ICMP Echo packet from user space into a buffer
+ * starting from the payload.
*/
-struct pingfakehdr {
- struct icmphdr icmph;
- struct iovec *iov;
- __wsum wcheck;
-};
-
-static int ping_getfrag(void *from, char *to,
- int offset, int fraglen, int odd, struct sk_buff *skb)
+int ping_getfrag(void *from, char *to,
+ int offset, int fraglen, int odd, struct sk_buff *skb)
{
struct pingfakehdr *pfh = (struct pingfakehdr *)from;
@@ -425,20 +588,33 @@ static int ping_getfrag(void *from, char *to,
pfh->iov, 0, fraglen - sizeof(struct icmphdr),
&pfh->wcheck))
return -EFAULT;
+ } else if (offset < sizeof(struct icmphdr)) {
+ BUG();
+ } else {
+ if (csum_partial_copy_fromiovecend
+ (to, pfh->iov, offset - sizeof(struct icmphdr),
+ fraglen, &pfh->wcheck))
+ return -EFAULT;
+ }
- return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ /* For IPv6, checksum each skb as we go along, as expected by
+ * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
+ * wcheck, it will be finalized in ping_v4_push_pending_frames.
+ */
+ if (pfh->family == AF_INET6) {
+ skb->csum = pfh->wcheck;
+ skb->ip_summed = CHECKSUM_NONE;
+ pfh->wcheck = 0;
}
- if (offset < sizeof(struct icmphdr))
- BUG();
- if (csum_partial_copy_fromiovecend
- (to, pfh->iov, offset - sizeof(struct icmphdr),
- fraglen, &pfh->wcheck))
- return -EFAULT;
+#endif
+
return 0;
}
+EXPORT_SYMBOL_GPL(ping_getfrag);
-static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
- struct flowi4 *fl4)
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+ struct flowi4 *fl4)
{
struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
@@ -450,24 +626,9 @@ static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
return ip_push_pending_frames(sk, fl4);
}
-static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
-{
- struct net *net = sock_net(sk);
- struct flowi4 fl4;
- struct inet_sock *inet = inet_sk(sk);
- struct ipcm_cookie ipc;
- struct icmphdr user_icmph;
- struct pingfakehdr pfh;
- struct rtable *rt = NULL;
- struct ip_options_data opt_copy;
- int free = 0;
- __be32 saddr, daddr, faddr;
- u8 tos;
- int err;
-
- pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
-
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+ void *user_icmph, size_t icmph_len) {
+ u8 type, code;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -482,15 +643,53 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/*
* Fetch the ICMP header provided by the userland.
- * iovec is modified!
+ * iovec is modified! The ICMP header is consumed.
*/
-
- if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
- sizeof(struct icmphdr)))
+ if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len))
return -EFAULT;
- if (!ping_supported(user_icmph.type, user_icmph.code))
+
+ if (family == AF_INET) {
+ type = ((struct icmphdr *) user_icmph)->type;
+ code = ((struct icmphdr *) user_icmph)->code;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
+ code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
+#endif
+ } else {
+ BUG();
+ }
+
+ if (!ping_supported(family, type, code))
return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+
+int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
+{
+ struct net *net = sock_net(sk);
+ struct flowi4 fl4;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct icmphdr user_icmph;
+ struct pingfakehdr pfh;
+ struct rtable *rt = NULL;
+ struct ip_options_data opt_copy;
+ int free = 0;
+ __be32 saddr, daddr, faddr;
+ u8 tos;
+ int err;
+
+ pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+ err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
/*
* Get and verify the address.
*/
@@ -595,13 +794,14 @@ back_from_confirm:
pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
pfh.iov = msg->msg_iov;
pfh.wcheck = 0;
+ pfh.family = AF_INET;
err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
0, &ipc, &rt, msg->msg_flags);
if (err)
ip_flush_pending_frames(sk);
else
- err = ping_push_pending_frames(sk, &pfh, &fl4);
+ err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
release_sock(sk);
out:
@@ -622,11 +822,13 @@ do_confirm:
goto out;
}
-static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len)
{
struct inet_sock *isk = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ int family = sk->sk_family;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
struct sk_buff *skb;
int copied, err;
@@ -636,11 +838,22 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*sin);
+ if (addr_len) {
+ if (family == AF_INET)
+ *addr_len = sizeof(*sin);
+ else if (family == AF_INET6 && addr_len)
+ *addr_len = sizeof(*sin6);
+ }
- if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len);
+ if (flags & MSG_ERRQUEUE) {
+ if (family == AF_INET) {
+ return ip_recv_error(sk, msg, len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ return pingv6_ops.ipv6_recv_error(sk, msg, len);
+#endif
+ }
+ }
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
@@ -659,15 +872,40 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sock_recv_timestamp(msg, sk, skb);
- /* Copy the address. */
- if (sin) {
+ /* Copy the address and add cmsg data. */
+ if (family == AF_INET) {
+ sin = (struct sockaddr_in *) msg->msg_name;
sin->sin_family = AF_INET;
sin->sin_port = 0 /* skb->h.uh->source */;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ if (isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *ip6 = ipv6_hdr(skb);
+ sin6 = (struct sockaddr_in6 *) msg->msg_name;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
+
+ if (inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb);
+#endif
+ } else {
+ BUG();
}
- if (isk->cmsg_flags)
- ip_cmsg_recv(msg, skb);
+
err = copied;
done:
@@ -676,8 +914,9 @@ out:
pr_debug("ping_recvmsg -> %d\n", err);
return err;
}
+EXPORT_SYMBOL_GPL(ping_recvmsg);
-static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
inet_sk(sk), inet_sk(sk)->inet_num, skb);
@@ -688,6 +927,7 @@ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
return 0;
}
+EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
/*
@@ -698,10 +938,7 @@ void ping_rcv(struct sk_buff *skb)
{
struct sock *sk;
struct net *net = dev_net(skb->dev);
- struct iphdr *iph = ip_hdr(skb);
struct icmphdr *icmph = icmp_hdr(skb);
- __be32 saddr = iph->saddr;
- __be32 daddr = iph->daddr;
/* We assume the packet has already been checked by icmp_rcv */
@@ -711,8 +948,7 @@ void ping_rcv(struct sk_buff *skb)
/* Push ICMP header back */
skb_push(skb, skb->data - (u8 *)icmph);
- sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
- skb->dev->ifindex);
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
if (sk != NULL) {
pr_debug("rcv on socket %p\n", sk);
ping_queue_rcv_skb(sk, skb_get(skb));
@@ -723,6 +959,7 @@ void ping_rcv(struct sk_buff *skb)
/* We're called from icmp_rcv(). kfree_skb() is done there. */
}
+EXPORT_SYMBOL_GPL(ping_rcv);
struct proto ping_prot = {
.name = "PING",
@@ -733,14 +970,14 @@ struct proto ping_prot = {
.disconnect = udp_disconnect,
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .sendmsg = ping_sendmsg,
+ .sendmsg = ping_v4_sendmsg,
.recvmsg = ping_recvmsg,
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
- .hash = ping_v4_hash,
- .unhash = ping_v4_unhash,
- .get_port = ping_v4_get_port,
+ .hash = ping_hash,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
.obj_size = sizeof(struct inet_sock),
};
EXPORT_SYMBOL(ping_prot);
@@ -764,7 +1001,8 @@ static struct sock *ping_get_first(struct seq_file *seq, int start)
continue;
sk_nulls_for_each(sk, node, hslot) {
- if (net_eq(sock_net(sk), net))
+ if (net_eq(sock_net(sk), net) &&
+ sk->sk_family == state->family)
goto found;
}
}
@@ -797,17 +1035,24 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
return pos ? NULL : sk;
}
-static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
{
struct ping_iter_state *state = seq->private;
state->bucket = 0;
+ state->family = family;
read_lock_bh(&ping_table.lock);
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
+EXPORT_SYMBOL_GPL(ping_seq_start);
+
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET);
+}
-static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sock *sk;
@@ -819,13 +1064,15 @@ static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
+EXPORT_SYMBOL_GPL(ping_seq_next);
-static void ping_seq_stop(struct seq_file *seq, void *v)
+void ping_seq_stop(struct seq_file *seq, void *v)
{
read_unlock_bh(&ping_table.lock);
}
+EXPORT_SYMBOL_GPL(ping_seq_stop);
-static void ping_format_sock(struct sock *sp, struct seq_file *f,
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
int bucket, int *len)
{
struct inet_sock *inet = inet_sk(sp);
@@ -846,7 +1093,7 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
atomic_read(&sp->sk_drops), len);
}
-static int ping_seq_show(struct seq_file *seq, void *v)
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
seq_printf(seq, "%-127s\n",
@@ -857,72 +1104,86 @@ static int ping_seq_show(struct seq_file *seq, void *v)
struct ping_iter_state *state = seq->private;
int len;
- ping_format_sock(v, seq, state->bucket, &len);
+ ping_v4_format_sock(v, seq, state->bucket, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
}
return 0;
}
-static const struct seq_operations ping_seq_ops = {
- .show = ping_seq_show,
- .start = ping_seq_start,
+static const struct seq_operations ping_v4_seq_ops = {
+ .show = ping_v4_seq_show,
+ .start = ping_v4_seq_start,
.next = ping_seq_next,
.stop = ping_seq_stop,
};
static int ping_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &ping_seq_ops,
+ struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
+ return seq_open_net(inode, file, &afinfo->seq_ops,
sizeof(struct ping_iter_state));
}
-static const struct file_operations ping_seq_fops = {
+const struct file_operations ping_seq_fops = {
.open = ping_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
+EXPORT_SYMBOL_GPL(ping_seq_fops);
+
+static struct ping_seq_afinfo ping_v4_seq_afinfo = {
+ .name = "icmp",
+ .family = AF_INET,
+ .seq_fops = &ping_seq_fops,
+ .seq_ops = {
+ .start = ping_v4_seq_start,
+ .show = ping_v4_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+ },
+};
-static int ping_proc_register(struct net *net)
+int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
{
struct proc_dir_entry *p;
- int rc = 0;
-
- p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops);
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
if (!p)
- rc = -ENOMEM;
- return rc;
+ return -ENOMEM;
+ return 0;
}
+EXPORT_SYMBOL_GPL(ping_proc_register);
-static void ping_proc_unregister(struct net *net)
+void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo)
{
- remove_proc_entry("icmp", net->proc_net);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
+EXPORT_SYMBOL_GPL(ping_proc_unregister);
-
-static int __net_init ping_proc_init_net(struct net *net)
+static int __net_init ping_v4_proc_init_net(struct net *net)
{
- return ping_proc_register(net);
+ return ping_proc_register(net, &ping_v4_seq_afinfo);
}
-static void __net_exit ping_proc_exit_net(struct net *net)
+static void __net_exit ping_v4_proc_exit_net(struct net *net)
{
- ping_proc_unregister(net);
+ ping_proc_unregister(net, &ping_v4_seq_afinfo);
}
-static struct pernet_operations ping_net_ops = {
- .init = ping_proc_init_net,
- .exit = ping_proc_exit_net,
+static struct pernet_operations ping_v4_net_ops = {
+ .init = ping_v4_proc_init_net,
+ .exit = ping_v4_proc_exit_net,
};
int __init ping_proc_init(void)
{
- return register_pernet_subsys(&ping_net_ops);
+ return register_pernet_subsys(&ping_v4_net_ops);
}
void ping_proc_exit(void)
{
- unregister_pernet_subsys(&ping_net_ops);
+ unregister_pernet_subsys(&ping_v4_net_ops);
}
#endif
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5bf86d2415..6577a1149a47 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+ SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d35bbf0cf404..a9a54a236832 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -565,10 +565,25 @@ static inline void rt_free(struct rtable *rt)
static DEFINE_SPINLOCK(fnhe_lock);
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+ struct rtable *rt;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ rt_free(rt);
+ }
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ rt_free(rt);
+ }
+}
+
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
struct fib_nh_exception *fnhe, *oldest;
- struct rtable *orig;
oldest = rcu_dereference(hash->chain);
for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -576,11 +591,7 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
oldest = fnhe;
}
- orig = rcu_dereference(oldest->fnhe_rth);
- if (orig) {
- RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
- rt_free(orig);
- }
+ fnhe_flush_routes(oldest);
return oldest;
}
@@ -594,11 +605,25 @@ static inline u32 fnhe_hashfun(__be32 daddr)
return hval & (FNHE_HASH_SIZE - 1);
}
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ rt->dst.expires = fnhe->fnhe_expires;
+
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = fnhe->fnhe_gw;
+ rt->rt_uses_gateway = 1;
+ }
+}
+
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
u32 pmtu, unsigned long expires)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
+ struct rtable *rt;
+ unsigned int i;
int depth;
u32 hval = fnhe_hashfun(daddr);
@@ -627,8 +652,15 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_gw = gw;
if (pmtu) {
fnhe->fnhe_pmtu = pmtu;
- fnhe->fnhe_expires = expires;
+ fnhe->fnhe_expires = max(1UL, expires);
}
+ /* Update all cached dsts too */
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
} else {
if (depth > FNHE_RECLAIM_DEPTH)
fnhe = fnhe_oldest(hash);
@@ -640,10 +672,27 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_next = hash->chain;
rcu_assign_pointer(hash->chain, fnhe);
}
+ fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_expires = expires;
+
+ /* Exception created; mark the cached routes for the nexthop
+ * stale, so anyone caching it rechecks if this exception
+ * applies to them.
+ */
+ rt = rcu_dereference(nh->nh_rth_input);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+
+ for_each_possible_cpu(i) {
+ struct rtable __rcu **prt;
+ prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+ rt = rcu_dereference(*prt);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ }
}
fnhe->fnhe_stamp = jiffies;
@@ -922,12 +971,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (mtu < ip_rt_min_pmtu)
mtu = ip_rt_min_pmtu;
- if (!rt->rt_pmtu) {
- dst->obsolete = DST_OBSOLETE_KILL;
- } else {
- rt->rt_pmtu = mtu;
- dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
- }
+ if (rt->rt_pmtu == mtu &&
+ time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
+ return;
rcu_read_lock();
if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
@@ -1068,11 +1114,11 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*
- * When a PMTU/redirect information update invalidates a
- * route, this is indicated by setting obsolete to
- * DST_OBSOLETE_KILL.
+ * When a PMTU/redirect information update invalidates a route,
+ * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+ * DST_OBSOLETE_DEAD by dst_free().
*/
- if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
+ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
return NULL;
return dst;
}
@@ -1214,34 +1260,36 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
spin_lock_bh(&fnhe_lock);
if (daddr == fnhe->fnhe_daddr) {
- struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
- if (orig && rt_is_expired(orig)) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
+ int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
+
+ if (fnhe->fnhe_genid != genid) {
+ fnhe->fnhe_genid = genid;
fnhe->fnhe_gw = 0;
fnhe->fnhe_pmtu = 0;
fnhe->fnhe_expires = 0;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
}
- if (fnhe->fnhe_pmtu) {
- unsigned long expires = fnhe->fnhe_expires;
- unsigned long diff = expires - jiffies;
-
- if (time_before(jiffies, expires)) {
- rt->rt_pmtu = fnhe->fnhe_pmtu;
- dst_set_expires(&rt->dst, diff);
- }
- }
- if (fnhe->fnhe_gw) {
- rt->rt_flags |= RTCF_REDIRECTED;
- rt->rt_gateway = fnhe->fnhe_gw;
- rt->rt_uses_gateway = 1;
- } else if (!rt->rt_gateway)
+ fill_route_from_fnhe(rt, fnhe);
+ if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- rcu_assign_pointer(fnhe->fnhe_rth, rt);
- if (orig)
- rt_free(orig);
+ if (!(rt->dst.flags & DST_NOCACHE)) {
+ rcu_assign_pointer(*porig, rt);
+ if (orig)
+ rt_free(orig);
+ ret = true;
+ }
fnhe->fnhe_stamp = jiffies;
- ret = true;
}
spin_unlock_bh(&fnhe_lock);
@@ -1473,6 +1521,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
+ struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
struct in_device *out_dev;
@@ -1519,8 +1568,13 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
+ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ if (fnhe != NULL)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1548,7 +1602,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
- rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1863,7 +1917,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = find_exception(nh, fl4->daddr);
if (fnhe)
- prth = &fnhe->fnhe_rth;
+ prth = &fnhe->fnhe_rth_output;
else {
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
@@ -2429,19 +2483,22 @@ static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
-static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
+static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ struct net *net = (struct net *)__ctl->extra1;
+
if (write) {
- rt_cache_flush((struct net *)__ctl->extra1);
+ rt_cache_flush(net);
+ fnhe_genid_bump(net);
return 0;
}
return -EINVAL;
}
-static ctl_table ipv4_route_table[] = {
+static struct ctl_table ipv4_route_table[] = {
{
.procname = "gc_thresh",
.data = &ipv4_dst_ops.gc_thresh,
@@ -2609,6 +2666,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
static __net_init int rt_genid_init(struct net *net)
{
atomic_set(&net->rt_genid, 0);
+ atomic_set(&net->fnhe_genid, 0);
get_random_bytes(&net->ipv4.dev_addr_genid,
sizeof(net->ipv4.dev_addr_genid));
return 0;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fa2f63fc453b..610e324348d1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -36,6 +36,8 @@ static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
@@ -49,13 +51,13 @@ static void set_local_port_range(int range[2])
}
/* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write,
+static int ipv4_local_port_range(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
int range[2];
- ctl_table tmp = {
+ struct ctl_table tmp = {
.data = &range,
.maxlen = sizeof(range),
.mode = table->mode,
@@ -100,7 +102,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
}
/* Validate changes from /proc interface. */
-static int ipv4_ping_group_range(ctl_table *table, int write,
+static int ipv4_ping_group_range(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -108,7 +110,7 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
int ret;
gid_t urange[2];
kgid_t low, high;
- ctl_table tmp = {
+ struct ctl_table tmp = {
.data = &urange,
.maxlen = sizeof(urange),
.mode = table->mode,
@@ -135,11 +137,11 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
return ret;
}
-static int proc_tcp_congestion_control(ctl_table *ctl, int write,
+static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char val[TCP_CA_NAME_MAX];
- ctl_table tbl = {
+ struct ctl_table tbl = {
.data = val,
.maxlen = TCP_CA_NAME_MAX,
};
@@ -153,12 +155,12 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write,
return ret;
}
-static int proc_tcp_available_congestion_control(ctl_table *ctl,
+static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
@@ -170,12 +172,12 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
return ret;
}
-static int proc_allowed_congestion_control(ctl_table *ctl,
+static int proc_allowed_congestion_control(struct ctl_table *ctl,
int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
@@ -190,7 +192,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
return ret;
}
-static int ipv4_tcp_mem(ctl_table *ctl, int write,
+static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -201,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
struct mem_cgroup *memcg;
#endif
- ctl_table tmp = {
+ struct ctl_table tmp = {
.data = &vec,
.maxlen = sizeof(vec),
.mode = ctl->mode,
@@ -233,10 +235,11 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
return 0;
}
-static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+ struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
struct tcp_fastopen_context *ctxt;
int ret;
u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
@@ -331,7 +334,9 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_tcp_syn_retries,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
},
{
.procname = "tcp_synack_retries",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ab450c099aa4..5423223e93c2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <net/busy_poll.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -436,6 +437,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ sock_rps_record_flow(sk);
+
sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -1551,6 +1554,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct sk_buff *skb;
u32 urg_hole = 0;
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ (sk->sk_state == TCP_ESTABLISHED))
+ sk_busy_loop(sk, nonblock);
+
lock_sock(sk);
err = -ENOTCONN;
@@ -2875,249 +2882,9 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct tcphdr *th;
- unsigned int thlen;
- unsigned int seq;
- __be32 delta;
- unsigned int oldlen;
- unsigned int mss;
- struct sk_buff *gso_skb = skb;
- __sum16 newcheck;
- bool ooo_okay, copy_destructor;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- goto out;
-
- th = tcp_hdr(skb);
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
-
- if (!pskb_may_pull(skb, thlen))
- goto out;
-
- oldlen = (u16)~skb->len;
- __skb_pull(skb, thlen);
-
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- SKB_GSO_GRE |
- SKB_GSO_UDP_TUNNEL |
- 0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
- goto out;
-
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
- segs = NULL;
- goto out;
- }
-
- copy_destructor = gso_skb->destructor == tcp_wfree;
- ooo_okay = gso_skb->ooo_okay;
- /* All segments but the first should have ooo_okay cleared */
- skb->ooo_okay = 0;
-
- segs = skb_segment(skb, features);
- if (IS_ERR(segs))
- goto out;
-
- /* Only first segment might have ooo_okay set */
- segs->ooo_okay = ooo_okay;
-
- delta = htonl(oldlen + (thlen + mss));
-
- skb = segs;
- th = tcp_hdr(skb);
- seq = ntohl(th->seq);
-
- newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
-
- do {
- th->fin = th->psh = 0;
- th->check = newcheck;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check =
- csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
-
- seq += mss;
- if (copy_destructor) {
- skb->destructor = gso_skb->destructor;
- skb->sk = gso_skb->sk;
- /* {tcp|sock}_wfree() use exact truesize accounting :
- * sum(skb->truesize) MUST be exactly be gso_skb->truesize
- * So we account mss bytes of 'true size' for each segment.
- * The last segment will contain the remaining.
- */
- skb->truesize = mss;
- gso_skb->truesize -= mss;
- }
- skb = skb->next;
- th = tcp_hdr(skb);
-
- th->seq = htonl(seq);
- th->cwr = 0;
- } while (skb->next);
-
- /* Following permits TCP Small Queues to work well with GSO :
- * The callback to TCP stack will be called at the time last frag
- * is freed at TX completion, and not right now when gso_skb
- * is freed by GSO engine
- */
- if (copy_destructor) {
- swap(gso_skb->sk, skb->sk);
- swap(gso_skb->destructor, skb->destructor);
- swap(gso_skb->truesize, skb->truesize);
- }
-
- delta = htonl(oldlen + (skb->tail - skb->transport_header) +
- skb->data_len);
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check = csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
-
-out:
- return segs;
-}
-EXPORT_SYMBOL(tcp_tso_segment);
-
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
-{
- struct sk_buff **pp = NULL;
- struct sk_buff *p;
- struct tcphdr *th;
- struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
-
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*th);
- th = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
-
- hlen = off + thlen;
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- skb_gro_pull(skb, thlen);
-
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
-
- for (; (p = *head); head = &p->next) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- th2 = tcp_hdr(p);
-
- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
-
- goto found;
- }
-
- goto out_check_final;
-
-found:
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
- ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
- flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
- for (i = sizeof(*th); i < thlen; i += 4)
- flush |= *(u32 *)((u8 *)th + i) ^
- *(u32 *)((u8 *)th2 + i);
-
- mss = skb_shinfo(p)->gso_size;
-
- flush |= (len - 1) >= mss;
- flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-
- if (flush || skb_gro_receive(head, skb)) {
- mss = 1;
- goto out_check_final;
- }
-
- p = *head;
- th2 = tcp_hdr(p);
- tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
-
-out_check_final:
- flush = len < mss;
- flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
- TCP_FLAG_RST | TCP_FLAG_SYN |
- TCP_FLAG_FIN));
-
- if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = head;
-
-out:
- NAPI_GRO_CB(skb)->flush |= flush;
-
- return pp;
-}
-EXPORT_SYMBOL(tcp_gro_receive);
-
-int tcp_gro_complete(struct sk_buff *skb)
-{
- struct tcphdr *th = tcp_hdr(skb);
-
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct tcphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
-
- skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
-
- if (th->cwr)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
-
- return 0;
-}
-EXPORT_SYMBOL(tcp_gro_complete);
-
#ifdef CONFIG_TCP_MD5SIG
-static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
-static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_MUTEX(tcp_md5sig_mutex);
static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
@@ -3132,30 +2899,14 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
free_percpu(pool);
}
-void tcp_free_md5sig_pool(void)
-{
- struct tcp_md5sig_pool __percpu *pool = NULL;
-
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (--tcp_md5sig_users == 0) {
- pool = tcp_md5sig_pool;
- tcp_md5sig_pool = NULL;
- }
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- if (pool)
- __tcp_free_md5sig_pool(pool);
-}
-EXPORT_SYMBOL(tcp_free_md5sig_pool);
-
-static struct tcp_md5sig_pool __percpu *
-__tcp_alloc_md5sig_pool(struct sock *sk)
+static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
struct tcp_md5sig_pool __percpu *pool;
pool = alloc_percpu(struct tcp_md5sig_pool);
if (!pool)
- return NULL;
+ return;
for_each_possible_cpu(cpu) {
struct crypto_hash *hash;
@@ -3166,53 +2917,27 @@ __tcp_alloc_md5sig_pool(struct sock *sk)
per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
}
- return pool;
+ /* before setting tcp_md5sig_pool, we must commit all writes
+ * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ */
+ smp_wmb();
+ tcp_md5sig_pool = pool;
+ return;
out_free:
__tcp_free_md5sig_pool(pool);
- return NULL;
}
-struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
+bool tcp_alloc_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *pool;
- bool alloc = false;
-
-retry:
- spin_lock_bh(&tcp_md5sig_pool_lock);
- pool = tcp_md5sig_pool;
- if (tcp_md5sig_users++ == 0) {
- alloc = true;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- } else if (!pool) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- cpu_relax();
- goto retry;
- } else
- spin_unlock_bh(&tcp_md5sig_pool_lock);
-
- if (alloc) {
- /* we cannot hold spinlock here because this may sleep. */
- struct tcp_md5sig_pool __percpu *p;
-
- p = __tcp_alloc_md5sig_pool(sk);
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (!p) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- return NULL;
- }
- pool = tcp_md5sig_pool;
- if (pool) {
- /* oops, it has already been assigned. */
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- __tcp_free_md5sig_pool(p);
- } else {
- tcp_md5sig_pool = pool = p;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- }
+ if (unlikely(!tcp_md5sig_pool)) {
+ mutex_lock(&tcp_md5sig_mutex);
+
+ if (!tcp_md5sig_pool)
+ __tcp_alloc_md5sig_pool();
+
+ mutex_unlock(&tcp_md5sig_mutex);
}
- return pool;
+ return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -3229,28 +2954,15 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
struct tcp_md5sig_pool __percpu *p;
local_bh_disable();
-
- spin_lock(&tcp_md5sig_pool_lock);
- p = tcp_md5sig_pool;
+ p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- tcp_md5sig_users++;
- spin_unlock(&tcp_md5sig_pool_lock);
-
- if (p)
- return this_cpu_ptr(p);
+ return __this_cpu_ptr(p);
local_bh_enable();
return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);
-void tcp_put_md5sig_pool(void)
-{
- local_bh_enable();
- tcp_free_md5sig_pool();
-}
-EXPORT_SYMBOL(tcp_put_md5sig_pool);
-
int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
const struct tcphdr *th)
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c6225780bd5..28af45abe062 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -347,24 +347,13 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
}
/* 3. Tuning rcvbuf, when connection enters established state. */
-
static void tcp_fixup_rcvbuf(struct sock *sk)
{
u32 mss = tcp_sk(sk)->advmss;
- u32 icwnd = TCP_DEFAULT_INIT_RCVWND;
int rcvmem;
- /* Limit to 10 segments if mss <= 1460,
- * or 14600/mss segments, with a minimum of two segments.
- */
- if (mss > 1460)
- icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
-
- rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
- while (tcp_win_from_space(rcvmem) < mss)
- rcvmem += 128;
-
- rcvmem *= icwnd;
+ rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
+ tcp_default_init_rwnd(mss);
if (sk->sk_rcvbuf < rcvmem)
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
@@ -1257,8 +1246,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->scoreboard_skb_hint)
- tp->scoreboard_skb_hint = prev;
if (skb == tp->lost_skb_hint) {
tp->lost_skb_hint = prev;
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
@@ -1966,20 +1953,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
return true;
}
-static inline int tcp_skb_timedout(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
-}
-
-static inline int tcp_head_timedout(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- return tp->packets_out &&
- tcp_skb_timedout(sk, tcp_write_queue_head(sk));
-}
-
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
@@ -2086,12 +2059,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
- /* Trick#3 : when we use RFC2988 timer restart, fast
- * retransmit can be triggered by timeout of queue head.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk))
- return true;
-
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
@@ -2128,44 +2095,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
return false;
}
-/* New heuristics: it is possible only after we switched to restart timer
- * each time when something is ACKed. Hence, we can detect timed out packets
- * during fast retransmit without falling to slow start.
- *
- * Usefulness of this as is very questionable, since we should know which of
- * the segments is the next to timeout which is relatively expensive to find
- * in general case unless we add some data structure just for that. The
- * current approach certainly won't find the right one too often and when it
- * finally does find _something_ it usually marks large part of the window
- * right away (because a retransmission with a larger timestamp blocks the
- * loop from advancing). -ij
- */
-static void tcp_timeout_skbs(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
- return;
-
- skb = tp->scoreboard_skb_hint;
- if (tp->scoreboard_skb_hint == NULL)
- skb = tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
-
- tp->scoreboard_skb_hint = skb;
-
- tcp_verify_left_out(tp);
-}
-
/* Detect loss in event "A" above by marking head of queue up as lost.
* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
@@ -2251,8 +2180,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
else if (fast_rexmit)
tcp_mark_head_lost(sk, 1, 1);
}
-
- tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2307,10 +2234,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
#define DBGUNDO(x...) do { } while (0)
#endif
-static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (unmark_loss) {
+ struct sk_buff *skb;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ tp->lost_out = 0;
+ tcp_clear_all_retrans_hints(tp);
+ }
+
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2319,7 +2258,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
- if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
+ if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
@@ -2327,6 +2266,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->undo_marker = 0;
}
static inline bool tcp_may_undo(const struct tcp_sock *tp)
@@ -2346,14 +2286,13 @@ static bool tcp_try_undo_recovery(struct sock *sk)
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(sk, true);
+ tcp_undo_cwnd_reduction(sk, false);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPLOSSUNDO;
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
- tp->undo_marker = 0;
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
@@ -2367,16 +2306,17 @@ static bool tcp_try_undo_recovery(struct sock *sk)
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
-static void tcp_try_undo_dsack(struct sock *sk)
+static bool tcp_try_undo_dsack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
- tcp_undo_cwr(sk, true);
- tp->undo_marker = 0;
+ tcp_undo_cwnd_reduction(sk, false);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ return true;
}
+ return false;
}
/* We can clear retrans_stamp when there are no retransmissions in the
@@ -2408,60 +2348,20 @@ static bool tcp_any_retrans_done(const struct sock *sk)
return false;
}
-/* Undo during fast recovery after partial ACK. */
-
-static int tcp_try_undo_partial(struct sock *sk, int acked)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- /* Partial ACK arrived. Force Hoe's retransmit. */
- int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
-
- if (tcp_may_undo(tp)) {
- /* Plain luck! Hole if filled with delayed
- * packet, rather than with a retransmit.
- */
- if (!tcp_any_retrans_done(sk))
- tp->retrans_stamp = 0;
-
- tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
-
- DBGUNDO(sk, "Hoe");
- tcp_undo_cwr(sk, false);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
-
- /* So... Do not make Hoe's retransmit yet.
- * If the first packet was delayed, the rest
- * ones are most probably delayed as well.
- */
- failed = 0;
- }
- return failed;
-}
-
/* Undo during loss recovery after partial ACK or using F-RTO. */
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
struct tcp_sock *tp = tcp_sk(sk);
if (frto_undo || tcp_may_undo(tp)) {
- struct sk_buff *skb;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- }
-
- tcp_clear_all_retrans_hints(tp);
+ tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
- tp->lost_out = 0;
- tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
- tp->undo_marker = 0;
if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
return true;
@@ -2494,12 +2394,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
TCP_ECN_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+ int newly_acked_sacked = prior_unsacked -
+ (tp->packets_out - tp->sacked_out);
tp->prr_delivered += newly_acked_sacked;
if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
@@ -2556,7 +2458,7 @@ static void tcp_try_keep_open(struct sock *sk)
}
}
-static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
+static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2573,7 +2475,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
}
}
@@ -2731,6 +2633,40 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
tcp_xmit_retransmit_queue(sk);
}
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, const int acked,
+ const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && tcp_packet_delayed(tp)) {
+ /* Plain luck! Hole if filled with delayed
+ * packet, rather than with a retransmit.
+ */
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+
+ /* We are getting evidence that the reordering degree is higher
+ * than we realized. If there are no retransmits out then we
+ * can undo. Otherwise we clock out new packets but do not
+ * mark more packets lost or retransmit more.
+ */
+ if (tp->retrans_out) {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ return true;
+ }
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ DBGUNDO(sk, "partial recovery");
+ tcp_undo_cwnd_reduction(sk, true);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ tcp_try_keep_open(sk);
+ return true;
+ }
+ return false;
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2742,15 +2678,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
- int prior_sacked, int prior_packets,
+static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+ const int prior_unsacked,
bool is_dupack, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int newly_acked_sacked = 0;
int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2802,10 +2737,17 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
- } else
- do_lost = tcp_try_undo_partial(sk, pkts_acked);
- newly_acked_sacked = prior_packets - tp->packets_out +
- tp->sacked_out - prior_sacked;
+ } else {
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ return;
+ /* Partial ACK arrived. Force fast retransmit. */
+ do_lost = tcp_is_reno(tp) ||
+ tcp_fackets_out(tp) > tp->reordering;
+ }
+ if (tcp_try_undo_dsack(sk)) {
+ tcp_try_keep_open(sk);
+ return;
+ }
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
@@ -2819,14 +2761,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (is_dupack)
tcp_add_reno_sack(sk);
}
- newly_acked_sacked = prior_packets - tp->packets_out +
- tp->sacked_out - prior_sacked;
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk, flag)) {
- tcp_try_to_open(sk, flag, newly_acked_sacked);
+ tcp_try_to_open(sk, flag, prior_unsacked);
return;
}
@@ -2846,9 +2786,9 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
fast_rexmit = 1;
}
- if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
+ if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
tcp_xmit_retransmit_queue(sk);
}
@@ -3079,7 +3019,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
@@ -3333,9 +3272,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 prior_in_flight;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- int prior_sacked = tp->sacked_out;
- int pkts_acked = 0;
- int previous_packets_out = 0;
+ const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ int acked = 0; /* Number of packets newly acked */
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3410,18 +3348,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- previous_packets_out = tp->packets_out;
+ acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
-
- pkts_acked = previous_packets_out - tp->packets_out;
+ acked -= tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
/* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- prior_packets, is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
} else {
if (flag & FLAG_DATA_ACKED)
tcp_cong_avoid(sk, ack, prior_in_flight);
@@ -3443,8 +3380,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- prior_packets, is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3466,8 +3403,8 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
- tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- prior_packets, is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3780,6 +3717,7 @@ void tcp_reset(struct sock *sk)
static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -3791,7 +3729,9 @@ static void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -5601,6 +5541,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock *req;
int queued = 0;
+ bool acceptable;
tp->rx_opt.saw_tstamp = 0;
@@ -5671,157 +5612,147 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 0;
/* step 5: check the ACK field */
- if (true) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT) > 0;
-
- switch (sk->sk_state) {
- case TCP_SYN_RECV:
- if (acceptable) {
- /* Once we leave TCP_SYN_RECV, we no longer
- * need req so release it.
- */
- if (req) {
- tcp_synack_rtt_meas(sk, req);
- tp->total_retrans = req->num_retrans;
-
- reqsk_fastopen_remove(sk, req, false);
- } else {
- /* Make sure socket is routed, for
- * correct metrics.
- */
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_init_congestion_control(sk);
+ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
- tcp_mtup_init(sk);
- tcp_init_buffer_space(sk);
- tp->copied_seq = tp->rcv_nxt;
- }
- smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
- sk->sk_state_change(sk);
-
- /* Note, that this wakeup is only for marginal
- * crossed SYN case. Passively open sockets
- * are not waked up, because sk->sk_sleep ==
- * NULL and sk->sk_socket == NULL.
- */
- if (sk->sk_socket)
- sk_wake_async(sk,
- SOCK_WAKE_IO, POLL_OUT);
-
- tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
- tp->snd_wnd = ntohs(th->window) <<
- tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
- if (req) {
- /* Re-arm the timer because data may
- * have been sent out. This is similar
- * to the regular data transmission case
- * when new data has just been ack'ed.
- *
- * (TFO) - we could try to be more
- * aggressive and retranmitting any data
- * sooner based on when they were sent
- * out.
- */
- tcp_rearm_rto(sk);
- } else
- tcp_init_metrics(sk);
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ if (!acceptable)
+ return 1;
- /* Prevent spurious tcp_cwnd_restart() on
- * first data packet.
- */
- tp->lsndtime = tcp_time_stamp;
+ /* Once we leave TCP_SYN_RECV, we no longer need req
+ * so release it.
+ */
+ if (req) {
+ tcp_synack_rtt_meas(sk, req);
+ tp->total_retrans = req->num_retrans;
- tcp_initialize_rcv_mss(sk);
- tcp_fast_path_on(tp);
- } else {
- return 1;
- }
- break;
+ reqsk_fastopen_remove(sk, req, false);
+ } else {
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tcp_init_buffer_space(sk);
+ tp->copied_seq = tp->rcv_nxt;
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
+
+ /* Note, that this wakeup is only for marginal crossed SYN case.
+ * Passively open sockets are not waked up, because
+ * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket)
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- case TCP_FIN_WAIT1:
- /* If we enter the TCP_FIN_WAIT1 state and we are a
- * Fast Open socket and this is the first acceptable
- * ACK we have received, this would have acknowledged
- * our SYNACK so stop the SYNACK timer.
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
*/
- if (req != NULL) {
- /* Return RST if ack_seq is invalid.
- * Note that RFC793 only says to generate a
- * DUPACK for it but for TCP Fast Open it seems
- * better to treat this case like TCP_SYN_RECV
- * above.
- */
- if (!acceptable)
- return 1;
- /* We no longer need the request sock. */
- reqsk_fastopen_remove(sk, req, false);
- tcp_rearm_rto(sk);
- }
- if (tp->snd_una == tp->write_seq) {
- struct dst_entry *dst;
-
- tcp_set_state(sk, TCP_FIN_WAIT2);
- sk->sk_shutdown |= SEND_SHUTDOWN;
-
- dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
-
- if (!sock_flag(sk, SOCK_DEAD))
- /* Wake up lingering close() */
- sk->sk_state_change(sk);
- else {
- int tmo;
-
- if (tp->linger2 < 0 ||
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
- tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- return 1;
- }
+ tcp_rearm_rto(sk);
+ } else
+ tcp_init_metrics(sk);
- tmo = tcp_fin_time(sk);
- if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
- } else if (th->fin || sock_owned_by_user(sk)) {
- /* Bad case. We could lose such FIN otherwise.
- * It is not a big problem, but it looks confusing
- * and not so rare event. We still can lose it now,
- * if it spins in bh_lock_sock(), but it is really
- * marginal case.
- */
- inet_csk_reset_keepalive_timer(sk, tmo);
- } else {
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
- }
- }
- }
- break;
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_time_stamp;
- case TCP_CLOSING:
- if (tp->snd_una == tp->write_seq) {
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
- }
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
+ break;
+
+ case TCP_FIN_WAIT1: {
+ struct dst_entry *dst;
+ int tmo;
+
+ /* If we enter the TCP_FIN_WAIT1 state and we are a
+ * Fast Open socket and this is the first acceptable
+ * ACK we have received, this would have acknowledged
+ * our SYNACK so stop the SYNACK timer.
+ */
+ if (req != NULL) {
+ /* Return RST if ack_seq is invalid.
+ * Note that RFC793 only says to generate a
+ * DUPACK for it but for TCP Fast Open it seems
+ * better to treat this case like TCP_SYN_RECV
+ * above.
+ */
+ if (!acceptable)
+ return 1;
+ /* We no longer need the request sock. */
+ reqsk_fastopen_remove(sk, req, false);
+ tcp_rearm_rto(sk);
+ }
+ if (tp->snd_una != tp->write_seq)
break;
- case TCP_LAST_ACK:
- if (tp->snd_una == tp->write_seq) {
- tcp_update_metrics(sk);
- tcp_done(sk);
- goto discard;
- }
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
break;
}
+
+ if (tp->linger2 < 0 ||
+ (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
+
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+ * if it spins in bh_lock_sock(), but it is really
+ * marginal case.
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto discard;
+ }
+ break;
}
/* step 6: check the URG bit */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7999fc55c83b..b299da5ff499 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -75,6 +75,7 @@
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
+#include <net/busy_poll.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -545,8 +546,7 @@ out:
sock_put(sk);
}
-static void __tcp_v4_send_check(struct sk_buff *skb,
- __be32 saddr, __be32 daddr)
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct tcphdr *th = tcp_hdr(skb);
@@ -571,23 +571,6 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_v4_send_check);
-int tcp_v4_gso_send_check(struct sk_buff *skb)
-{
- const struct iphdr *iph;
- struct tcphdr *th;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- return -EINVAL;
-
- iph = ip_hdr(skb);
- th = tcp_hdr(skb);
-
- th->check = 0;
- skb->ip_summed = CHECKSUM_PARTIAL;
- __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
- return 0;
-}
-
/*
* This routine will send an RST to the other tcp.
*
@@ -1026,7 +1009,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key = sock_kmalloc(sk, sizeof(*key), gfp);
if (!key)
return -ENOMEM;
- if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
+ if (!tcp_alloc_md5sig_pool()) {
sock_kfree_s(sk, key, sizeof(*key));
return -ENOMEM;
}
@@ -1044,9 +1027,7 @@ EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct tcp_md5sig_info *md5sig;
key = tcp_md5_do_lookup(sk, addr, family);
if (!key)
@@ -1054,10 +1035,6 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
- md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
- if (hlist_empty(&md5sig->head))
- tcp_free_md5sig_pool();
return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
@@ -1071,8 +1048,6 @@ static void tcp_clear_md5_list(struct sock *sk)
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
- if (!hlist_empty(&md5sig->head))
- tcp_free_md5sig_pool();
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
@@ -2019,6 +1994,7 @@ process:
if (sk_filter(sk, skb))
goto discard_and_relse;
+ sk_mark_napi_id(sk, skb);
skb->dev = NULL;
bh_lock_sock_nested(sk);
@@ -2803,52 +2779,6 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
-{
- const struct iphdr *iph = skb_gro_network_header(skb);
- __wsum wsum;
- __sum16 sum;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
-flush:
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
-
- case CHECKSUM_NONE:
- wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb_gro_len(skb), IPPROTO_TCP, 0);
- sum = csum_fold(skb_checksum(skb,
- skb_gro_offset(skb),
- skb_gro_len(skb),
- wsum));
- if (sum)
- goto flush;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
-
- return tcp_gro_receive(head, skb);
-}
-
-int tcp4_gro_complete(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
-
- th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
- iph->saddr, iph->daddr, 0);
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
-
- return tcp_gro_complete(skb);
-}
-
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0f0178827259..ab1c08658528 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -317,7 +317,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
key = tp->af_specific->md5_lookup(sk, sk);
if (key != NULL) {
tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
- if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL)
+ if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
BUG();
}
} while (0);
@@ -358,10 +358,8 @@ void tcp_twsk_destructor(struct sock *sk)
#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key) {
- tcp_free_md5sig_pool();
+ if (twsk->tw_md5_key)
kfree_rcu(twsk->tw_md5_key, rcu);
- }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 000000000000..3a7525e6c086
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,332 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TCPv4 GSO/GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct tcphdr *th;
+ unsigned int thlen;
+ unsigned int seq;
+ __be32 delta;
+ unsigned int oldlen;
+ unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ goto out;
+
+ th = tcp_hdr(skb);
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = (u16)~skb->len;
+ __skb_pull(skb, thlen);
+
+ mss = tcp_skb_mss(skb);
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
+ SKB_GSO_MPLS |
+ SKB_GSO_UDP_TUNNEL |
+ 0) ||
+ !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
+ delta = htonl(oldlen + (thlen + mss));
+
+ skb = segs;
+ th = tcp_hdr(skb);
+ seq = ntohl(th->seq);
+
+ newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+
+ do {
+ th->fin = th->psh = 0;
+ th->check = newcheck;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check =
+ csum_fold(csum_partial(skb_transport_header(skb),
+ thlen, skb->csum));
+
+ seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ /* {tcp|sock}_wfree() use exact truesize accounting :
+ * sum(skb->truesize) MUST be exactly be gso_skb->truesize
+ * So we account mss bytes of 'true size' for each segment.
+ * The last segment will contain the remaining.
+ */
+ skb->truesize = mss;
+ gso_skb->truesize -= mss;
+ }
+ skb = skb->next;
+ th = tcp_hdr(skb);
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ swap(gso_skb->truesize, skb->truesize);
+ }
+
+ delta = htonl(oldlen + (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = csum_fold(csum_partial(skb_transport_header(skb),
+ thlen, skb->csum));
+out:
+ return segs;
+}
+EXPORT_SYMBOL(tcp_tso_segment);
+
+struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th;
+ struct tcphdr *th2;
+ unsigned int len;
+ unsigned int thlen;
+ __be32 flags;
+ unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
+ int flush = 1;
+ int i;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ for (; (p = *head); head = &p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ goto found;
+ }
+
+ goto out_check_final;
+
+found:
+ flush = NAPI_GRO_CB(p)->flush;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+ for (i = sizeof(*th); i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
+
+ mss = tcp_skb_mss(p);
+
+ flush |= (len - 1) >= mss;
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+
+ if (flush || skb_gro_receive(head, skb)) {
+ mss = 1;
+ goto out_check_final;
+ }
+
+ p = *head;
+ th2 = tcp_hdr(p);
+ tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+ flush = len < mss;
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
+
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = head;
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+EXPORT_SYMBOL(tcp_gro_receive);
+
+int tcp_gro_complete(struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (th->cwr)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+static int tcp_v4_gso_send_check(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ struct tcphdr *th;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+ th = tcp_hdr(skb);
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+ return 0;
+}
+
+static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ __wsum wsum;
+ __sum16 sum;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
+ skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+
+ case CHECKSUM_NONE:
+ wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb_gro_len(skb), IPPROTO_TCP, 0);
+ sum = csum_fold(skb_checksum(skb,
+ skb_gro_offset(skb),
+ skb_gro_len(skb),
+ wsum));
+ if (sum)
+ goto flush;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+
+ return tcp_gro_receive(head, skb);
+}
+
+static int tcp4_gro_complete(struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
+ iph->saddr, iph->daddr, 0);
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+ return tcp_gro_complete(skb);
+}
+
+static const struct net_offload tcpv4_offload = {
+ .callbacks = {
+ .gso_send_check = tcp_v4_gso_send_check,
+ .gso_segment = tcp_tso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+};
+
+int __init tcpv4_offload_init(void)
+{
+ return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ec335fabd5cc..92fde8d1aa82 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,6 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
+ const struct dst_entry *dst = __sk_dst_get(sk);
if (sysctl_tcp_slow_start_after_idle &&
(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
@@ -170,8 +171,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
+ (!dst || !dst_metric(dst, RTAX_QUICKACK)))
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
@@ -181,6 +183,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
+
+u32 tcp_default_init_rwnd(u32 mss)
+{
+ /* Initial receive window should be twice of TCP_INIT_CWND to
+ * enable proper sending of new unsent data during fast recovery
+ * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
+ * limit when mss is larger than 1460.
+ */
+ u32 init_rwnd = TCP_INIT_CWND * 2;
+
+ if (mss > 1460)
+ init_rwnd = max((1460 * init_rwnd) / mss, 2U);
+ return init_rwnd;
+}
+
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
@@ -230,22 +247,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
}
- /* Set initial window to a value enough for senders starting with
- * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
- * a limit on the initial window when mss is larger than 1460.
- */
if (mss > (1 << *rcv_wscale)) {
- int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
- if (mss > 1460)
- init_cwnd =
- max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
- /* when initializing use the value from init_rcv_wnd
- * rather than the default from above
- */
- if (init_rcv_wnd)
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- else
- *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
}
/* Set the clamp no higher than max representable value */
@@ -2402,6 +2407,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ } else {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
return err;
}
@@ -2523,10 +2530,9 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
- if (tcp_retransmit_skb(sk, skb)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ if (tcp_retransmit_skb(sk, skb))
return;
- }
+
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0bf5d399a03c..766e6bab9113 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -109,6 +109,7 @@
#include <trace/events/udp.h>
#include <linux/static_key.h>
#include <trace/events/skb.h>
+#include <net/busy_poll.h>
#include "udp_impl.h"
struct udp_table udp_table __read_mostly;
@@ -429,7 +430,7 @@ begin:
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = inet_ehashfn(net, daddr, hnum,
- saddr, htons(sport));
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -510,7 +511,7 @@ begin:
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = inet_ehashfn(net, daddr, hnum,
- saddr, htons(sport));
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -799,7 +800,7 @@ send:
/*
* Push out all pending data as one UDP datagram. Socket is locked.
*/
-static int udp_push_pending_frames(struct sock *sk)
+int udp_push_pending_frames(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
@@ -818,6 +819,7 @@ out:
up->pending = 0;
return err;
}
+EXPORT_SYMBOL(udp_push_pending_frames);
int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
@@ -1709,7 +1711,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb);
+ int ret;
+
+ sk_mark_napi_id(sk, skb);
+ ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
@@ -1967,6 +1972,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* Check for false positives due to checksum errors */
if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
!(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
@@ -2284,29 +2291,8 @@ void __init udp_init(void)
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
}
-int udp4_ufo_send_check(struct sk_buff *skb)
-{
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- return -EINVAL;
-
- if (likely(!skb->encapsulation)) {
- const struct iphdr *iph;
- struct udphdr *uh;
-
- iph = ip_hdr(skb);
- uh = udp_hdr(skb);
-
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
- }
- return 0;
-}
-
-static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
int mac_len = skb->mac_len;
@@ -2337,6 +2323,9 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
struct udphdr *uh;
int udp_offset = outer_hlen - tnl_hlen;
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
skb->mac_len = mac_len;
skb_push(skb, outer_hlen);
@@ -2359,59 +2348,8 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
uh->check = CSUM_MANGLED_0;
}
- skb->ip_summed = CHECKSUM_NONE;
skb->protocol = protocol;
} while ((skb = skb->next));
out:
return segs;
}
-
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- unsigned int mss;
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_GRE) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
-
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
- segs = NULL;
- goto out;
- }
-
- /* Fragment the skb. IP headers of the fragments are updated in
- * inet_gso_segment()
- */
- if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
- segs = skb_udp_tunnel_segment(skb, features);
- else {
- int offset;
- __wsum csum;
-
- /* Do software UFO. Complete and fill in the UDP checksum as
- * HW cannot do checksum of UDP packets sent as multiple
- * IP fragments.
- */
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
- segs = skb_segment(skb, features);
- }
-out:
- return segs;
-}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 000000000000..f35eccaa855e
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,100 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+
+static int udp4_ufo_send_check(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ return -EINVAL;
+
+ if (likely(!skb->encapsulation)) {
+ const struct iphdr *iph;
+ struct udphdr *uh;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+ IPPROTO_UDP, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ }
+
+ return 0;
+}
+
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_GRE | SKB_GSO_MPLS) ||
+ !(type & (SKB_GSO_UDP))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ /* Fragment the skb. IP headers of the fragments are updated in
+ * inet_gso_segment()
+ */
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+ segs = skb_udp_tunnel_segment(skb, features);
+ else {
+ int offset;
+ __wsum csum;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ segs = skb_segment(skb, features);
+ }
+out:
+ return segs;
+}
+
+static const struct net_offload udpv4_offload = {
+ .callbacks = {
+ .gso_send_check = udp4_ufo_send_check,
+ .gso_segment = udp4_ufo_fragment,
+ },
+};
+
+int __init udpv4_offload_init(void)
+{
+ return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 05a5df2febc9..06347dbd32c1 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -63,7 +63,7 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
#if IS_ENABLED(CONFIG_IPV6)