aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c11
-rw-r--r--net/ipv4/bpfilter/sockopt.c58
-rw-r--r--net/ipv4/cipso_ipv4.c20
-rw-r--r--net/ipv4/devinet.c90
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/fib_frontend.c8
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fib_trie.c15
-rw-r--r--net/ipv4/fou.c12
-rw-r--r--net/ipv4/gre_demux.c17
-rw-r--r--net/ipv4/icmp.c11
-rw-r--r--net/ipv4/igmp.c104
-rw-r--r--net/ipv4/inet_diag.c10
-rw-r--r--net/ipv4/inet_fragment.c301
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_fragment.c291
-rw-r--r--net/ipv4/ip_gre.c176
-rw-r--r--net/ipv4/ip_input.c12
-rw-r--r--net/ipv4/ip_options.c22
-rw-r--r--net/ipv4/ip_sockglue.c12
-rw-r--r--net/ipv4/ip_tunnel.c108
-rw-r--r--net/ipv4/ip_tunnel_core.c18
-rw-r--r--net/ipv4/ip_vti.c50
-rw-r--r--net/ipv4/ipconfig.c27
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/ipmr.c149
-rw-r--r--net/ipv4/netfilter.c18
-rw-r--r--net/ipv4/netfilter/Kconfig50
-rw-r--r--net/ipv4/netfilter/Makefile7
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c4
-rw-r--r--net/ipv4/netfilter/iptable_nat.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c387
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c196
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c7
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c9
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c87
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c90
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c82
-rw-r--r--net/ipv4/netlink.c17
-rw-r--r--net/ipv4/route.c88
-rw-r--r--net/ipv4/tcp.c108
-rw-r--r--net/ipv4/tcp_bbr.c180
-rw-r--r--net/ipv4/tcp_input.c43
-rw-r--r--net/ipv4/tcp_ipv4.c9
-rw-r--r--net/ipv4/tcp_minisocks.c55
-rw-r--r--net/ipv4/tcp_output.c83
-rw-r--r--net/ipv4/tcp_timer.c89
-rw-r--r--net/ipv4/udp.c24
-rw-r--r--net/ipv4/udp_impl.h1
-rw-r--r--net/ipv4/udp_tunnel.c15
-rw-r--r--net/ipv4/udplite.c1
51 files changed, 1393 insertions, 1794 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0dfb72c46671..eab3ebde981e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1385,6 +1385,15 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);
+static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
+ return ERR_PTR(-EINVAL);
+
+ return inet_gso_segment(skb, features);
+}
+
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
@@ -1861,7 +1870,7 @@ static struct packet_offload ip_packet_offload __read_mostly = {
static const struct net_offload ipip_offload = {
.callbacks = {
- .gso_segment = inet_gso_segment,
+ .gso_segment = ipip_gso_segment,
.gro_receive = ipip_gro_receive,
.gro_complete = ipip_gro_complete,
},
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 5e04ed25bc0e..1e976bb93d99 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -1,28 +1,54 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bpfilter.h>
#include <uapi/linux/bpf.h>
#include <linux/wait.h>
#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/file.h>
-int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
- char __user *optval,
- unsigned int optlen, bool is_set);
-EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+struct bpfilter_umh_ops bpfilter_ops;
+EXPORT_SYMBOL_GPL(bpfilter_ops);
+
+static void bpfilter_umh_cleanup(struct umh_info *info)
+{
+ mutex_lock(&bpfilter_ops.lock);
+ bpfilter_ops.stop = true;
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+ info->pid = 0;
+ mutex_unlock(&bpfilter_ops.lock);
+}
static int bpfilter_mbox_request(struct sock *sk, int optname,
char __user *optval,
unsigned int optlen, bool is_set)
{
- if (!bpfilter_process_sockopt) {
- int err = request_module("bpfilter");
+ int err;
+ mutex_lock(&bpfilter_ops.lock);
+ if (!bpfilter_ops.sockopt) {
+ mutex_unlock(&bpfilter_ops.lock);
+ err = request_module("bpfilter");
+ mutex_lock(&bpfilter_ops.lock);
if (err)
- return err;
- if (!bpfilter_process_sockopt)
- return -ECHILD;
+ goto out;
+ if (!bpfilter_ops.sockopt) {
+ err = -ECHILD;
+ goto out;
+ }
+ }
+ if (bpfilter_ops.stop) {
+ err = bpfilter_ops.start();
+ if (err)
+ goto out;
}
- return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+ err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
+out:
+ mutex_unlock(&bpfilter_ops.lock);
+ return err;
}
int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
@@ -41,3 +67,15 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
return bpfilter_mbox_request(sk, optname, optval, len, false);
}
+
+static int __init bpfilter_sockopt_init(void)
+{
+ mutex_init(&bpfilter_ops.lock);
+ bpfilter_ops.stop = true;
+ bpfilter_ops.info.cmdline = "bpfilter_umh";
+ bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
+
+ return 0;
+}
+
+module_init(bpfilter_sockopt_init);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 777fa3b7fb13..f0165c5f376b 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -667,7 +667,8 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
case CIPSO_V4_MAP_PASS:
return 0;
case CIPSO_V4_MAP_TRANS:
- if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+ if ((level < doi_def->map.std->lvl.cipso_size) &&
+ (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL))
return 0;
break;
}
@@ -1735,13 +1736,26 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
return;
+ /*
+ * We might be called above the IP layer,
+ * so we can not use icmp_send and IPCB here.
+ */
+
+ memset(opt, 0, sizeof(struct ip_options));
+ opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
+ if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL))
+ return;
+
if (gateway)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
else
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
}
/**
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04ba321ae5ce..eb514f312e6f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1826,7 +1826,7 @@ put_tgt_net:
if (fillargs.netnsid >= 0)
put_net(tgt_net);
- return err < 0 ? err : skb->len;
+ return skb->len ? : err;
}
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
@@ -2063,13 +2063,49 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
+static int inet_netconf_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb,
+ NETCONFA_MAX, devconf_ipv4_policy, extack);
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb,
+ NETCONFA_MAX, devconf_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETCONFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETCONFA_IFINDEX:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1];
- struct netconfmsg *ncm;
struct sk_buff *skb;
struct ipv4_devconf *devconf;
struct in_device *in_dev;
@@ -2077,9 +2113,8 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
int ifindex;
int err;
- err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
- devconf_ipv4_policy, extack);
- if (err < 0)
+ err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
+ if (err)
goto errout;
err = -EINVAL;
@@ -2556,32 +2591,34 @@ static __net_init int devinet_init_net(struct net *net)
int err;
struct ipv4_devconf *all, *dflt;
#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table *tbl;
struct ctl_table_header *forw_hdr;
#endif
err = -ENOMEM;
- all = &ipv4_devconf;
- dflt = &ipv4_devconf_dflt;
+ all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
- if (!net_eq(net, &init_net)) {
- all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
- if (!all)
- goto err_alloc_all;
-
- dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
- if (!dflt)
- goto err_alloc_dflt;
+ dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
#ifdef CONFIG_SYSCTL
- tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
- if (!tbl)
- goto err_alloc_ctl;
+ tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (!tbl)
+ goto err_alloc_ctl;
- tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
- tbl[0].extra1 = all;
- tbl[0].extra2 = net;
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
#endif
+
+ if ((!IS_ENABLED(CONFIG_SYSCTL) ||
+ sysctl_devconf_inherit_init_net != 2) &&
+ !net_eq(net, &init_net)) {
+ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf));
+ memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt));
}
#ifdef CONFIG_SYSCTL
@@ -2611,15 +2648,12 @@ err_reg_ctl:
err_reg_dflt:
__devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
- if (tbl != ctl_forward_entry)
- kfree(tbl);
+ kfree(tbl);
err_alloc_ctl:
#endif
- if (dflt != &ipv4_devconf_dflt)
- kfree(dflt);
+ kfree(dflt);
err_alloc_dflt:
- if (all != &ipv4_devconf)
- kfree(all);
+ kfree(all);
err_alloc_all:
return err;
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 5459f41fc26f..10e809b296ec 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -328,7 +328,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
skb->len += tailen;
skb->data_len += tailen;
skb->truesize += tailen;
- if (sk)
+ if (sk && sk_fullsock(sk))
refcount_add(tailen, &sk->sk_wmem_alloc);
goto out;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6df95be96311..ed14ec245584 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -203,7 +203,7 @@ static void fib_flush(struct net *net)
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
- flushed += fib_table_flush(net, tb);
+ flushed += fib_table_flush(net, tb, false);
}
if (flushed)
@@ -710,6 +710,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_GATEWAY:
cfg->fc_gw = nla_get_be32(attr);
break;
+ case RTA_VIA:
+ NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute");
+ err = -EINVAL;
+ goto errout;
case RTA_PRIORITY:
cfg->fc_priority = nla_get_u32(attr);
break;
@@ -1463,7 +1467,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
- fib_table_flush(net, tb);
+ fib_table_flush(net, tb, true);
fib_free_table(tb);
}
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5022bc63863a..8e185b5a2bf6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1072,7 +1072,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
}
- fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
if (!fi)
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 237c9f72b265..a573e37e0615 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1856,7 +1856,7 @@ void fib_table_flush_external(struct fib_table *tb)
}
/* Caller must hold RTNL. */
-int fib_table_flush(struct net *net, struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1904,8 +1904,17 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
- tb->tb_id != fa->tb_id) {
+ if (!fi || tb->tb_id != fa->tb_id ||
+ (!(fi->fib_flags & RTNH_F_DEAD) &&
+ !fib_props[fa->fa_type].error)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ /* Do not flush error routes if network namespace is
+ * not being dismantled
+ */
+ if (!flush_all && fib_props[fa->fa_type].error) {
slen = fa->fa_slen;
continue;
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 0c9f171fb085..437070d1ffb1 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -1020,10 +1020,11 @@ static int gue_err(struct sk_buff *skb, u32 info)
{
int transport_offset = skb_transport_offset(skb);
struct guehdr *guehdr;
- size_t optlen;
+ size_t len, optlen;
int ret;
- if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, len))
return -EINVAL;
guehdr = (struct guehdr *)&udp_hdr(skb)[1];
@@ -1058,6 +1059,10 @@ static int gue_err(struct sk_buff *skb, u32 info)
optlen = guehdr->hlen << 2;
+ if (!pskb_may_pull(skb, len + optlen))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
if (validate_gue_flags(guehdr, optlen))
return -EINVAL;
@@ -1065,7 +1070,8 @@ static int gue_err(struct sk_buff *skb, u32 info)
* recursion. Besides, this kind of encapsulation can't even be
* configured currently. Discard this.
*/
- if (guehdr->proto_ctype == IPPROTO_UDP)
+ if (guehdr->proto_ctype == IPPROTO_UDP ||
+ guehdr->proto_ctype == IPPROTO_UDPLITE)
return -EOPNOTSUPP;
skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index a4bf22ee3aed..7c4a41dc04bb 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
+#include <net/erspan.h>
#include <net/icmp.h>
#include <net/route.h>
@@ -119,6 +120,22 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
hdr_len += 4;
}
tpi->hdr_len = hdr_len;
+
+ /* ERSPAN ver 1 and 2 protocol sets GRE key field
+ * to 0 and sets the configured key in the
+ * inner erspan header field
+ */
+ if (greh->protocol == htons(ETH_P_ERSPAN) ||
+ greh->protocol == htons(ETH_P_ERSPAN2)) {
+ struct erspan_base_hdr *ershdr;
+
+ if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
+ return -EINVAL;
+
+ ershdr = (struct erspan_base_hdr *)options;
+ tpi->key = cpu_to_be32(get_session_id(ershdr));
+ }
+
return hdr_len;
}
EXPORT_SYMBOL(gre_parse_header);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 065997f414e6..f3a5893b1e86 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -570,7 +570,8 @@ relookup_failed:
* MUST reply to only the first fragment.
*/
-void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
+ const struct ip_options *opt)
{
struct iphdr *iph;
int room;
@@ -691,7 +692,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
goto out_unlock;
@@ -742,7 +743,7 @@ out_bh_enable:
local_bh_enable();
out:;
}
-EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(__icmp_send);
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
@@ -1245,9 +1246,7 @@ static int __net_init icmp_sk_init(struct net *net)
return 0;
fail:
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
+ icmp_sk_exit(net);
return err;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 765b2b32c4a4..6c2febc39dca 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -159,7 +159,8 @@ static int unsolicited_report_interval(struct in_device *in_dev)
return interval_jiffies;
}
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp);
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
@@ -1145,7 +1146,8 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
/*
* deleted ip_mc_list manipulation
*/
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp)
{
struct ip_mc_list *pmc;
struct net *net = dev_net(in_dev->dev);
@@ -1156,7 +1158,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+ pmc = kzalloc(sizeof(*pmc), gfp);
if (!pmc)
return;
spin_lock_init(&pmc->lock);
@@ -1261,7 +1263,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
}
#endif
-static void igmp_group_dropped(struct ip_mc_list *im)
+static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
@@ -1292,13 +1294,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
return;
}
/* IGMPv3 */
- igmpv3_add_delrec(in_dev, im);
+ igmpv3_add_delrec(in_dev, im, gfp);
igmp_ifc_event(in_dev);
}
#endif
}
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ __igmp_group_dropped(im, GFP_KERNEL);
+}
+
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
@@ -1400,8 +1407,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
/*
* A socket has joined a multicast group on device dev.
*/
-static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
- unsigned int mode)
+static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+ unsigned int mode, gfp_t gfp)
{
struct ip_mc_list *im;
@@ -1415,7 +1422,7 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
}
}
- im = kzalloc(sizeof(*im), GFP_KERNEL);
+ im = kzalloc(sizeof(*im), gfp);
if (!im)
goto out;
@@ -1448,6 +1455,12 @@ out:
return;
}
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
+{
+ ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
+}
+EXPORT_SYMBOL(__ip_mc_inc_group);
+
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
__ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
@@ -1493,22 +1506,22 @@ static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
len += sizeof(struct igmpv3_report);
- return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+ return ip_mc_may_pull(skb, len) ? 0 : -EINVAL;
}
static int ip_mc_check_igmp_query(struct sk_buff *skb)
{
- unsigned int len = skb_transport_offset(skb);
-
- len += sizeof(struct igmphdr);
- if (skb->len < len)
- return -EINVAL;
+ unsigned int transport_len = ip_transport_len(skb);
+ unsigned int len;
/* IGMPv{1,2}? */
- if (skb->len != len) {
+ if (transport_len != sizeof(struct igmphdr)) {
/* or IGMPv3? */
- len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
- if (skb->len < len || !pskb_may_pull(skb, len))
+ if (transport_len < sizeof(struct igmpv3_query))
+ return -EINVAL;
+
+ len = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
+ if (!ip_mc_may_pull(skb, len))
return -EINVAL;
}
@@ -1528,7 +1541,6 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
case IGMP_HOST_LEAVE_MESSAGE:
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
- /* fall through */
return 0;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
return ip_mc_check_igmp_reportv3(skb);
@@ -1544,47 +1556,29 @@ static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
return skb_checksum_simple_validate(skb);
}
-static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
-
+static int ip_mc_check_igmp_csum(struct sk_buff *skb)
{
- struct sk_buff *skb_chk;
- unsigned int transport_len;
unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
- int ret = -EINVAL;
+ unsigned int transport_len = ip_transport_len(skb);
+ struct sk_buff *skb_chk;
- transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
skb_chk = skb_checksum_trimmed(skb, transport_len,
ip_mc_validate_checksum);
if (!skb_chk)
- goto err;
-
- if (!pskb_may_pull(skb_chk, len))
- goto err;
-
- ret = ip_mc_check_igmp_msg(skb_chk);
- if (ret)
- goto err;
-
- if (skb_trimmed)
- *skb_trimmed = skb_chk;
- /* free now unneeded clone */
- else if (skb_chk != skb)
- kfree_skb(skb_chk);
-
- ret = 0;
+ return -EINVAL;
-err:
- if (ret && skb_chk && skb_chk != skb)
+ if (skb_chk != skb)
kfree_skb(skb_chk);
- return ret;
+ return 0;
}
/**
* ip_mc_check_igmp - checks whether this is a sane IGMP packet
* @skb: the skb to validate
- * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
*
* Checks whether an IPv4 packet is a valid IGMP packet. If so sets
* skb transport header accordingly and returns zero.
@@ -1594,18 +1588,10 @@ err:
* -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
* -ENOMEM: A memory allocation failure happened.
*
- * Optionally, an skb pointer might be provided via skb_trimmed (or set it
- * to NULL): After parsing an IGMP packet successfully it will point to
- * an skb which has its tail aligned to the IP packet end. This might
- * either be the originally provided skb or a trimmed, cloned version if
- * the skb frame had data beyond the IP packet. A cloned skb allows us
- * to leave the original skb and its full frame unchanged (which might be
- * desirable for layer 2 frame jugglers).
- *
* Caller needs to set the skb network header and free any returned skb if it
* differs from the provided skb.
*/
-int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+int ip_mc_check_igmp(struct sk_buff *skb)
{
int ret = ip_mc_check_iphdr(skb);
@@ -1615,7 +1601,11 @@ int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
return -ENOMSG;
- return __ip_mc_check_igmp(skb, skb_trimmed);
+ ret = ip_mc_check_igmp_csum(skb);
+ if (ret < 0)
+ return ret;
+
+ return ip_mc_check_igmp_msg(skb);
}
EXPORT_SYMBOL(ip_mc_check_igmp);
@@ -1656,7 +1646,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
* A socket has left a multicast group on device dev
*/
-void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
struct ip_mc_list *i;
struct ip_mc_list __rcu **ip;
@@ -1671,7 +1661,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
ip_mc_hash_remove(in_dev, i);
*ip = i->next_rcu;
in_dev->mc_count--;
- igmp_group_dropped(i);
+ __igmp_group_dropped(i, gfp);
ip_mc_clear_src(i);
if (!in_dev->dead)
@@ -1684,7 +1674,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
-EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(__ip_mc_dec_group);
/* Device changing type */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 1a4e9ff02762..5731670c560b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -108,6 +108,7 @@ static size_t inet_sk_attr_size(struct sock *sk,
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ nla_total_size(4) /* INET_DIAG_MARK */
+ + nla_total_size(4) /* INET_DIAG_CLASS_ID */
+ nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -287,12 +288,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout;
}
- if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) {
+ if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+ ext & (1 << (INET_DIAG_TCLASS - 1))) {
u32 classid = 0;
#ifdef CONFIG_SOCK_CGROUP_DATA
classid = sock_cgroup_classid(&sk->sk_cgrp_data);
#endif
+ /* Fallback to socket priority if class id isn't set.
+ * Classful qdiscs use it as direct reference to class.
+ * For cgroup2 classid is always zero.
+ */
+ if (!classid)
+ classid = sk->sk_priority;
if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
goto errout;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 760a9e52e02b..737808e27f8b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,62 @@
#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+ struct inet6_skb_parm h6;
+ };
+ struct sk_buff *next_frag;
+ int frag_run_len;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void fragcb_clear(struct sk_buff *skb)
+{
+ RB_CLEAR_NODE(&skb->rbnode);
+ FRAG_CB(skb)->next_frag = NULL;
+ FRAG_CB(skb)->frag_run_len = skb->len;
+}
+
+/* Append skb to the last "run". */
+static void fragrun_append_to_last(struct inet_frag_queue *q,
+ struct sk_buff *skb)
+{
+ fragcb_clear(skb);
+
+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
+ FRAG_CB(q->fragments_tail)->next_frag = skb;
+ q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb. */
+static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+ fragcb_clear(skb);
+
+ if (q->last_run_head)
+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+ &q->last_run_head->rbnode.rb_right);
+ else
+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+ q->fragments_tail = skb;
+ q->last_run_head = skb;
+}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
@@ -123,9 +179,30 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
kmem_cache_free(f->frags_cachep, q);
}
+unsigned int inet_frag_rbtree_purge(struct rb_root *root)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ while (skb) {
+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
+
+ sum += skb->truesize;
+ kfree_skb(skb);
+ skb = next;
+ }
+ }
+ return sum;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
void inet_frag_destroy(struct inet_frag_queue *q)
{
- struct sk_buff *fp;
struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
struct inet_frags *f;
@@ -134,20 +211,9 @@ void inet_frag_destroy(struct inet_frag_queue *q)
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
- fp = q->fragments;
nf = q->net;
f = nf->f;
- if (fp) {
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
- } else {
- sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
- }
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
@@ -224,3 +290,212 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
return fq;
}
EXPORT_SYMBOL(inet_frag_find);
+
+int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
+ int offset, int end)
+{
+ struct sk_buff *last = q->fragments_tail;
+
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * Duplicates, however, should be ignored (i.e. skb dropped, but the
+ * queue/fragments kept for later reassembly).
+ */
+ if (!last)
+ fragrun_create(q, skb); /* First fragment. */
+ else if (last->ip_defrag_offset + last->len < end) {
+ /* This is the common case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < last->ip_defrag_offset + last->len)
+ return IPFRAG_OVERLAP;
+ if (offset == last->ip_defrag_offset + last->len)
+ fragrun_append_to_last(q, skb);
+ else
+ fragrun_create(q, skb);
+ } else {
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
+ struct rb_node **rbn, *parent;
+
+ rbn = &q->rb_fragments.rb_node;
+ do {
+ struct sk_buff *curr;
+ int curr_run_end;
+
+ parent = *rbn;
+ curr = rb_to_skb(parent);
+ curr_run_end = curr->ip_defrag_offset +
+ FRAG_CB(curr)->frag_run_len;
+ if (end <= curr->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= curr_run_end)
+ rbn = &parent->rb_right;
+ else if (offset >= curr->ip_defrag_offset &&
+ end <= curr_run_end)
+ return IPFRAG_DUP;
+ else
+ return IPFRAG_OVERLAP;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb.
+ */
+ fragcb_clear(skb);
+ rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+ }
+
+ skb->ip_defrag_offset = offset;
+
+ return IPFRAG_OK;
+}
+EXPORT_SYMBOL(inet_frag_queue_insert);
+
+void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
+ struct sk_buff *parent)
+{
+ struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
+ struct sk_buff **nextp;
+ int delta;
+
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
+ if (!fp)
+ return NULL;
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(parent)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &q->rb_fragments);
+ if (q->fragments_tail == skb)
+ q->fragments_tail = fp;
+ skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ consume_skb(head);
+ head = skb;
+ }
+ WARN_ON(head->ip_defrag_offset != 0);
+
+ delta = -head->truesize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ return NULL;
+
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(q->net, delta);
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments.
+ */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ return NULL;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->data_len = head->data_len - plen;
+ clone->len = clone->data_len;
+ head->truesize += clone->truesize;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(q->net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
+ }
+
+ return nextp;
+}
+EXPORT_SYMBOL(inet_frag_reasm_prepare);
+
+void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
+ void *reasm_data)
+{
+ struct sk_buff **nextp = (struct sk_buff **)reasm_data;
+ struct rb_node *rbn;
+ struct sk_buff *fp;
+
+ skb_push(head, head->data - skb_network_header(head));
+
+ /* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &q->rb_fragments);
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ fp->sk = NULL;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ fp = FRAG_CB(fp)->next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &q->rb_fragments);
+ rbn = rbnext;
+ }
+ }
+ sub_frag_mem_limit(q->net, head->truesize);
+
+ *nextp = NULL;
+ skb_mark_not_on_list(head);
+ head->prev = NULL;
+ head->tstamp = q->stamp;
+}
+EXPORT_SYMBOL(inet_frag_reasm_finish);
+
+struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
+{
+ struct sk_buff *head, *skb;
+
+ head = skb_rb_first(&q->rb_fragments);
+ if (!head)
+ return NULL;
+ skb = FRAG_CB(head)->next_frag;
+ if (skb)
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ else
+ rb_erase(&head->rbnode, &q->rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+
+ if (head == q->fragments_tail)
+ q->fragments_tail = NULL;
+
+ sub_frag_mem_limit(q->net, head->truesize);
+
+ return head;
+}
+EXPORT_SYMBOL(inet_frag_pull_head);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d757b9642d0d..be778599bfed 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -216,6 +216,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
+ p->n_redirects = 0;
/* 60*HZ is arbitrary, but chosen enough high so that the first
* calculation of tokens is at its maximum.
*/
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 867be8f7f1fa..cf2b0a6a3337 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,57 +57,6 @@
*/
static const char ip_frag_cache_name[] = "ip4-frags";
-/* Use skb->cb to track consecutive/adjacent fragments coming at
- * the end of the queue. Nodes in the rb-tree queue will
- * contain "runs" of one or more adjacent fragments.
- *
- * Invariants:
- * - next_frag is NULL at the tail of a "run";
- * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
- */
-struct ipfrag_skb_cb {
- struct inet_skb_parm h;
- struct sk_buff *next_frag;
- int frag_run_len;
-};
-
-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
-
-static void ip4_frag_init_run(struct sk_buff *skb)
-{
- BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
-
- FRAG_CB(skb)->next_frag = NULL;
- FRAG_CB(skb)->frag_run_len = skb->len;
-}
-
-/* Append skb to the last "run". */
-static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
- struct sk_buff *skb)
-{
- RB_CLEAR_NODE(&skb->rbnode);
- FRAG_CB(skb)->next_frag = NULL;
-
- FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
- FRAG_CB(q->fragments_tail)->next_frag = skb;
- q->fragments_tail = skb;
-}
-
-/* Create a new "run" with the skb. */
-static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
-{
- if (q->last_run_head)
- rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
- &q->last_run_head->rbnode.rb_right);
- else
- rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
- rb_insert_color(&skb->rbnode, &q->rb_fragments);
-
- ip4_frag_init_run(skb);
- q->fragments_tail = skb;
- q->last_run_head = skb;
-}
-
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct inet_frag_queue q;
@@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t)
* pull the head out of the tree in order to be able to
* deal with head->dev.
*/
- if (qp->q.fragments) {
- head = qp->q.fragments;
- qp->q.fragments = head->next;
- } else {
- head = skb_rb_first(&qp->q.rb_fragments);
- if (!head)
- goto out;
- if (FRAG_CB(head)->next_frag)
- rb_replace_node(&head->rbnode,
- &FRAG_CB(head)->next_frag->rbnode,
- &qp->q.rb_fragments);
- else
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
- memset(&head->rbnode, 0, sizeof(head->rbnode));
- barrier();
- }
- if (head == qp->q.fragments_tail)
- qp->q.fragments_tail = NULL;
-
- sub_frag_mem_limit(qp->q.net, head->truesize);
-
+ head = inet_frag_pull_head(&qp->q);
+ if (!head)
+ goto out;
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -330,7 +261,6 @@ static int ip_frag_reinit(struct ipq *qp)
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -344,12 +274,10 @@ static int ip_frag_reinit(struct ipq *qp)
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct rb_node **rbn, *parent;
- struct sk_buff *skb1, *prev_tail;
- int ihl, end, skb1_run_end;
+ int ihl, end, flags, offset;
+ struct sk_buff *prev_tail;
struct net_device *dev;
unsigned int fragsize;
- int flags, offset;
int err = -ENOENT;
u8 ecn;
@@ -413,62 +341,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
/* Makes sure compiler wont do silly aliasing games */
barrier();
- /* RFC5722, Section 4, amended by Errata ID : 3089
- * When reassembling an IPv6 datagram, if
- * one or more its constituent fragments is determined to be an
- * overlapping fragment, the entire datagram (and any constituent
- * fragments) MUST be silently discarded.
- *
- * We do the same here for IPv4 (and increment an snmp counter) but
- * we do not want to drop the whole queue in response to a duplicate
- * fragment.
- */
-
- err = -EINVAL;
- /* Find out where to put this fragment. */
prev_tail = qp->q.fragments_tail;
- if (!prev_tail)
- ip4_frag_create_run(&qp->q, skb); /* First fragment. */
- else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
- /* This is the common case: skb goes to the end. */
- /* Detect and discard overlaps. */
- if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
- goto overlap;
- if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
- ip4_frag_append_to_last_run(&qp->q, skb);
- else
- ip4_frag_create_run(&qp->q, skb);
- } else {
- /* Binary search. Note that skb can become the first fragment,
- * but not the last (covered above).
- */
- rbn = &qp->q.rb_fragments.rb_node;
- do {
- parent = *rbn;
- skb1 = rb_to_skb(parent);
- skb1_run_end = skb1->ip_defrag_offset +
- FRAG_CB(skb1)->frag_run_len;
- if (end <= skb1->ip_defrag_offset)
- rbn = &parent->rb_left;
- else if (offset >= skb1_run_end)
- rbn = &parent->rb_right;
- else if (offset >= skb1->ip_defrag_offset &&
- end <= skb1_run_end)
- goto err; /* No new data, potential duplicate */
- else
- goto overlap; /* Found an overlap */
- } while (*rbn);
- /* Here we have parent properly set, and rbn pointing to
- * one of its NULL left/right children. Insert skb.
- */
- ip4_frag_init_run(skb);
- rb_link_node(&skb->rbnode, parent, rbn);
- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- }
+ err = inet_frag_queue_insert(&qp->q, skb, offset, end);
+ if (err)
+ goto insert_error;
if (dev)
qp->iif = dev->ifindex;
- skb->ip_defrag_offset = offset;
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
@@ -501,10 +380,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
skb_dst_drop(skb);
return -EINPROGRESS;
-overlap:
+insert_error:
+ if (err == IPFRAG_DUP) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
inet_frag_kill(&qp->q);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
kfree_skb(skb);
return err;
@@ -516,13 +401,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
- struct sk_buff **nextp; /* To build frag_list. */
- struct rb_node *rbn;
- int len;
- int ihlen;
- int delta;
- int err;
+ void *reasm_data;
+ int len, err;
u8 ecn;
ipq_kill(qp);
@@ -532,117 +412,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
err = -EINVAL;
goto out_fail;
}
- /* Make the one we just received the head. */
- if (head != skb) {
- fp = skb_clone(skb, GFP_ATOMIC);
- if (!fp)
- goto out_nomem;
- FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
- if (RB_EMPTY_NODE(&skb->rbnode))
- FRAG_CB(prev_tail)->next_frag = fp;
- else
- rb_replace_node(&skb->rbnode, &fp->rbnode,
- &qp->q.rb_fragments);
- if (qp->q.fragments_tail == skb)
- qp->q.fragments_tail = fp;
- skb_morph(skb, head);
- FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
- rb_replace_node(&head->rbnode, &skb->rbnode,
- &qp->q.rb_fragments);
- consume_skb(head);
- head = skb;
- }
- WARN_ON(head->ip_defrag_offset != 0);
-
- /* Allocate a new buffer for the datagram. */
- ihlen = ip_hdrlen(head);
- len = ihlen + qp->q.len;
+ /* Make the one we just received the head. */
+ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
+ if (!reasm_data)
+ goto out_nomem;
+ len = ip_hdrlen(skb) + qp->q.len;
err = -E2BIG;
if (len > 65535)
goto out_oversize;
- delta = - head->truesize;
-
- /* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC))
- goto out_nomem;
-
- delta += head->truesize;
- if (delta)
- add_frag_mem_limit(qp->q.net, delta);
-
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_has_frag_list(head)) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- clone = alloc_skb(0, GFP_ATOMIC);
- if (!clone)
- goto out_nomem;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
- head->truesize += clone->truesize;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(qp->q.net, clone->truesize);
- skb_shinfo(head)->frag_list = clone;
- nextp = &clone->next;
- } else {
- nextp = &skb_shinfo(head)->frag_list;
- }
-
- skb_push(head, head->data - skb_network_header(head));
+ inet_frag_reasm_finish(&qp->q, skb, reasm_data);
- /* Traverse the tree in order, to build frag_list. */
- fp = FRAG_CB(head)->next_frag;
- rbn = rb_next(&head->rbnode);
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
- while (rbn || fp) {
- /* fp points to the next sk_buff in the current run;
- * rbn points to the next run.
- */
- /* Go through the current run. */
- while (fp) {
- *nextp = fp;
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- fp->sk = NULL;
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- fp = FRAG_CB(fp)->next_frag;
- }
- /* Move to the next run. */
- if (rbn) {
- struct rb_node *rbnext = rb_next(rbn);
-
- fp = rb_to_skb(rbn);
- rb_erase(rbn, &qp->q.rb_fragments);
- rbn = rbnext;
- }
- }
- sub_frag_mem_limit(qp->q.net, head->truesize);
-
- *nextp = NULL;
- skb_mark_not_on_list(head);
- head->prev = NULL;
- head->dev = dev;
- head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+ skb->dev = dev;
+ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
- iph = ip_hdr(head);
+ iph = ip_hdr(skb);
iph->tot_len = htons(len);
iph->tos |= ecn;
@@ -655,7 +441,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
* from one very small df-fragment and one large non-df frag.
*/
if (qp->max_df_size == qp->q.max_size) {
- IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
iph->frag_off = htons(IP_DF);
} else {
iph->frag_off = 0;
@@ -664,7 +450,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
ip_send_check(iph);
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -753,28 +538,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_check_defrag);
-unsigned int inet_frag_rbtree_purge(struct rb_root *root)
-{
- struct rb_node *p = rb_first(root);
- unsigned int sum = 0;
-
- while (p) {
- struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
-
- p = rb_next(p);
- rb_erase(&skb->rbnode, root);
- while (skb) {
- struct sk_buff *next = FRAG_CB(skb)->next_frag;
-
- sum += skb->truesize;
- kfree_skb(skb);
- skb = next;
- }
- }
- return sum;
-}
-EXPORT_SYMBOL(inet_frag_rbtree_purge);
-
#ifdef CONFIG_SYSCTL
static int dist_min;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d1d09f3e5f9e..fd219f7bd3ea 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -268,20 +268,11 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
int len;
itn = net_generic(net, erspan_net_id);
- len = gre_hdr_len + sizeof(*ershdr);
-
- /* Check based hdr len */
- if (unlikely(!pskb_may_pull(skb, len)))
- return PACKET_REJECT;
iph = ip_hdr(skb);
ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
ver = ershdr->ver;
- /* The original GRE header does not have key field,
- * Use ERSPAN 10-bit session ID as key.
- */
- tpi->key = cpu_to_be32(get_session_id(ershdr));
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
tpi->flags | TUNNEL_KEY,
iph->saddr, iph->daddr, tpi->key);
@@ -458,81 +449,14 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum)
return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
-static struct rtable *gre_get_rt(struct sk_buff *skb,
- struct net_device *dev,
- struct flowi4 *fl,
- const struct ip_tunnel_key *key)
-{
- struct net *net = dev_net(dev);
-
- memset(fl, 0, sizeof(*fl));
- fl->daddr = key->u.ipv4.dst;
- fl->saddr = key->u.ipv4.src;
- fl->flowi4_tos = RT_TOS(key->tos);
- fl->flowi4_mark = skb->mark;
- fl->flowi4_proto = IPPROTO_GRE;
-
- return ip_route_output_key(net, fl);
-}
-
-static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
- struct net_device *dev,
- struct flowi4 *fl,
- int tunnel_hlen)
-{
- struct ip_tunnel_info *tun_info;
- const struct ip_tunnel_key *key;
- struct rtable *rt = NULL;
- int min_headroom;
- bool use_cache;
- int err;
-
- tun_info = skb_tunnel_info(skb);
- key = &tun_info->key;
- use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
-
- if (use_cache)
- rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
- if (!rt) {
- rt = gre_get_rt(skb, dev, fl, key);
- if (IS_ERR(rt))
- goto err_free_skb;
- if (use_cache)
- dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
- fl->saddr);
- }
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + tunnel_hlen + sizeof(struct iphdr);
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
- return rt;
-
-err_free_rt:
- ip_rt_put(rt);
-err_free_skb:
- kfree_skb(skb);
- dev->stats.tx_dropped++;
- return NULL;
-}
-
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
- struct rtable *rt = NULL;
- struct flowi4 fl;
int tunnel_hlen;
- __be16 df, flags;
+ __be16 flags;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -542,13 +466,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
key = &tun_info->key;
tunnel_hlen = gre_calc_hlen(key->tun_flags);
- rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
- if (!rt)
- return;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
/* Push Tunnel header. */
if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
- goto err_free_rt;
+ goto err_free_skb;
flags = tun_info->key.tun_flags &
(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
@@ -556,32 +479,25 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel_id_to_key32(tun_info->key.tun_id),
(flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
- df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
- iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
return;
-err_free_rt:
- ip_rt_put(rt);
err_free_skb:
kfree_skb(skb);
dev->stats.tx_dropped++;
}
-static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
- __be16 proto)
+static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
- struct rtable *rt = NULL;
bool truncate = false;
- struct flowi4 fl;
+ __be16 proto;
int tunnel_hlen;
int version;
- __be16 df;
int nhoff;
int thoff;
@@ -592,21 +508,20 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
key = &tun_info->key;
if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
- goto err_free_rt;
+ goto err_free_skb;
md = ip_tunnel_info_opts(tun_info);
if (!md)
- goto err_free_rt;
+ goto err_free_skb;
/* ERSPAN has fixed 8 byte GRE header */
version = md->version;
tunnel_hlen = 8 + erspan_hdr_len(version);
- rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
- if (!rt)
- return;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
if (gre_handle_offloads(skb, false))
- goto err_free_rt;
+ goto err_free_skb;
if (skb->len > dev->mtu + dev->hard_header_len) {
pskb_trim(skb, dev->mtu + dev->hard_header_len);
@@ -626,27 +541,25 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
ntohl(md->u.index), truncate, true);
+ proto = htons(ETH_P_ERSPAN);
} else if (version == 2) {
erspan_build_header_v2(skb,
ntohl(tunnel_id_to_key32(key->tun_id)),
md->u.md2.dir,
get_hwid(&md->u.md2),
truncate, true);
+ proto = htons(ETH_P_ERSPAN2);
} else {
- goto err_free_rt;
+ goto err_free_skb;
}
gre_build_header(skb, 8, TUNNEL_SEQ,
- htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
+ proto, 0, htonl(tunnel->o_seqno++));
- df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
- iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
return;
-err_free_rt:
- ip_rt_put(rt);
err_free_skb:
kfree_skb(skb);
dev->stats.tx_dropped++;
@@ -655,13 +568,18 @@ err_free_skb:
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct ip_tunnel_key *key;
struct rtable *rt;
struct flowi4 fl4;
if (ip_tunnel_info_af(info) != AF_INET)
return -EINVAL;
- rt = gre_get_rt(skb, dev, &fl4, &info->key);
+ key = &info->key;
+ ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id), key->tos, 0,
+ skb->mark, skb_get_hash(skb));
+ rt = ip_route_output_key(dev_net(dev), &fl4);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -721,12 +639,13 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
bool truncate = false;
+ __be16 proto;
if (!pskb_inet_may_pull(skb))
goto free_skb;
if (tunnel->collect_md) {
- erspan_fb_xmit(skb, dev, skb->protocol);
+ erspan_fb_xmit(skb, dev);
return NETDEV_TX_OK;
}
@@ -742,19 +661,22 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
}
/* Push ERSPAN header */
- if (tunnel->erspan_ver == 1)
+ if (tunnel->erspan_ver == 1) {
erspan_build_header(skb, ntohl(tunnel->parms.o_key),
tunnel->index,
truncate, true);
- else if (tunnel->erspan_ver == 2)
+ proto = htons(ETH_P_ERSPAN);
+ } else if (tunnel->erspan_ver == 2) {
erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
tunnel->dir, tunnel->hwid,
truncate, true);
- else
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
goto free_skb;
+ }
tunnel->parms.o_flags &= ~TUNNEL_KEY;
- __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+ __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
return NETDEV_TX_OK;
free_skb:
@@ -1459,12 +1381,31 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm *p = &t->parms;
+ __be16 o_flags = p->o_flags;
+
+ if (t->erspan_ver == 1 || t->erspan_ver == 2) {
+ if (!t->collect_md)
+ o_flags |= TUNNEL_KEY;
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+ goto nla_put_failure;
+
+ if (t->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+ goto nla_put_failure;
+ }
+ }
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
nla_put_be16(skb, IFLA_GRE_IFLAGS,
gre_tnl_flags_to_gre_flags(p->i_flags)) ||
nla_put_be16(skb, IFLA_GRE_OFLAGS,
- gre_tnl_flags_to_gre_flags(p->o_flags)) ||
+ gre_tnl_flags_to_gre_flags(o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1494,19 +1435,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
- goto nla_put_failure;
-
- if (t->erspan_ver == 1) {
- if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
- goto nla_put_failure;
- } else if (t->erspan_ver == 2) {
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
- goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
- goto nla_put_failure;
- }
-
return 0;
nla_put_failure:
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 26921f6b3b92..ecce2dc78f17 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,11 +307,10 @@ drop:
}
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
- struct net_device *dev = skb->dev;
struct rtable *rt;
int err;
@@ -400,6 +399,7 @@ drop_error:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
int ret;
/* if ingress device is enslaved to an L3 master device pass the
@@ -409,7 +409,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb);
+ ret = ip_rcv_finish_core(net, sk, skb, dev);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -429,7 +429,6 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
-
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
@@ -488,6 +487,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto drop;
}
+ iph = ip_hdr(skb);
skb->transport_header = skb->network_header + iph->ihl*4;
/* Remove any debris in the socket control block */
@@ -520,6 +520,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
skb = ip_rcv_core(skb, net);
if (skb == NULL)
return NET_RX_DROP;
+
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
@@ -544,6 +545,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
struct dst_entry *dst;
skb_list_del_init(skb);
@@ -553,7 +555,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ed194d46c00e..32a35043c9f5 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -251,8 +251,9 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
* If opt == NULL, then skb->data should point to IP header.
*/
-int ip_options_compile(struct net *net,
- struct ip_options *opt, struct sk_buff *skb)
+int __ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb,
+ __be32 *info)
{
__be32 spec_dst = htonl(INADDR_ANY);
unsigned char *pp_ptr = NULL;
@@ -468,11 +469,22 @@ eol:
return 0;
error:
- if (skb) {
- icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
- }
+ if (info)
+ *info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
}
+
+int ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb)
+{
+ int ret;
+ __be32 info;
+
+ ret = __ip_options_compile(net, opt, skb, &info);
+ if (ret != 0 && skb)
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, info);
+ return ret;
+}
EXPORT_SYMBOL(ip_options_compile);
/*
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fffcc130900e..82f341e84fae 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -148,19 +148,17 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
+ __be16 _ports[2], *ports;
struct sockaddr_in sin;
- __be16 *ports;
- int end;
-
- end = skb_transport_offset(skb) + 4;
- if (end > 0 && !pskb_may_pull(skb, end))
- return;
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
- ports = (__be16 *)skb_transport_header(skb);
+ ports = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_ports), &_ports);
+ if (!ports)
+ return;
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ip_hdr(skb)->daddr;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c4f5602308ed..2756fb725bf0 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -310,7 +310,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
RT_TOS(iph->tos), tunnel->parms.link,
- tunnel->fwmark);
+ tunnel->fwmark, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
@@ -501,15 +501,19 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
- const struct iphdr *inner_iph)
+ const struct iphdr *inner_iph,
+ int tunnel_hlen, __be32 dst, bool md)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int pkt_size;
int mtu;
+ tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
+ pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
+
if (df)
mtu = dst_mtu(&rt->dst) - dev->hard_header_len
- - sizeof(struct iphdr) - tunnel->hlen;
+ - sizeof(struct iphdr) - tunnel_hlen;
else
mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
@@ -527,11 +531,13 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+ __be32 daddr;
+
+ daddr = md ? dst : tunnel->parms.iph.daddr;
if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ if ((daddr && !ipv4_is_multicast(daddr)) ||
rt6->rt6i_dst.plen == 128) {
rt6->rt6i_flags |= RTF_MODIFIED;
dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
@@ -548,17 +554,19 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
-void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 proto, int tunnel_hlen)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
u32 headroom = sizeof(struct iphdr);
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
const struct iphdr *inner_iph;
- struct rtable *rt;
+ struct rtable *rt = NULL;
struct flowi4 fl4;
__be16 df = 0;
u8 tos, ttl;
+ bool use_cache;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -574,20 +582,39 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
else if (skb->protocol == htons(ETH_P_IPV6))
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
- ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
- RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
+ ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
+ 0, skb->mark, skb_get_hash(skb));
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
goto tx_error;
- rt = ip_route_output_key(tunnel->net, &fl4);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
+
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
+
+ if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ df = htons(IP_DF);
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
+ key->u.ipv4.dst, true)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = key->ttl;
if (ttl == 0) {
@@ -598,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
- if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
- df = htons(IP_DF);
- else if (skb->protocol == htons(ETH_P_IP))
+
+ if (!df && skb->protocol == htons(ETH_P_IP))
df = inner_iph->frag_off & htons(IP_DF);
+
headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
if (headroom > dev->needed_headroom)
dev->needed_headroom = headroom;
@@ -627,14 +654,17 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_info *tun_info = NULL;
const struct iphdr *inner_iph;
- struct flowi4 fl4;
- u8 tos, ttl;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
- __be32 dst;
+ struct rtable *rt = NULL; /* Route to the other host */
+ bool use_cache = false;
+ struct flowi4 fl4;
+ bool md = false;
bool connected;
+ u8 tos, ttl;
+ __be32 dst;
+ __be16 df;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
@@ -650,7 +680,15 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (skb->protocol == htons(ETH_P_IP)) {
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
+ ip_tunnel_info_af(tun_info) == AF_INET &&
+ tun_info->key.u.ipv4.dst) {
+ dst = tun_info->key.u.ipv4.dst;
+ md = true;
+ connected = true;
+ }
+ else if (skb->protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
dst = rt_nexthop(rt, inner_iph->daddr);
}
@@ -688,7 +726,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
else
goto tx_error;
- connected = false;
+ if (!md)
+ connected = false;
}
tos = tnl_params->tos;
@@ -705,13 +744,20 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
- tunnel->fwmark);
+ tunnel->fwmark, skb_get_hash(skb));
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
- rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
- NULL;
+ if (connected && md) {
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache,
+ &fl4.saddr);
+ } else {
+ rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
+ &fl4.saddr) : NULL;
+ }
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -720,7 +766,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dev->stats.tx_carrier_errors++;
goto tx_error;
}
- if (connected)
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
+ else if (!md && connected)
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
fl4.saddr);
}
@@ -731,7 +780,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
+ 0, 0, false)) {
ip_rt_put(rt);
goto tx_error;
}
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9a0e67b52a4e..c3f3d28d1087 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -252,6 +252,14 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
+ if (err) {
+ lwtstate_free(new_state);
+ return err;
+ }
+#endif
+
if (tb[LWTUNNEL_IP_ID])
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
@@ -278,6 +286,15 @@ static int ip_tun_build_state(struct nlattr *attr,
return 0;
}
+static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
+{
+#ifdef CONFIG_DST_CACHE
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ dst_cache_destroy(&tun_info->dst_cache);
+#endif
+}
+
static int ip_tun_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -313,6 +330,7 @@ static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
.build_state = ip_tun_build_state,
+ .destroy_state = ip_tun_destroy_state,
.fill_encap = ip_tun_fill_encap_info,
.get_encap_size = ip_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index d7b43e700023..68a21bf75dd0 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -74,6 +74,33 @@ drop:
return 0;
}
+static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+ skb->dev = tunnel->dev;
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+ }
+
+ return -EINVAL;
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
static int vti_rcv(struct sk_buff *skb)
{
XFRM_SPI_SKB_CB(skb)->family = AF_INET;
@@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb)
return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
}
+static int vti_rcv_ipip(struct sk_buff *skb)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0);
+}
+
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
@@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
.priority = 100,
};
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = vti_rcv_ipip,
+ .err_handler = vti4_err,
+ .priority = 0,
+};
+
static int __net_init vti_init_net(struct net *net)
{
int err;
@@ -603,6 +644,13 @@ static int __init vti_init(void)
if (err < 0)
goto xfrm_proto_comp_failed;
+ msg = "ipip tunnel";
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: cant't register tunnel\n",__func__);
+ goto xfrm_tunnel_failed;
+ }
+
msg = "netlink interface";
err = rtnl_link_register(&vti_link_ops);
if (err < 0)
@@ -612,6 +660,8 @@ static int __init vti_init(void)
rtnl_link_failed:
xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_tunnel_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
xfrm_proto_comp_failed:
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b9a9873c25c6..9bcca08efec9 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -85,7 +85,6 @@
/* Define the friendly delay before and after opening net devices */
#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
-#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
@@ -101,6 +100,9 @@
#define NONE cpu_to_be32(INADDR_NONE)
#define ANY cpu_to_be32(INADDR_ANY)
+/* Wait for carrier timeout default in seconds */
+static unsigned int carrier_timeout = 120;
+
/*
* Public IP configuration
*/
@@ -268,9 +270,9 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
- next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ next_msg = start + msecs_to_jiffies(20000);
while (time_before(jiffies, start +
- msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
+ msecs_to_jiffies(carrier_timeout * 1000))) {
int wait, elapsed;
for_each_netdev(&init_net, dev)
@@ -283,9 +285,9 @@ static int __init ic_open_devs(void)
continue;
elapsed = jiffies_to_msecs(jiffies - start);
- wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ wait = (carrier_timeout * 1000 - elapsed + 500) / 1000;
pr_info("Waiting up to %d more seconds for network.\n", wait);
- next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ next_msg = jiffies + msecs_to_jiffies(20000);
}
have_carrier:
rtnl_unlock();
@@ -1780,3 +1782,18 @@ static int __init vendor_class_identifier_setup(char *addrs)
return 1;
}
__setup("dhcpclass=", vendor_class_identifier_setup);
+
+static int __init set_carrier_timeout(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &carrier_timeout);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("carrier_timeout=", set_carrier_timeout);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 57c5dd283a2c..fe10b9a2efc8 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -302,7 +302,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
if (tunnel->collect_md)
- ip_md_tunnel_xmit(skb, dev, ipproto);
+ ip_md_tunnel_xmit(skb, dev, ipproto, 0);
else
ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ddbf8c9a1abb..2c931120c494 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,7 +67,6 @@
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/nexthop.h>
-#include <net/switchdev.h>
#include <linux/nospec.h>
@@ -111,7 +110,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
-static void mroute_clean_tables(struct mr_table *mrt, bool all);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -416,7 +415,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt, true);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
+ MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
@@ -837,10 +837,8 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
static int vif_add(struct net *net, struct mr_table *mrt,
struct vifctl *vifc, int mrtsock)
{
+ struct netdev_phys_item_id ppid = { };
int vifi = vifc->vifc_vifi;
- struct switchdev_attr attr = {
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
@@ -919,10 +917,10 @@ static int vif_add(struct net *net, struct mr_table *mrt,
vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
(VIFF_TUNNEL | VIFF_REGISTER));
- attr.orig_dev = dev;
- if (!switchdev_port_attr_get(dev, &attr)) {
- memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
- v->dev_parent_id.id_len = attr.u.ppid.id_len;
+ err = dev_get_port_parent_id(dev, &ppid, true);
+ if (err == 0) {
+ memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
+ v->dev_parent_id.id_len = ppid.id_len;
} else {
v->dev_parent_id.id_len = 0;
}
@@ -1299,7 +1297,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
}
/* Close the multicast socket, and clear the vif tables etc */
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
struct net *net = read_pnet(&mrt->net);
struct mr_mfc *c, *tmp;
@@ -1308,35 +1306,44 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
int i;
/* Shut down all active vif entries */
- for (i = 0; i < mrt->maxvif; i++) {
- if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
- continue;
- vif_delete(mrt, i, 0, &list);
+ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+ !(flags & MRT_FLUSH_VIFS_STATIC)) ||
+ (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS)))
+ continue;
+ vif_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
}
- unregister_netdevice_many(&list);
/* Wipe the cache */
- list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
- list_del_rcu(&c->list);
- cache = (struct mfc_cache *)c;
- call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
- mrt->id);
- mroute_netlink_event(mrt, cache, RTM_DELROUTE);
- mr_cache_put(c);
- }
-
- if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
- spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
- list_del(&c->list);
+ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) ||
+ (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC)))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
cache = (struct mfc_cache *)c;
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
+ mrt->id);
mroute_netlink_event(mrt, cache, RTM_DELROUTE);
- ipmr_destroy_unres(mrt, cache);
+ mr_cache_put(c);
+ }
+ }
+
+ if (flags & MRT_FLUSH_MFC) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ cache = (struct mfc_cache *)c;
+ mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, cache);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
}
- spin_unlock_bh(&mfc_unres_lock);
}
}
@@ -1357,7 +1364,7 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt, false);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
}
}
rtnl_unlock();
@@ -1482,6 +1489,17 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
sk == rtnl_dereference(mrt->mroute_sk),
parent);
break;
+ case MRT_FLUSH:
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(val, (int __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
+ mroute_clean_tables(mrt, val);
+ break;
/* Control PIM assert. */
case MRT_ASSERT:
if (optlen != sizeof(val)) {
@@ -2467,6 +2485,61 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}
+static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_TABLE:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2475,18 +2548,14 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct sk_buff *skb = NULL;
struct mfc_cache *cache;
struct mr_table *mrt;
- struct rtmsg *rtm;
__be32 src, grp;
u32 tableid;
int err;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
- rtm_ipv4_policy, extack);
+ err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
goto errout;
- rtm = nlmsg_data(nlh);
-
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 8d2e5dc9a827..a058213b77a7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -80,24 +80,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
}
EXPORT_SYMBOL(ip_route_me_harder);
-int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
-{
- const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
- if (entry->state.hook == NF_INET_LOCAL_OUT) {
- const struct iphdr *iph = ip_hdr(skb);
-
- if (!(iph->tos == rt_info->tos &&
- skb->mark == rt_info->mark &&
- iph->daddr == rt_info->daddr &&
- iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, skb,
- RTN_UNSPEC);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_ip_reroute);
-
int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict __always_unused)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 80f72cc5ca8d..c98391d49200 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -94,50 +94,7 @@ config NF_REJECT_IPV4
tristate "IPv4 packet rejection"
default m if NETFILTER_ADVANCED=n
-config NF_NAT_IPV4
- tristate "IPv4 NAT"
- depends on NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_NAT
- help
- The IPv4 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. This can be
- controlled by iptables or nft.
-
-if NF_NAT_IPV4
-
-config NF_NAT_MASQUERADE_IPV4
- bool
-
-if NF_TABLES
-config NFT_CHAIN_NAT_IPV4
- depends on NF_TABLES_IPV4
- tristate "IPv4 nf_tables nat chain support"
- help
- This option enables the "nat" chain for IPv4 in nf_tables. This
- chain type is used to perform Network Address Translation (NAT)
- packet transformations such as the source, destination address and
- source and destination ports.
-
-config NFT_MASQ_IPV4
- tristate "IPv4 masquerading support for nf_tables"
- depends on NF_TABLES_IPV4
- depends on NFT_MASQ
- select NF_NAT_MASQUERADE_IPV4
- help
- This is the expression that provides IPv4 masquerading support for
- nf_tables.
-
-config NFT_REDIR_IPV4
- tristate "IPv4 redirect support for nf_tables"
- depends on NF_TABLES_IPV4
- depends on NFT_REDIR
- select NF_NAT_REDIRECT
- help
- This is the expression that provides IPv4 redirect support for
- nf_tables.
-endif # NF_TABLES
-
+if NF_NAT
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
depends on NF_CONNTRACK_SNMP
@@ -166,7 +123,7 @@ config NF_NAT_H323
depends on NF_CONNTRACK
default NF_CONNTRACK_H323
-endif # NF_NAT_IPV4
+endif # NF_NAT
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
@@ -263,7 +220,6 @@ config IP_NF_NAT
depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
- select NF_NAT_IPV4
select NETFILTER_XT_NAT
help
This enables the `nat' table in iptables. This allows masquerading,
@@ -276,7 +232,7 @@ if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- select NF_NAT_MASQUERADE_IPV4
+ select NF_NAT_MASQUERADE
default m if NETFILTER_ADVANCED=n
help
Masquerading is a special case of NAT: all outgoing connections are
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index fd7122e0e2c9..e241f5188ebe 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,10 +3,6 @@
# Makefile for the netfilter modules on top of IPv4.
#
-nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o
-nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
-
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
@@ -29,11 +25,8 @@ $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
-obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
-obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
-obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
# flow table support
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index b61977db9b7f..835d50b279f5 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net)
static void clusterip_net_exit(struct net *net)
{
+#ifdef CONFIG_PROC_FS
struct clusterip_net *cn = clusterip_pernet(net);
-#ifdef CONFIG_PROC_FS
mutex_lock(&cn->mutex);
proc_remove(cn->procdir);
cn->procdir = NULL;
@@ -864,7 +864,7 @@ static struct pernet_operations clusterip_net_ops = {
.size = sizeof(struct clusterip_net),
};
-struct notifier_block cip_netdev_notifier = {
+static struct notifier_block cip_netdev_notifier = {
.notifier_call = clusterip_netdev_event
};
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a317445448bf..007da0882412 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -15,8 +15,6 @@
#include <net/ip.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
static int __net_init iptable_nat_table_init(struct net *net);
@@ -70,10 +68,10 @@ static int ipt_nat_register_lookups(struct net *net)
int i, ret;
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
- ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
+ ret = nf_nat_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
if (ret) {
while (i)
- nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
+ nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
return ret;
}
@@ -87,7 +85,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
int i;
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
- nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
+ nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
}
static int __net_init iptable_nat_table_init(struct net *net)
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
deleted file mode 100644
index 2687db015b6f..000000000000
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/secure_seq.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
-
-#ifdef CONFIG_XFRM
-static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- unsigned long statusbit,
- struct flowi *fl)
-{
- const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
- struct flowi4 *fl4 = &fl->u.ip4;
-
- if (ct->status & statusbit) {
- fl4->daddr = t->dst.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl4->fl4_dport = t->dst.u.all;
- }
-
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- fl4->saddr = t->src.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl4->fl4_sport = t->src.u.all;
- }
-}
-#endif /* CONFIG_XFRM */
-
-static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *target,
- enum nf_nat_manip_type maniptype)
-{
- struct iphdr *iph;
- unsigned int hdroff;
-
- if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
- return false;
-
- iph = (void *)skb->data + iphdroff;
- hdroff = iphdroff + iph->ihl * 4;
-
- if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff,
- hdroff, target, maniptype))
- return false;
- iph = (void *)skb->data + iphdroff;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
- iph->saddr = target->src.u3.ip;
- } else {
- csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
- iph->daddr = target->dst.u3.ip;
- }
- return true;
-}
-
-static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
- unsigned int iphdroff, __sum16 *check,
- const struct nf_conntrack_tuple *t,
- enum nf_nat_manip_type maniptype)
-{
- struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- __be32 oldip, newip;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- oldip = iph->saddr;
- newip = t->src.u3.ip;
- } else {
- oldip = iph->daddr;
- newip = t->dst.u3.ip;
- }
- inet_proto_csum_replace4(check, skb, oldip, newip, true);
-}
-
-static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
- u8 proto, void *data, __sum16 *check,
- int datalen, int oldlen)
-{
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- const struct iphdr *iph = ip_hdr(skb);
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
- ip_hdrlen(skb);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
- proto, 0);
- } else
- inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), true);
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range2 *range)
-{
- if (tb[CTA_NAT_V4_MINIP]) {
- range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
- range->flags |= NF_NAT_RANGE_MAP_IPS;
- }
-
- if (tb[CTA_NAT_V4_MAXIP])
- range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
- else
- range->max_addr.ip = range->min_addr.ip;
-
- return 0;
-}
-#endif
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
- .l3proto = NFPROTO_IPV4,
- .manip_pkt = nf_nat_ipv4_manip_pkt,
- .csum_update = nf_nat_ipv4_csum_update,
- .csum_recalc = nf_nat_ipv4_csum_recalc,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
-#endif
-#ifdef CONFIG_XFRM
- .decode_session = nf_nat_ipv4_decode_session,
-#endif
-};
-
-int nf_nat_icmp_reply_translation(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum)
-{
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } *inside;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
- unsigned int hdrlen = ip_hdrlen(skb);
- struct nf_conntrack_tuple target;
- unsigned long statusbit;
-
- WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
-
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
- return 0;
- if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
- return 0;
-
- inside = (void *)skb->data + hdrlen;
- if (inside->icmp.type == ICMP_REDIRECT) {
- if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
- return 0;
- if (ct->status & IPS_NAT_MASK)
- return 0;
- }
-
- if (manip == NF_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply direction */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (!(ct->status & statusbit))
- return 1;
-
- if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
- &ct->tuplehash[!dir].tuple, !manip))
- return 0;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- /* Reloading "inside" here since manip_pkt may reallocate */
- inside = (void *)skb->data + hdrlen;
- inside->icmp.checksum = 0;
- inside->icmp.checksum =
- csum_fold(skb_checksum(skb, hdrlen,
- skb->len - hdrlen, 0));
- }
-
- /* Change outer to look like the reply to an incoming packet */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-
-static unsigned int
-nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return NF_ACCEPT;
-
- if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- state->hook))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- }
-
- return nf_nat_inet_fn(priv, skb, state);
-}
-
-static unsigned int
-nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- __be32 daddr = ip_hdr(skb)->daddr;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- daddr != ip_hdr(skb)->daddr)
- skb_dst_drop(skb);
-
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#ifdef CONFIG_XFRM
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- int err;
-#endif
- unsigned int ret;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if ((ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip) ||
- (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
-#endif
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
- int err;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#ifdef CONFIG_XFRM
- else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#endif
- }
- return ret;
-}
-
-static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv4_in,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv4_out,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC,
- },
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv4_local_fn,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv4_fn,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_NAT_SRC,
- },
-};
-
-int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
-
-void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
-
-static int __init nf_nat_l3proto_ipv4_init(void)
-{
- return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
-}
-
-static void __exit nf_nat_l3proto_ipv4_exit(void)
-{
- nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
-
-module_init(nf_nat_l3proto_ipv4_init);
-module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
deleted file mode 100644
index 41327bb99093..000000000000
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/atomic.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-
-unsigned int
-nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
- const struct nf_nat_range2 *range,
- const struct net_device *out)
-{
- struct nf_conn *ct;
- struct nf_conn_nat *nat;
- enum ip_conntrack_info ctinfo;
- struct nf_nat_range2 newrange;
- const struct rtable *rt;
- __be32 newsrc, nh;
-
- WARN_ON(hooknum != NF_INET_POST_ROUTING);
-
- ct = nf_ct_get(skb, &ctinfo);
-
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY)));
-
- /* Source address is 0.0.0.0 - locally generated packet that is
- * probably not supposed to be masqueraded.
- */
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
- return NF_ACCEPT;
-
- rt = skb_rtable(skb);
- nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
- newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
- if (!newsrc) {
- pr_info("%s ate my IP address\n", out->name);
- return NF_DROP;
- }
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat)
- nat->masq_index = out->ifindex;
-
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newsrc;
- newrange.max_addr.ip = newsrc;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
-
-static int device_cmp(struct nf_conn *i, void *ifindex)
-{
- const struct nf_conn_nat *nat = nfct_nat(i);
-
- if (!nat)
- return 0;
- if (nf_ct_l3num(i) != NFPROTO_IPV4)
- return 0;
- return nat->masq_index == (int)(long)ifindex;
-}
-
-static int masq_device_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct net *net = dev_net(dev);
-
- if (event == NETDEV_DOWN) {
- /* Device was downed. Search entire table for
- * conntracks which were associated with that device,
- * and forget them.
- */
- WARN_ON(dev->ifindex == 0);
-
- nf_ct_iterate_cleanup_net(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
- }
-
- return NOTIFY_DONE;
-}
-
-static int inet_cmp(struct nf_conn *ct, void *ptr)
-{
- struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
- struct net_device *dev = ifa->ifa_dev->dev;
- struct nf_conntrack_tuple *tuple;
-
- if (!device_cmp(ct, (void *)(long)dev->ifindex))
- return 0;
-
- tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
- return ifa->ifa_address == tuple->dst.u3.ip;
-}
-
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct net *net = dev_net(idev->dev);
-
- /* The masq_dev_notifier will catch the case of the device going
- * down. So if the inetdev is dead and being destroyed we have
- * no work to do. Otherwise this is an individual address removal
- * and we have to perform the flush.
- */
- if (idev->dead)
- return NOTIFY_DONE;
-
- if (event == NETDEV_DOWN)
- nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block masq_dev_notifier = {
- .notifier_call = masq_device_event,
-};
-
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
-};
-
-static int masq_refcnt;
-static DEFINE_MUTEX(masq_mutex);
-
-int nf_nat_masquerade_ipv4_register_notifier(void)
-{
- int ret = 0;
-
- mutex_lock(&masq_mutex);
- /* check if the notifier was already set */
- if (++masq_refcnt > 1)
- goto out_unlock;
-
- /* Register for device down reports */
- ret = register_netdevice_notifier(&masq_dev_notifier);
- if (ret)
- goto err_dec;
- /* Register IP address change reports */
- ret = register_inetaddr_notifier(&masq_inet_notifier);
- if (ret)
- goto err_unregister;
-
- mutex_unlock(&masq_mutex);
- return ret;
-
-err_unregister:
- unregister_netdevice_notifier(&masq_dev_notifier);
-err_dec:
- masq_refcnt--;
-out_unlock:
- mutex_unlock(&masq_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
-
-void nf_nat_masquerade_ipv4_unregister_notifier(void)
-{
- mutex_lock(&masq_mutex);
- /* check if the notifier still has clients */
- if (--masq_refcnt > 0)
- goto out_unlock;
-
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
-out_unlock:
- mutex_unlock(&masq_mutex);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index a0aa13bcabda..0a8a60c1bf9a 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -105,6 +105,8 @@ static void fast_csum(struct snmp_ctx *ctx, unsigned char offset)
int snmp_version(void *context, size_t hdrlen, unsigned char tag,
const void *data, size_t datalen)
{
+ if (datalen != 1)
+ return -EINVAL;
if (*(unsigned char *)data > 1)
return -ENOTSUPP;
return 1;
@@ -114,8 +116,11 @@ int snmp_helper(void *context, size_t hdrlen, unsigned char tag,
const void *data, size_t datalen)
{
struct snmp_ctx *ctx = (struct snmp_ctx *)context;
- __be32 *pdata = (__be32 *)data;
+ __be32 *pdata;
+ if (datalen != 4)
+ return -EINVAL;
+ pdata = (__be32 *)data;
if (*pdata == ctx->from) {
pr_debug("%s: %pI4 to %pI4\n", __func__,
(void *)&ctx->from, (void *)&ctx->to);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index aa8304c618b8..7dc3c324b911 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -173,21 +173,16 @@ EXPORT_SYMBOL_GPL(nf_send_reset);
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
struct iphdr *iph = ip_hdr(skb_in);
- u8 proto;
+ u8 proto = iph->protocol;
if (iph->frag_off & htons(IP_OFFSET))
return;
- if (skb_csum_unnecessary(skb_in)) {
+ if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) {
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
return;
}
- if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
- proto = iph->protocol;
- else
- proto = 0;
-
if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
deleted file mode 100644
index a3c4ea303e3e..000000000000
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- * Copyright (c) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ip.h>
-
-static unsigned int nft_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nft_pktinfo pkt;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
-
- return nft_do_chain(&pkt, priv);
-}
-
-static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_l3proto_ipv4_register_fn(net, ops);
-}
-
-static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_l3proto_ipv4_unregister_fn(net, ops);
-}
-
-static const struct nft_chain_type nft_chain_nat_ipv4 = {
- .name = "nat",
- .type = NFT_CHAIN_T_NAT,
- .family = NFPROTO_IPV4,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_LOCAL_IN),
- .hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
- [NF_INET_POST_ROUTING] = nft_nat_do_chain,
- [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
- [NF_INET_LOCAL_IN] = nft_nat_do_chain,
- },
- .ops_register = nft_nat_ipv4_reg,
- .ops_unregister = nft_nat_ipv4_unreg,
-};
-
-static int __init nft_chain_nat_init(void)
-{
- nft_register_chain_type(&nft_chain_nat_ipv4);
-
- return 0;
-}
-
-static void __exit nft_chain_nat_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_nat_ipv4);
-}
-
-module_init(nft_chain_nat_init);
-module_exit(nft_chain_nat_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
deleted file mode 100644
index 6847de1d1db8..000000000000
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_masq.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-
-static void nft_masq_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- range.flags = priv->flags;
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- }
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
- &range, nft_out(pkt));
-}
-
-static void
-nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
-}
-
-static struct nft_expr_type nft_masq_ipv4_type;
-static const struct nft_expr_ops nft_masq_ipv4_ops = {
- .type = &nft_masq_ipv4_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv4_eval,
- .init = nft_masq_init,
- .destroy = nft_masq_ipv4_destroy,
- .dump = nft_masq_dump,
- .validate = nft_masq_validate,
-};
-
-static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
- .family = NFPROTO_IPV4,
- .name = "masq",
- .ops = &nft_masq_ipv4_ops,
- .policy = nft_masq_policy,
- .maxattr = NFTA_MASQ_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_masq_ipv4_module_init(void)
-{
- int ret;
-
- ret = nft_register_expr(&nft_masq_ipv4_type);
- if (ret < 0)
- return ret;
-
- ret = nf_nat_masquerade_ipv4_register_notifier();
- if (ret)
- nft_unregister_expr(&nft_masq_ipv4_type);
-
- return ret;
-}
-
-static void __exit nft_masq_ipv4_module_exit(void)
-{
- nft_unregister_expr(&nft_masq_ipv4_type);
- nf_nat_masquerade_ipv4_unregister_notifier();
-}
-
-module_init(nft_masq_ipv4_module_init);
-module_exit(nft_masq_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
deleted file mode 100644
index 5120be1d3118..000000000000
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_redirect.h>
-#include <net/netfilter/nft_redir.h>
-
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_ipv4_multi_range_compat mr;
-
- memset(&mr, 0, sizeof(mr));
- if (priv->sreg_proto_min) {
- mr.range[0].min.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- mr.range[0].max.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
-
- mr.range[0].flags |= priv->flags;
-
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
-}
-
-static void
-nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
-}
-
-static struct nft_expr_type nft_redir_ipv4_type;
-static const struct nft_expr_ops nft_redir_ipv4_ops = {
- .type = &nft_redir_ipv4_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv4_eval,
- .init = nft_redir_init,
- .destroy = nft_redir_ipv4_destroy,
- .dump = nft_redir_dump,
- .validate = nft_redir_validate,
-};
-
-static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
- .family = NFPROTO_IPV4,
- .name = "redir",
- .ops = &nft_redir_ipv4_ops,
- .policy = nft_redir_policy,
- .maxattr = NFTA_REDIR_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_redir_ipv4_module_init(void)
-{
- return nft_register_expr(&nft_redir_ipv4_type);
-}
-
-static void __exit nft_redir_ipv4_module_exit(void)
-{
- nft_unregister_expr(&nft_redir_ipv4_type);
-}
-
-module_init(nft_redir_ipv4_module_init);
-module_exit(nft_redir_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
index f86bb4f06609..d8e3a1fb8e82 100644
--- a/net/ipv4/netlink.c
+++ b/net/ipv4/netlink.c
@@ -3,9 +3,10 @@
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
+#include <linux/in6.h>
#include <net/ip.h>
-int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
struct netlink_ext_ack *extack)
{
*ip_proto = nla_get_u8(attr);
@@ -13,11 +14,19 @@ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
switch (*ip_proto) {
case IPPROTO_TCP:
case IPPROTO_UDP:
+ return 0;
case IPPROTO_ICMP:
+ if (family != AF_INET)
+ break;
+ return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ if (family != AF_INET6)
+ break;
return 0;
- default:
- NL_SET_ERR_MSG(extack, "Unsupported ip proto");
- return -EOPNOTSUPP;
+#endif
}
+ NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ce92f73cf104..738ff0a1a048 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -887,13 +887,15 @@ void ip_rt_send_redirect(struct sk_buff *skb)
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
peer->rate_tokens = 0;
+ peer->n_redirects = 0;
+ }
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (peer->rate_tokens >= ip_rt_redirect_number) {
+ if (peer->n_redirects >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
goto out_put_peer;
}
@@ -910,6 +912,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
peer->rate_last = jiffies;
++peer->rate_tokens;
+ ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
peer->rate_tokens == ip_rt_redirect_number)
@@ -1608,7 +1611,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
return -EINVAL;
if (ipv4_is_zeronet(saddr)) {
- if (!ipv4_is_local_multicast(daddr))
+ if (!ipv4_is_local_multicast(daddr) &&
+ ip_hdr(skb)->protocol != IPPROTO_IGMP)
return -EINVAL;
} else {
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
@@ -1816,6 +1820,7 @@ out:
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
+ u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
struct flow_keys hash_keys;
u32 mhash;
@@ -1866,6 +1871,9 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
}
mhash = flow_hash_from_keys(&hash_keys);
+ if (multipath_hash)
+ mhash = jhash_2words(mhash, multipath_hash, 0);
+
return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -2763,6 +2771,75 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
return skb;
}
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG(extack,
+ "ipv4: Invalid header for route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
+ RTM_F_LOOKUP_TABLE |
+ RTM_F_FIB_MATCH)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IP_PROTO:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_MARK:
+ case RTA_UID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2783,8 +2860,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
int err;
int mark;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
- extack);
+ err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
@@ -2800,7 +2876,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (tb[RTA_IP_PROTO]) {
err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
- &ip_proto, extack);
+ &ip_proto, AF_INET, extack);
if (err)
return err;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 27e2f6837062..ad07dd71063d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1127,7 +1127,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)
}
static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
- int *copied, size_t size)
+ int *copied, size_t size,
+ struct ubuf_info *uarg)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
@@ -1147,6 +1148,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return -ENOBUFS;
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ tp->fastopen_req->uarg = uarg;
if (inet->defer_connect) {
err = tcp_connect(sk);
@@ -1186,11 +1188,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
flags = msg->msg_flags;
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
- if (sk->sk_state != TCP_ESTABLISHED) {
- err = -EINVAL;
- goto out_err;
- }
-
skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
@@ -1205,7 +1202,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
!tp->repair) {
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
@@ -1415,7 +1412,8 @@ do_fault:
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
- tcp_check_send_head(sk, skb);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
sk_wmem_free_skb(sk, skb);
}
@@ -1554,7 +1552,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
- !icsk->icsk_ack.pingpong)) &&
+ !inet_csk_in_pingpong_mode(sk))) &&
!atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = true;
}
@@ -1847,57 +1845,78 @@ out:
#endif
static void tcp_update_recv_tstamps(struct sk_buff *skb,
- struct scm_timestamping *tss)
+ struct scm_timestamping_internal *tss)
{
if (skb->tstamp)
- tss->ts[0] = ktime_to_timespec(skb->tstamp);
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
else
- tss->ts[0] = (struct timespec) {0};
+ tss->ts[0] = (struct timespec64) {0};
if (skb_hwtstamps(skb)->hwtstamp)
- tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
else
- tss->ts[2] = (struct timespec) {0};
+ tss->ts[2] = (struct timespec64) {0};
}
/* Similar to __sock_recv_timestamp, but does not require an skb */
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
- struct scm_timestamping *tss)
+ struct scm_timestamping_internal *tss)
{
- struct timeval tv;
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
bool has_timestamping = false;
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
- sizeof(tss->ts[0]), &tss->ts[0]);
- } else {
- tv.tv_sec = tss->ts[0].tv_sec;
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ if (new_tstamp) {
+ struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
- sizeof(tv), &tv);
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+ sizeof(kts), &kts);
+ } else {
+ struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
+
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+ sizeof(ts_old), &ts_old);
+ }
+ } else {
+ if (new_tstamp) {
+ struct __kernel_sock_timeval stv;
+
+ stv.tv_sec = tss->ts[0].tv_sec;
+ stv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+ sizeof(stv), &stv);
+ } else {
+ struct __kernel_old_timeval tv;
+
+ tv.tv_sec = tss->ts[0].tv_sec;
+ tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+ sizeof(tv), &tv);
+ }
}
}
if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
has_timestamping = true;
else
- tss->ts[0] = (struct timespec) {0};
+ tss->ts[0] = (struct timespec64) {0};
}
if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
has_timestamping = true;
else
- tss->ts[2] = (struct timespec) {0};
+ tss->ts[2] = (struct timespec64) {0};
}
if (has_timestamping) {
- tss->ts[1] = (struct timespec) {0};
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
- sizeof(*tss), tss);
+ tss->ts[1] = (struct timespec64) {0};
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
+ put_cmsg_scm_timestamping64(msg, tss);
+ else
+ put_cmsg_scm_timestamping(msg, tss);
}
}
@@ -1938,7 +1957,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
- struct scm_timestamping tss;
+ struct scm_timestamping_internal tss;
bool has_tss = false;
bool has_cmsg;
@@ -2528,6 +2547,7 @@ void tcp_write_queue_purge(struct sock *sk)
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
tcp_sk(sk)->packets_out = 0;
+ inet_csk(sk)->icsk_backoff = 0;
}
int tcp_disconnect(struct sock *sk, int flags)
@@ -2572,6 +2592,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rcv_rtt_last_tsecr = 0;
tp->write_seq += tp->max_window + 2;
if (tp->write_seq == 0)
@@ -2579,7 +2600,9 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
icsk->icsk_probes_out = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
tp->delivered_ce = 0;
@@ -2603,6 +2626,23 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
tp->reord_seen = 0;
+ tp->retrans_out = 0;
+ tp->sacked_out = 0;
+ tp->tlp_high_seq = 0;
+ tp->last_oow_ack_time = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+ tp->rack.mstamp = 0;
+ tp->rack.advanced = 0;
+ tp->rack.reo_wnd_steps = 1;
+ tp->rack.last_delivered = 0;
+ tp->rack.reo_wnd_persist = 0;
+ tp->rack.dsack_seen = 0;
+ tp->syn_data_acked = 0;
+ tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.num_sacks = 0;
+
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2968,16 +3008,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_QUICKACK:
if (!val) {
- icsk->icsk_ack.pingpong = 1;
+ inet_csk_enter_pingpong_mode(sk);
} else {
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
inet_csk_ack_scheduled(sk)) {
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
tcp_cleanup_rbuf(sk, 1);
if (!(val & 1))
- icsk->icsk_ack.pingpong = 1;
+ inet_csk_enter_pingpong_mode(sk);
}
}
break;
@@ -3391,7 +3431,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
}
case TCP_QUICKACK:
- val = !icsk->icsk_ack.pingpong;
+ val = !inet_csk_in_pingpong_mode(sk);
break;
case TCP_CONGESTION:
@@ -3659,7 +3699,7 @@ bool tcp_alloc_md5sig_pool(void)
if (!tcp_md5sig_pool_populated) {
__tcp_alloc_md5sig_pool();
if (tcp_md5sig_pool_populated)
- static_key_slow_inc(&tcp_md5_needed);
+ static_branch_inc(&tcp_md5_needed);
}
mutex_unlock(&tcp_md5sig_mutex);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 0f497fc49c3f..56be7d27f208 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -115,6 +115,14 @@ struct bbr {
unused_b:5;
u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
u32 full_bw; /* recent bw, to estimate if pipe is full */
+
+ /* For tracking ACK aggregation: */
+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
+ extra_acked_win_idx:1, /* current index in extra_acked array */
+ unused_c:6;
};
#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
@@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Window length of extra_acked window. */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
static void bbr_check_probe_rtt_done(struct sock *sk);
/* Do we estimate that STARTUP filled the pipe? */
@@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk)
return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
}
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
/* Return rate in bytes per second, optionally with a gain.
* The order here is chosen carefully to avoid overflow of u64. This should
* work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
@@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
if (event == CA_EVENT_TX_START && tp->app_limited) {
bbr->idle_restart = 1;
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
/* Avoid pointless buffer overflows: pace at est. bw if we don't
* need more speed (we're restarting from idle and app-limited).
*/
@@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
}
}
-/* Find target cwnd. Right-size the cwnd based on min RTT and the
- * estimated bottleneck bandwidth:
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * cwnd = bw * min_rtt * gain = BDP * gain
+ * bdp = bw * min_rtt * gain
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
* measurements (e.g., delayed ACKs or other ACK compression effects). This
* noise may cause BBR to under-estimate the rate.
- *
- * To achieve full performance in high-speed paths, we budget enough cwnd to
- * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
- * - one skb in sending host Qdisc,
- * - one skb in sending host TSO/GSO engine
- * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
- * which allows 2 outstanding 2-packet sequences, to try to keep pipe
- * full even with ACK-every-other-packet delayed ACKs.
*/
-static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
{
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd;
+ u32 bdp;
u64 w;
/* If we've never had a valid RTT sample, cap cwnd at the initial
@@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
w = (u64)bw * bbr->min_rtt_us;
/* Apply a gain to the given value, then remove the BW_SCALE shift. */
- cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
/* Allow enough full-sized skbs in flight to utilize end systems. */
cwnd += 3 * bbr_tso_segs_goal(sk);
@@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
return cwnd;
}
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+ u32 inflight;
+
+ inflight = bbr_bdp(sk, bw, gain);
+ inflight = bbr_quantization_budget(sk, inflight, gain);
+
+ return inflight;
+}
+
/* With pacing at lower layers, there's often less data "in the network" than
* "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
* we often have several skbs queued in the pacing layer with a pre-scheduled
@@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
return inflight_at_edt - interval_delivered;
}
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+ u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+ if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+ / BW_UNIT;
+ aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+ >> BBR_SCALE;
+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+ }
+
+ return aggr_cwnd;
+}
+
/* An optimization in BBR to reduce losses: On the first round of recovery, we
* follow the packet conservation principle: send P packets per P packets acked.
* After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
goto done;
+ target_cwnd = bbr_bdp(sk, bw, gain);
+
+ /* Increment the cwnd to account for excess ACKed data that seems
+ * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+ */
+ target_cwnd += bbr_ack_aggregation_cwnd(sk);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
- target_cwnd = bbr_target_cwnd(sk, bw, gain);
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
cwnd = min(cwnd + acked, target_cwnd);
else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
@@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
if (bbr->pacing_gain > BBR_UNIT)
return is_full_length &&
(rs->losses || /* perhaps pacing_gain*BDP won't fit */
- inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+ inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
* probing didn't find more bw. If inflight falls to match BDP then we
* estimate queue is drained; persisting would underutilize the pipe.
*/
return is_full_length ||
- inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+ inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}
static void bbr_advance_cycle_phase(struct sock *sk)
@@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
}
}
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ u32 epoch_us, expected_acked, extra_acked;
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+ rs->delivered < 0 || rs->interval_us <= 0)
+ return;
+
+ if (bbr->round_start) {
+ bbr->extra_acked_win_rtts = min(0x1F,
+ bbr->extra_acked_win_rtts + 1);
+ if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+ 0 : 1;
+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+ }
+ }
+
+ /* Compute how many packets we expected to be delivered over epoch. */
+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+ bbr->ack_epoch_mstamp);
+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+ /* Reset the aggregation epoch if ACK rate is below expected rate or
+ * significantly large no. of ack received since epoch (potentially
+ * quite old epoch).
+ */
+ if (bbr->ack_epoch_acked <= expected_acked ||
+ (bbr->ack_epoch_acked + rs->acked_sacked >=
+ bbr_ack_epoch_acked_reset_thresh)) {
+ bbr->ack_epoch_acked = 0;
+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+ expected_acked = 0;
+ }
+
+ /* Compute excess data delivered, beyond what was expected. */
+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+ bbr->ack_epoch_acked + rs->acked_sacked);
+ extra_acked = bbr->ack_epoch_acked - expected_acked;
+ extra_acked = min(extra_acked, tp->snd_cwnd);
+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
/* Estimate when the pipe is full, using the change in delivery rate: BBR
* estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
* at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
@@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
tcp_sk(sk)->snd_ssthresh =
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
@@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk)
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
+ bbr_update_ack_aggregation(sk, rs);
bbr_update_cycle_phase(sk, rs);
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
@@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk)
bbr_reset_lt_bw_sampling(sk);
bbr_reset_startup_mode(sk);
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = 0;
+ bbr->extra_acked[0] = 0;
+ bbr->extra_acked[1] = 0;
+
cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76858b14ebe9..4eb0c8ca3c60 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -221,7 +221,7 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk, max_quickacks);
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
EXPORT_SYMBOL(tcp_enter_quickack_mode);
@@ -236,7 +236,7 @@ static bool tcp_in_quickack_mode(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -1574,9 +1574,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
return skb;
}
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
- struct tcp_sacktag_state *state,
- u32 seq)
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
{
struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
struct sk_buff *skb;
@@ -1598,13 +1596,12 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
}
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
- struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
return skb;
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1617,7 +1614,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
skb = tcp_sacktag_walk(skb, sk, NULL, state,
next_dup->start_seq, next_dup->end_seq,
1);
@@ -1758,8 +1755,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, state,
- start_seq);
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
state,
start_seq,
@@ -1785,7 +1781,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1796,7 +1792,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (!skb)
break;
}
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
walk:
skb = tcp_sacktag_walk(skb, sk, next_dup, state,
@@ -3595,7 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
- goto invalid_ack;
+ return -1;
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3714,10 +3710,6 @@ no_queue:
tcp_process_tlp_ack(sk, ack, flag);
return 1;
-invalid_ack:
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
- return -1;
-
old_ack:
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
@@ -3731,7 +3723,6 @@ old_ack:
tcp_xmit_recovery(sk, rexmit);
}
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
@@ -4094,7 +4085,7 @@ void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk_enter_pingpong_mode(sk);
break;
case TCP_CLOSE_WAIT:
@@ -4432,13 +4423,9 @@ static void tcp_ofo_queue(struct sock *sk)
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- SOCK_DEBUG(sk, "ofo packet was already received\n");
tcp_drop(sk, skb);
continue;
}
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
@@ -4502,8 +4489,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, seq, end_seq);
p = &tp->out_of_order_queue.rb_node;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
@@ -4779,10 +4764,6 @@ drop:
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
-
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
@@ -5061,8 +5042,6 @@ static int tcp_prune_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
-
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
@@ -5889,7 +5868,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
- icsk->icsk_ack.pingpong) {
+ inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index efc6fef692ff..831d844a27ca 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -536,12 +536,15 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sock_owned_by_user(sk))
break;
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
+ break;
+
icsk->icsk_backoff--;
icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
TCP_TIMEOUT_INIT;
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
- skb = tcp_rtx_queue_head(sk);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
@@ -970,7 +973,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
* We need to maintain these in the sk structure.
*/
-struct static_key tcp_md5_needed __read_mostly;
+DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
/* Find the Key structure for an address. */
@@ -2437,7 +2440,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
tp->snd_cwnd,
state == TCP_LISTEN ?
fastopenq->max_qlen :
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 12affb7864d9..79900f783e0d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -294,12 +294,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* so the timewait ack generating code has the key.
*/
do {
- struct tcp_md5sig_key *key;
tcptw->tw_md5_key = NULL;
- key = tp->af_specific->md5_lookup(sk, sk);
- if (key) {
- tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
- BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
+ if (static_branch_unlikely(&tcp_md5_needed)) {
+ struct tcp_md5sig_key *key;
+
+ key = tp->af_specific->md5_lookup(sk, sk);
+ if (key) {
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
+ }
}
} while (0);
#endif
@@ -338,10 +341,12 @@ EXPORT_SYMBOL(tcp_time_wait);
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+ if (static_branch_unlikely(&tcp_md5_needed)) {
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key)
- kfree_rcu(twsk->tw_md5_key, rcu);
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
+ }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
@@ -479,43 +484,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
tcp_init_wl(newtp, treq->rcv_isn);
- newtp->srtt_us = 0;
- newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
- newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
- newtp->packets_out = 0;
- newtp->retrans_out = 0;
- newtp->sacked_out = 0;
- newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- newtp->tlp_high_seq = 0;
newtp->lsndtime = tcp_jiffies32;
newsk->sk_txhash = treq->txhash;
- newtp->last_oow_ack_time = 0;
newtp->total_retrans = req->num_retrans;
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = TCP_INIT_CWND;
- newtp->snd_cwnd_cnt = 0;
-
- /* There's a bubble in the pipe until at least the first ACK. */
- newtp->app_limited = ~0U;
-
tcp_init_xmit_timers(newsk);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
- newtp->rx_opt.saw_tstamp = 0;
-
- newtp->rx_opt.dsack = 0;
- newtp->rx_opt.num_sacks = 0;
-
- newtp->urg_data = 0;
-
if (sock_flag(newsk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(newsk,
keepalive_time_when(newtp));
@@ -556,13 +534,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_req = NULL;
newtp->fastopen_rsk = NULL;
- newtp->syn_data_acked = 0;
- newtp->rack.mstamp = 0;
- newtp->rack.advanced = 0;
- newtp->rack.reo_wnd_steps = 1;
- newtp->rack.last_delivered = 0;
- newtp->rack.reo_wnd_persist = 0;
- newtp->rack.dsack_seen = 0;
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 730bc44dbad9..4522579aaca2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -165,13 +165,16 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
- tp->lsndtime = now;
-
- /* If it is a reply for ato after last received
- * packet, enter pingpong mode.
+ /* If this is the first data packet sent in response to the
+ * previous received data,
+ * and it is a reply for ato after last received packet,
+ * increase pingpong count.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
+ (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ inet_csk_inc_pingpong_cnt(sk);
+
+ tp->lsndtime = now;
}
/* Account for an ACK we sent. */
@@ -594,7 +597,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (static_key_false(&tcp_md5_needed) &&
+ if (static_branch_unlikely(&tcp_md5_needed) &&
rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
@@ -731,7 +734,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (static_key_false(&tcp_md5_needed) &&
+ if (static_branch_unlikely(&tcp_md5_needed) &&
rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
@@ -980,7 +983,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (sk->sk_pacing_status != SK_PACING_NONE) {
unsigned long rate = sk->sk_pacing_rate;
@@ -1028,7 +1030,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
-
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
@@ -1045,11 +1049,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
return -ENOBUFS;
}
- prior_wstamp = tp->tcp_wstamp_ns;
- tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
-
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
-
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
@@ -1847,17 +1846,17 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
- struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
- struct sk_buff *buff;
int nlen = skb->len - len;
+ struct sk_buff *buff;
u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
+ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
@@ -1893,7 +1892,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
return 0;
}
@@ -2347,6 +2346,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
}
@@ -2391,8 +2391,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
- skb, limit, mss_now, gfp)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (tcp_small_queue_check(sk, skb, 0))
@@ -2937,12 +2936,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
if (likely(!err)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
@@ -2963,13 +2966,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
-
- /* Save stamp of the first retransmit. */
- if (!tp->retrans_stamp)
- tp->retrans_stamp = tcp_skb_timestamp(skb);
-
}
+ /* Save stamp of the first (attempted) retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp(skb);
+
if (tp->undo_retrans < 0)
tp->undo_retrans = 0;
tp->undo_retrans += tcp_skb_pcount(skb);
@@ -3456,6 +3458,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
skb_trim(syn_data, copied);
space = copied;
}
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
}
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
@@ -3569,7 +3572,7 @@ void tcp_send_delayed_ack(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
- if (icsk->icsk_ack.pingpong ||
+ if (inet_csk_in_pingpong_mode(sk) ||
(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
@@ -3750,7 +3753,7 @@ void tcp_send_probe0(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- unsigned long probe_max;
+ unsigned long timeout;
int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
@@ -3762,26 +3765,18 @@ void tcp_send_probe0(struct sock *sk)
return;
}
+ icsk->icsk_probes_out++;
if (err <= 0) {
if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++;
- icsk->icsk_probes_out++;
- probe_max = TCP_RTO_MAX;
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember icsk_probes_out.
- * Let local senders to fight for local resources.
- *
- * Use accumulated backoff yet.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_probes_out)
- icsk->icsk_probes_out = 1;
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
- }
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX,
- NULL);
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f87dbc78b6bc..f0c86398e6a7 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,28 +22,14 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-static u32 tcp_retransmit_stamp(const struct sock *sk)
-{
- u32 start_ts = tcp_sk(sk)->retrans_stamp;
-
- if (unlikely(!start_ts)) {
- struct sk_buff *head = tcp_rtx_queue_head(sk);
-
- if (!head)
- return 0;
- start_ts = tcp_skb_timestamp(head);
- }
- return start_ts;
-}
-
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
s32 remaining;
- start_ts = tcp_retransmit_stamp(sk);
- if (!icsk->icsk_user_timeout || !start_ts)
+ start_ts = tcp_sk(sk)->retrans_stamp;
+ if (!icsk->icsk_user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
remaining = icsk->icsk_user_timeout - elapsed;
@@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
-
+static unsigned int tcp_model_timeout(struct sock *sk,
+ unsigned int boundary,
+ unsigned int rto_base)
+{
+ unsigned int linear_backoff_thresh, timeout;
+
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ return jiffies_to_msecs(timeout);
+}
/**
* retransmits_timed_out() - returns true if this connection has timed out
* @sk: The current socket
@@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
unsigned int timeout)
{
- const unsigned int rto_base = TCP_RTO_MIN;
- unsigned int linear_backoff_thresh, start_ts;
+ unsigned int start_ts;
if (!inet_csk(sk)->icsk_retransmits)
return false;
- start_ts = tcp_retransmit_stamp(sk);
- if (!start_ts)
- return false;
-
- if (likely(timeout == 0)) {
- linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+ start_ts = tcp_sk(sk)->retrans_stamp;
+ if (likely(timeout == 0))
+ timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);
- if (boundary <= linear_backoff_thresh)
- timeout = ((2 << boundary) - 1) * rto_base;
- else
- timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
- (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
- timeout = jiffies_to_msecs(timeout);
- }
return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
@@ -226,7 +214,7 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
- } else if (!tp->syn_data && !tp->syn_fastopen) {
+ } else {
sk_rethink_txhash(sk);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
@@ -289,14 +277,14 @@ void tcp_delack_timer_handler(struct sock *sk)
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
+ if (!inet_csk_in_pingpong_mode(sk)) {
/* Delayed ACK missed: inflate ATO. */
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_mstamp_refresh(tcp_sk(sk));
@@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
- u32 start_ts;
if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
@@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- start_ts = tcp_skb_timestamp(skb);
- if (!start_ts)
- skb->skb_mstamp_ns = tp->tcp_clock_cache;
- else if (icsk->icsk_user_timeout &&
- (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
- goto abort;
+ if (icsk->icsk_user_timeout) {
+ u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
+ tcp_probe0_base(sk));
+
+ if (elapsed >= icsk->icsk_user_timeout)
+ goto abort;
+ }
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
@@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ struct tcp_sock *tp = tcp_sk(sk);
struct request_sock *req;
req = tcp_sk(sk)->fastopen_rsk;
@@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
icsk->icsk_retransmits++;
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_time_stamp(tp);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
@@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk)
*/
return;
}
- if (!tp->packets_out)
- goto out;
-
- WARN_ON(tcp_rtx_queue_empty(sk));
+ if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
+ return;
tp->tlp_high_seq = 0;
@@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk);
+ icsk->icsk_retransmits++;
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
- * do not backoff.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_retransmits)
- icsk->icsk_retransmits = 1;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RESOURCE_PROBE_INTERVAL,
TCP_RTO_MAX);
goto out;
}
@@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk)
* the 120 second clamps though!
*/
icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
out_reset_timer:
/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3fb0ed5e4789..372fdc5381a9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -562,10 +562,12 @@ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
int (*handler)(struct sk_buff *skb, u32 info);
+ const struct ip_tunnel_encap_ops *encap;
- if (!iptun_encaps[i])
+ encap = rcu_dereference(iptun_encaps[i]);
+ if (!encap)
continue;
- handler = rcu_dereference(iptun_encaps[i]->err_handler);
+ handler = encap->err_handler;
if (handler && !handler(skb, info))
return 0;
}
@@ -847,15 +849,23 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
- if (hlen + cork->gso_size > cork->fragsize)
+ if (hlen + cork->gso_size > cork->fragsize) {
+ kfree_skb(skb);
return -EINVAL;
- if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ }
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
return -EINVAL;
- if (sk->sk_no_check_tx)
+ }
+ if (sk->sk_no_check_tx) {
+ kfree_skb(skb);
return -EINVAL;
+ }
if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
- dst_xfrm(skb_dst(skb)))
+ dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
return -EIO;
+ }
skb_shinfo(skb)->gso_size = cork->gso_size;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
@@ -1918,7 +1928,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
}
EXPORT_SYMBOL(udp_lib_rehash);
-static void udp_v4_rehash(struct sock *sk)
+void udp_v4_rehash(struct sock *sk)
{
u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 322672655419..6b2fa77eeb1c 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -10,6 +10,7 @@ int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
+void udp_v4_rehash(struct sock *sk);
int udp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen);
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index be8b5b2157d8..e93cc0379201 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -21,18 +21,9 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
goto error;
if (cfg->bind_ifindex) {
- struct net_device *dev;
-
- dev = dev_get_by_index(net, cfg->bind_ifindex);
- if (!dev) {
- err = -ENODEV;
- goto error;
- }
-
- err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
- dev->name, strlen(dev->name) + 1);
- dev_put(dev);
-
+ err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX,
+ (void *)&cfg->bind_ifindex,
+ sizeof(cfg->bind_ifindex));
if (err < 0)
goto error;
}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 39c7f17d916f..3c94b8f0ff27 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -53,6 +53,7 @@ struct proto udplite_prot = {
.sendpage = udp_sendpage,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,