aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_semantics.c15
-rw-r--r--net/ipv4/fib_trie.c2
-rw-r--r--net/ipv4/icmp.c10
-rw-r--r--net/ipv4/igmp.c4
-rw-r--r--net/ipv4/inet_fragment.c39
-rw-r--r--net/ipv4/ip_fragment.c8
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/netfilter/Kconfig8
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c4
-rw-r--r--net/ipv4/nexthop.c2
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/raw.c4
-rw-r--r--net/ipv4/route.c17
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c65
-rw-r--r--net/ipv4/tcp_bbr.c6
-rw-r--r--net/ipv4/tcp_bpf.c6
-rw-r--r--net/ipv4/tcp_diag.c52
-rw-r--r--net/ipv4/tcp_input.c84
-rw-r--r--net/ipv4/tcp_ipv4.c16
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/tcp_timer.c2
-rw-r--r--net/ipv4/udp.c7
28 files changed, 312 insertions, 92 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2301ef872e..70f92aaca411 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net)
return 0;
}
-static __net_exit void inet_exit_net(struct net *net)
-{
-}
-
static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
- .exit = inet_exit_net,
};
static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 7bd29e694603..9a0fe0c2fa02 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -15,6 +15,7 @@
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -69,6 +70,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
+ reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
inet->inet_id = jiffies;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e8bc939b56dd..dde77f72e03e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -124,7 +124,8 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2db089e10ba0..0913a090b2bf 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1582,7 +1582,7 @@ failure:
}
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
- unsigned char *flags, bool skip_oif)
+ u8 rt_family, unsigned char *flags, bool skip_oif)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
*flags |= RTNH_F_DEAD;
@@ -1613,7 +1613,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
/* if gateway family does not match nexthop family
* gateway is encoded as RTA_VIA
*/
- if (nhc->nhc_gw_family != nhc->nhc_family) {
+ if (rt_family != nhc->nhc_gw_family) {
int alen = sizeof(struct in6_addr);
struct nlattr *nla;
struct rtvia *via;
@@ -1654,7 +1654,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info);
#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
- int nh_weight)
+ int nh_weight, u8 rt_family)
{
const struct net_device *dev = nhc->nhc_dev;
struct rtnexthop *rtnh;
@@ -1667,7 +1667,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
rtnh->rtnh_hops = nh_weight - 1;
rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
- if (fib_nexthop_info(skb, nhc, &flags, true) < 0)
+ if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
goto nla_put_failure;
rtnh->rtnh_flags = flags;
@@ -1693,13 +1693,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
goto nla_put_failure;
if (unlikely(fi->nh)) {
- if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+ if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
goto nla_put_failure;
goto mp_end;
}
for_nexthops(fi) {
- if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET) < 0)
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid &&
@@ -1775,7 +1776,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
unsigned char flags = 0;
- if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
+ if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
goto nla_put_failure;
rtm->rtm_flags = flags;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b2b3d291ab0..1ab2fb6bb37d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2145,7 +2145,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
if (filter->dump_exceptions) {
err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
- &i_fa, s_fa);
+ &i_fa, s_fa, flags);
if (err < 0)
goto stop;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1510e951f451..4298aae74e0e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,7 +582,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!rt)
goto out;
- net = dev_net(rt->dst.dev);
+
+ if (rt->dst.dev)
+ net = dev_net(rt->dst.dev);
+ else if (skb_in->dev)
+ net = dev_net(skb_in->dev);
+ else
+ goto out;
/*
* Find the original header. It is expected to be valid, of course.
@@ -902,7 +908,7 @@ static bool icmp_redirect(struct sk_buff *skb)
return false;
}
- icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
return true;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 180f6896b98b..480d0b22db1a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
- __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+ __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);
@@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- __ip_mc_inc_group(in_dev, addr, mode);
+ ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
return err;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a999451345f9..10d31733297d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -475,11 +475,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
EXPORT_SYMBOL(inet_frag_reasm_prepare);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
- void *reasm_data)
+ void *reasm_data, bool try_coalesce)
{
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
struct rb_node *rbn;
struct sk_buff *fp;
+ int sum_truesize;
skb_push(head, head->data - skb_network_header(head));
@@ -487,25 +488,41 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
fp = FRAG_CB(head)->next_frag;
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &q->rb_fragments);
+
+ sum_truesize = head->truesize;
while (rbn || fp) {
/* fp points to the next sk_buff in the current run;
* rbn points to the next run.
*/
/* Go through the current run. */
while (fp) {
- *nextp = fp;
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- fp->sk = NULL;
- head->data_len += fp->len;
- head->len += fp->len;
+ struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
+ bool stolen;
+ int delta;
+
+ sum_truesize += fp->truesize;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- fp = FRAG_CB(fp)->next_frag;
+
+ if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
+ &delta)) {
+ kfree_skb_partial(fp, stolen);
+ } else {
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ fp->sk = NULL;
+
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+
+ *nextp = fp;
+ nextp = &fp->next;
+ }
+
+ fp = next_frag;
}
/* Move to the next run. */
if (rbn) {
@@ -516,7 +533,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
rbn = rbnext;
}
}
- sub_frag_mem_limit(q->fqdir, head->truesize);
+ sub_frag_mem_limit(q->fqdir, sum_truesize);
*nextp = NULL;
skb_mark_not_on_list(head);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4385eb9e781f..cfeb8890f94e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -393,6 +393,11 @@ err:
return err;
}
+static bool ip_frag_coalesce_ok(const struct ipq *qp)
+{
+ return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
+
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
@@ -421,7 +426,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
if (len > 65535)
goto out_oversize;
- inet_frag_reasm_finish(&qp->q, skb, reasm_data);
+ inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+ ip_frag_coalesce_ok(qp));
skb->dev = dev;
IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc7ef0d05bbd..5eb73775c3f7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1266,6 +1266,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
+ cork->mark = ipc->sockc.mark;
cork->priority = ipc->priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
@@ -1529,7 +1530,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = cork->mark;
skb->tstamp = cork->transmit_time;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c07bc82cbbe9..313470f6bb14 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1134,8 +1134,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
if (!found) {
/* Create a new entry if allowable */
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ipmr_cache_alloc_unres()) == NULL) {
+ c = ipmr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 69e76d677f9e..f17b402111ce 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -272,7 +272,7 @@ config IP_NF_TARGET_CLUSTERIP
The CLUSTERIP target allows you to build load-balancing clusters of
network servers without having a dedicated load-balancing
router/server/switch.
-
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ECN
@@ -281,7 +281,7 @@ config IP_NF_TARGET_ECN
depends on NETFILTER_ADVANCED
---help---
This option adds a `ECN' target, which can be used in the iptables mangle
- table.
+ table.
You can use this target to remove the ECN bits from the IPv4 header of
an IP packet. This is particularly useful, if you need to work around
@@ -306,7 +306,7 @@ config IP_NF_RAW
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
@@ -318,7 +318,7 @@ config IP_NF_SECURITY
help
This option adds a `security' table to iptables, for use
with Mandatory Access Control (MAC) policy.
-
+
If unsure, say N.
endif # IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c50e0ec095d2..7c497c78105f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
# flow table support
obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
-# generic IP tables
+# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 0e70f3f65f6f..748dc3ce58d3 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
- opts.mss_encode = opts.mss;
- opts.mss = info->mss;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5fe5a3981d43..fc34fd1668d6 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1151,7 +1151,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
.fc_encap_type = cfg->nh_encap_type,
};
u32 tb_id = l3mdev_fib_table(cfg->dev);
- int err = -EINVAL;
+ int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
if (err) {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9d24ef5c5d8f..535427292194 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
} else if (!ipc.oif)
ipc.oif = inet->uc_index;
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
sk->sk_uid);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 40a6abbc9cf6..80da5a66d5d7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = sockc->mark;
skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE,
hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 517300d587a7..b6a6f18c3dd1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2728,7 +2728,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq)
+ struct sk_buff *skb, u32 portid, u32 seq,
+ unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2736,7 +2737,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
if (!nlh)
return -EMSGSIZE;
@@ -2860,7 +2861,7 @@ nla_put_failure:
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, u32 table_id,
struct fnhe_hash_bucket *bucket, int genid,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
int i;
@@ -2891,7 +2892,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
table_id, NULL, skb,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
+ cb->nlh->nlmsg_seq, flags);
if (err)
return err;
next:
@@ -2904,7 +2905,7 @@ next:
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
u32 table_id, struct fib_info *fi,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
struct net *net = sock_net(cb->skb->sk);
int nhsel, genid = fnhe_genid(net);
@@ -2922,7 +2923,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
err = 0;
if (bucket)
err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
- genid, fa_index, fa_start);
+ genid, fa_index, fa_start,
+ flags);
rcu_read_unlock();
if (err)
return err;
@@ -3183,7 +3185,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_tos, res.fi, 0);
} else {
err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
if (err < 0)
goto errout_rcu;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0b980e841927..59ded25acd04 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &tcp_min_snd_mss_max,
},
{
+ .procname = "tcp_mtu_probe_floor",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 776905899ac0..79c325a07ba5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -935,6 +935,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
+/* In some cases, both sendpage() and sendmsg() could have added
+ * an skb to the write queue, but failed adding payload on it.
+ * We need to remove it to consume less memory, but more
+ * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
+ * users.
+ */
+static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb && !skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -984,6 +1000,9 @@ new_segment:
if (!skb)
goto wait_for_memory;
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
skb_entail(sk, skb);
copy = size_goal;
}
@@ -1061,6 +1080,7 @@ out:
return copied;
do_error:
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
if (copied)
goto out;
out_err:
@@ -1162,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
+ int process_backlog = 0;
bool zc = false;
long timeo;
@@ -1254,9 +1274,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
@@ -1264,7 +1285,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
- process_backlog = true;
+ process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -1385,18 +1406,11 @@ out_nopush:
sock_zerocopy_put(uarg);
return copied + copied_syn;
+do_error:
+ skb = tcp_write_queue_tail(sk);
do_fault:
- if (!skb->len) {
- tcp_unlink_write_queue(skb, sk);
- /* It is the one place in all of TCP, except connection
- * reset, where we can be unlinking the send_head.
- */
- if (tcp_write_queue_empty(sk))
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- sk_wmem_free_skb(sk, skb);
- }
+ tcp_remove_empty_skb(sk, skb);
-do_error:
if (copied + copied_syn)
goto out;
out_err:
@@ -1776,19 +1790,21 @@ static int tcp_zerocopy_receive(struct sock *sk,
break;
frags = skb_shinfo(skb)->frags;
while (offset) {
- if (frags->size > offset)
+ if (skb_frag_size(frags) > offset)
goto out;
- offset -= frags->size;
+ offset -= skb_frag_size(frags);
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset) {
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
int remaining = zc->recv_skip_hint;
+ int size = skb_frag_size(frags);
- while (remaining && (frags->size != PAGE_SIZE ||
- frags->page_offset)) {
- remaining -= frags->size;
+ while (remaining && (size != PAGE_SIZE ||
+ skb_frag_off(frags))) {
+ remaining -= size;
frags++;
+ size = skb_frag_size(frags);
}
zc->recv_skip_hint -= remaining;
break;
@@ -2637,6 +2653,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
/* Clean up fastopen related fields */
@@ -3279,6 +3296,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3781,8 +3800,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
sg_set_page(&sg, page, skb_frag_size(f),
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 56be7d27f208..95b59540eee1 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -346,7 +346,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * bdp = bw * min_rtt * gain
+ * bdp = ceil(bw * min_rtt * gain)
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
@@ -370,7 +370,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
w = (u64)bw * bbr->min_rtt_us;
- /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
return bdp;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3d1e15401384..8a56e09cfb0e 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -398,10 +398,14 @@ more_data:
static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_msg tmp, *msg_tx = NULL;
- int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
int copied = 0, err = 0;
struct sk_psock *psock;
long timeo;
+ int flags;
+
+ /* Don't let internal do_tcp_sendpages() flags through */
+ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED);
+ flags |= MSG_NO_SHARED_FRAGS;
psock = sk_psock_get(sk);
if (unlikely(!psock))
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a3a386236d93..81a8221d650a 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -81,13 +81,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
}
#endif
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+ const struct tcp_ulp_ops *ulp_ops)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+ if (err)
+ goto nla_failure;
+
+ if (ulp_ops->get_info)
+ err = ulp_ops->get_info(sk, skb);
+ if (err)
+ goto nla_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_failure:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int err = 0;
+
#ifdef CONFIG_TCP_MD5SIG
if (net_admin) {
struct tcp_md5sig_info *md5sig;
- int err = 0;
rcu_read_lock();
md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
@@ -99,11 +128,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
}
#endif
+ if (net_admin) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops)
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops);
+ if (err)
+ return err;
+ }
return 0;
}
static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
size_t size = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -124,6 +163,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
#endif
+ if (net_admin && sk_fullsock(sk)) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ size += nla_total_size(0) +
+ nla_total_size(TCP_ULP_NAME_MAX);
+ if (ulp_ops->get_info_size)
+ size += ulp_ops->get_info_size(sk);
+ }
+ }
return size;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c21e8a22fb3b..3578357abe30 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -266,7 +266,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
@@ -3782,6 +3782,49 @@ static void smc_parse_options(const struct tcphdr *th,
#endif
}
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ */
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -4512,6 +4555,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -6422,9 +6466,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -6444,7 +6486,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ proto, sk->sk_num, msg);
return want_cookie;
}
@@ -6466,6 +6508,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
}
}
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -6487,7 +6559,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d57641cb3477..fd394ad179a0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1515,6 +1515,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -2637,6 +2652,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6e4afc48d7bb..fec6d67bfd14 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1050,11 +1050,22 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
- else
+ } else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at receiver : This slightly improve GRO performance.
+ * Note that we do not force the PSH flag for non GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
+ }
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
@@ -1320,6 +1331,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
+ skb_copy_decrypted(buff, skb);
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
@@ -1402,7 +1414,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- shinfo->frags[k].page_offset += eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
@@ -1874,6 +1886,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
+ skb_copy_decrypted(buff, skb);
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
@@ -2051,7 +2064,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len)
break;
- if (unlikely(TCP_SKB_CB(skb)->eor))
+ if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
return false;
len -= skb->len;
@@ -2143,6 +2156,7 @@ static int tcp_mtu_probe(struct sock *sk)
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
@@ -2167,6 +2181,7 @@ static int tcp_mtu_probe(struct sock *sk)
* we need to propagate it to the new skb.
*/
TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+ tcp_skb_collapse_tstamp(nskb, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c801cd37cc2a..dbd9d2d0ee63 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
} else {
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
- mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d88821c794fb..cf755156a684 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
+ if (sk->sk_reuseport &&
+ sk->sk_state != TCP_ESTABLISHED) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (result)
+ if (result && !reuseport_has_conns(sk, false))
return result;
}
badness = score;
@@ -1130,7 +1131,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport,