aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorWei Wang <weiwan@google.com>2020-09-09 17:50:48 -0700
committerDavid S. Miller <davem@davemloft.net>2020-09-10 13:15:40 -0700
commitac8f1710c12bb4c3626280ce03f05459ba8feef6 (patch)
treef9db7718210545aa9708155babb2b1df39fb0fce /net
parentip: pass tos into ip_build_and_send_pkt() (diff)
downloadlinux-dev-ac8f1710c12bb4c3626280ce03f05459ba8feef6.tar.xz
linux-dev-ac8f1710c12bb4c3626280ce03f05459ba8feef6.zip
tcp: reflect tos value received in SYN to the socket
This commit adds a new TCP feature to reflect the tos value received in SYN, and send it out on the SYN-ACK, and eventually set the tos value of the established socket with this reflected tos value. This provides a way to set the traffic class/QoS level for all traffic in the same connection to be the same as the incoming SYN request. It could be useful in data centers to provide equivalent QoS according to the incoming request. This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos, and is by default turned off. Signed-off-by: Wei Wang <weiwan@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp_ipv4.c10
-rw-r--r--net/ipv6/tcp_ipv6.c10
3 files changed, 27 insertions, 2 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 54023a46db04..3e5f4f2e705e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &comp_sack_nr_max,
},
{
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c4c7ad4c8b5a..ace48b2790ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
@@ -979,6 +980,9 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
+ tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
+
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -986,7 +990,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt),
- inet_sk(sk)->tos);
+ tos & ~INET_ECN_MASK);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 04efa3ee80ef..862058dce6d0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
opt = ireq->ipv6_opt;
+ tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : np->tclass;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+ tclass & ~INET_ECN_MASK,
sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,