From 3d7682af228fd78dc46bc6bf40e0268ad04521ec Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 Nov 2017 14:25:50 +0000
Subject: rxrpc: Clean up whitespace

Clean up some whitespace from rxrpc.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_event.c  | 2 +-
 net/rxrpc/conn_object.c | 2 +-
 net/rxrpc/input.c       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index bda952ffe6a6..555274ddc514 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -426,7 +426,7 @@ recheck_state:
 	next = call->expect_rx_by;
 
 #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
-	
+
 	set(call->expect_req_by);
 	set(call->expect_term_by);
 	set(call->ack_at);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1aad04a32d5e..c628351eb900 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -424,7 +424,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	if (earliest != now + MAX_JIFFY_OFFSET) {
 		_debug("reschedule reaper %ld", (long)earliest - (long)now);
 		ASSERT(time_after(earliest, now));
-		rxrpc_set_service_reap_timer(rxnet, earliest);		
+		rxrpc_set_service_reap_timer(rxnet, earliest);
 	}
 
 	while (!list_empty(&graveyard)) {
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 23a5e61d8f79..6fc61400337f 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -976,7 +976,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 		rxrpc_reduce_call_timer(call, expect_rx_by, now,
 					rxrpc_timer_set_for_normal);
 	}
-	
+
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_DATA:
 		rxrpc_input_data(call, skb, skew);
@@ -1213,7 +1213,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
 				goto reupgrade;
 			conn->service_id = sp->hdr.serviceId;
 		}
-		
+
 		if (sp->hdr.callNumber == 0) {
 			/* Connection-level packet */
 			_debug("CONN %p {%d}", conn, conn->debug_id);
-- 
cgit v1.2.3-59-g8ed1b


From 5fc62f6a139a7b06b027bf442cd4205619506f59 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 Nov 2017 14:40:41 +0000
Subject: rxrpc: Fix ACK generation from the connection event processor

Repeat terminal ACKs and now terminal ACKs are now generated from the
connection event processor rather from call handling as this allows us to
discard client call structures as soon as possible and free up the channel
for a follow on call.

However, in ACKs so generated, the additional information trailer is
malformed because the padding that's meant to be in the middle isn't
included in what's transmitted.

Fix it so that the 3 bytes of padding are included in the transmission.

Further, the trailer is misaligned because of the padding, so assigment to
the u16 and u32 fields inside it might cause problems on some arches, so
fix this by breaking the padding and the trailer out of the packed struct.

(This also deals with potential compiler weirdies where some of the nested
structs are packed and some aren't).

The symptoms can be seen in wireshark as terminal DUPLICATE or IDLE ACK
packets in which the Max MTU, Interface MTU and rwind fields have weird
values and the Max Packets field is apparently missing.

Reported-by: Jeffrey Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/conn_event.c | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 9e9a8db1bc9c..4ca11be6be3c 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -30,22 +30,18 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL;
 	struct rxrpc_channel *chan;
 	struct msghdr msg;
-	struct kvec iov;
+	struct kvec iov[3];
 	struct {
 		struct rxrpc_wire_header whdr;
 		union {
-			struct {
-				__be32 code;
-			} abort;
-			struct {
-				struct rxrpc_ackpacket ack;
-				u8 padding[3];
-				struct rxrpc_ackinfo info;
-			};
+			__be32 abort_code;
+			struct rxrpc_ackpacket ack;
 		};
 	} __attribute__((packed)) pkt;
+	struct rxrpc_ackinfo ack_info;
 	size_t len;
-	u32 serial, mtu, call_id;
+	int ioc;
+	u32 serial, mtu, call_id, padding;
 
 	_enter("%d", conn->debug_id);
 
@@ -66,6 +62,13 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
+	iov[0].iov_base	= &pkt;
+	iov[0].iov_len	= sizeof(pkt.whdr);
+	iov[1].iov_base	= &padding;
+	iov[1].iov_len	= 3;
+	iov[2].iov_base	= &ack_info;
+	iov[2].iov_len	= sizeof(ack_info);
+
 	pkt.whdr.epoch		= htonl(conn->proto.epoch);
 	pkt.whdr.cid		= htonl(conn->proto.cid);
 	pkt.whdr.callNumber	= htonl(call_id);
@@ -80,8 +83,10 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	len = sizeof(pkt.whdr);
 	switch (chan->last_type) {
 	case RXRPC_PACKET_TYPE_ABORT:
-		pkt.abort.code	= htonl(chan->last_abort);
-		len += sizeof(pkt.abort);
+		pkt.abort_code	= htonl(chan->last_abort);
+		iov[0].iov_len += sizeof(pkt.abort_code);
+		len += sizeof(pkt.abort_code);
+		ioc = 1;
 		break;
 
 	case RXRPC_PACKET_TYPE_ACK:
@@ -94,13 +99,19 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		pkt.ack.serial		= htonl(skb ? sp->hdr.serial : 0);
 		pkt.ack.reason		= skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
 		pkt.ack.nAcks		= 0;
-		pkt.info.rxMTU		= htonl(rxrpc_rx_mtu);
-		pkt.info.maxMTU		= htonl(mtu);
-		pkt.info.rwind		= htonl(rxrpc_rx_window_size);
-		pkt.info.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
+		ack_info.rxMTU		= htonl(rxrpc_rx_mtu);
+		ack_info.maxMTU		= htonl(mtu);
+		ack_info.rwind		= htonl(rxrpc_rx_window_size);
+		ack_info.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
 		pkt.whdr.flags		|= RXRPC_SLOW_START_OK;
-		len += sizeof(pkt.ack) + sizeof(pkt.info);
+		padding			= 0;
+		iov[0].iov_len += sizeof(pkt.ack);
+		len += sizeof(pkt.ack) + 3 + sizeof(ack_info);
+		ioc = 3;
 		break;
+
+	default:
+		return;
 	}
 
 	/* Resync with __rxrpc_disconnect_call() and check that the last call
@@ -110,9 +121,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	if (READ_ONCE(chan->last_call) != call_id)
 		return;
 
-	iov.iov_base	= &pkt;
-	iov.iov_len	= len;
-
 	serial = atomic_inc_return(&conn->serial);
 	pkt.whdr.serial = htonl(serial);
 
@@ -127,7 +135,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		break;
 	}
 
-	kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len);
+	kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
 	_leave("");
 	return;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 282ef4729195c8503f7101d574acfb5e7c8a8209 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Tue, 28 Nov 2017 11:28:52 -0600
Subject: rxrpc: Fix variable overwrite

Values assigned to both variable resend_at and ack_at are overwritten
before they can be used.

The correct fix here is to add 'now' to the previously computed value in
resend_at and ack_at.

Addresses-Coverity-ID: 1462262
Addresses-Coverity-ID: 1462263
Addresses-Coverity-ID: 1462264
Fixes: beb8e5e4f38c ("rxrpc: Express protocol timeouts in terms of RTT")
Link: https://marc.info/?i=17004.1511808959%40warthog.procyon.org.uk
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_event.c | 2 +-
 net/rxrpc/sendmsg.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 555274ddc514..ad2ab1103189 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -123,7 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		else
 			ack_at = expiry;
 
-		ack_at = jiffies + expiry;
+		ack_at += now;
 		if (time_before(ack_at, call->ack_at)) {
 			WRITE_ONCE(call->ack_at, ack_at);
 			rxrpc_reduce_call_timer(call, ack_at, now,
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index a1c53ac066a1..09f2a3e05221 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -233,7 +233,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 		if (resend_at < 1)
 			resend_at = 1;
 
-		resend_at = now + rxrpc_resend_timeout;
+		resend_at += now;
 		WRITE_ONCE(call->resend_at, resend_at);
 		rxrpc_reduce_call_timer(call, resend_at, now,
 					rxrpc_timer_set_for_send);
-- 
cgit v1.2.3-59-g8ed1b


From 4ba161a793d5f43757c35feff258d9f20a082940 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 24 Nov 2017 12:00:24 -0500
Subject: SUNRPC: Allow connect to return EHOSTUNREACH

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtsock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 684e356b40e4..c2b2d489b57b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2440,6 +2440,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	case -ECONNREFUSED:
 	case -ECONNRESET:
 	case -ENETUNREACH:
+	case -EHOSTUNREACH:
 	case -EADDRINUSE:
 	case -ENOBUFS:
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From 90a6ec85351b31449c2c6b5406b5396ac96f191d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 29 Nov 2017 16:07:51 -0800
Subject: act_sample: get rid of tcf_sample_cleanup_rcu()

Similar to commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu"),
TC actions don't need to respect RCU grace period, because it
is either just detached from tc filter (standalone case) or
it is removed together with tc filter (bound case) in which case
RCU grace period is already respected at filter layer.

Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_sample.h |  1 -
 net/sched/act_sample.c         | 14 +++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index 524cee4f4c81..01dbfea32672 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -14,7 +14,6 @@ struct tcf_sample {
 	struct psample_group __rcu *psample_group;
 	u32 psample_group_num;
 	struct list_head tcfm_list;
-	struct rcu_head rcu;
 };
 #define to_sample(a) ((struct tcf_sample *)a)
 
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 8b5abcd2f32f..9438969290a6 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -96,23 +96,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
+static void tcf_sample_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
+	struct tcf_sample *s = to_sample(a);
 	struct psample_group *psample_group;
 
-	psample_group = rcu_dereference_protected(s->psample_group, 1);
+	psample_group = rtnl_dereference(s->psample_group);
 	RCU_INIT_POINTER(s->psample_group, NULL);
 	psample_group_put(psample_group);
 }
 
-static void tcf_sample_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_sample *s = to_sample(a);
-
-	call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
-}
-
 static bool tcf_sample_dev_ok_push(struct net_device *dev)
 {
 	switch (dev->type) {
@@ -264,7 +257,6 @@ static int __init sample_init_module(void)
 
 static void __exit sample_cleanup_module(void)
 {
-	rcu_barrier();
 	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 3016dad75b48279e579117ee3ed566ba90a3b023 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 29 Nov 2017 17:43:57 -0800
Subject: tcp: remove buggy call to tcp_v6_restore_cb()

tcp_v6_send_reset() expects to receive an skb with skb->cb[] layout as
used in TCP stack.
MD5 lookup uses tcp_v6_iif() and tcp_v6_sdif() and thus
TCP_SKB_CB(skb)->header.h6

This patch probably fixes RST packets sent on behalf of a timewait md5
ipv6 socket.

Before Florian patch, tcp_v6_restore_cb() was needed before jumping to
no_tcp_socket label.

Fixes: 271c3b9b7bda ("tcp: honour SO_BINDTODEVICE for TW_RST case too")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6bb98c93edfe..be11dc13aa70 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1590,7 +1590,6 @@ do_time_wait:
 		tcp_v6_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v6_restore_cb(skb);
 		tcp_v6_send_reset(sk, skb);
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
-- 
cgit v1.2.3-59-g8ed1b


From f859b4af1c52493ec21173ccc73d0b60029b5b88 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 30 Nov 2017 10:41:14 +0800
Subject: sit: update frag_off info

After parsing the sit netlink change info, we forget to update frag_off in
ipip6_tunnel_update(). Fix it by assigning frag_off with new value.

Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d60ddcb0bfe2..d7dc23c1b2ca 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1098,6 +1098,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
 	ipip6_tunnel_link(sitn, t);
 	t->parms.iph.ttl = p->iph.ttl;
 	t->parms.iph.tos = p->iph.tos;
+	t->parms.iph.frag_off = p->iph.frag_off;
 	if (t->parms.link != p->link || t->fwmark != fwmark) {
 		t->parms.link = p->link;
 		t->fwmark = fwmark;
-- 
cgit v1.2.3-59-g8ed1b


From eb5b46faa693470681ec7c28cc2436edd1571198 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 30 Nov 2017 07:21:33 -0500
Subject: SUNRPC: Handle ENETDOWN errors

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/clnt.c     | 5 +++++
 net/sunrpc/xprtsock.c | 1 +
 2 files changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a801da812f86..e2a4184f3c5d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1841,6 +1841,7 @@ call_bind_status(struct rpc_task *task)
 	case -ECONNABORTED:
 	case -ENOTCONN:
 	case -EHOSTDOWN:
+	case -ENETDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
 	case -ENOBUFS:
@@ -1917,6 +1918,7 @@ call_connect_status(struct rpc_task *task)
 		/* fall through */
 	case -ECONNRESET:
 	case -ECONNABORTED:
+	case -ENETDOWN:
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
 	case -EADDRINUSE:
@@ -2022,6 +2024,7 @@ call_transmit_status(struct rpc_task *task)
 		 */
 	case -ECONNREFUSED:
 	case -EHOSTDOWN:
+	case -ENETDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
 	case -EPERM:
@@ -2071,6 +2074,7 @@ call_bc_transmit(struct rpc_task *task)
 	switch (task->tk_status) {
 	case 0:
 		/* Success */
+	case -ENETDOWN:
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
@@ -2139,6 +2143,7 @@ call_status(struct rpc_task *task)
 	task->tk_status = 0;
 	switch(status) {
 	case -EHOSTDOWN:
+	case -ENETDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
 	case -EPERM:
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c2b2d489b57b..7fa94abfd6b2 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2439,6 +2439,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 		 */
 	case -ECONNREFUSED:
 	case -ECONNRESET:
+	case -ENETDOWN:
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
 	case -EADDRINUSE:
-- 
cgit v1.2.3-59-g8ed1b


From d30fc5126efb0c33b7adf5966d3051db2c3d7721 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:18:34 +0800
Subject: sctp: only update outstanding_bytes for transmitted queue when doing
 prsctp_prune

Now outstanding_bytes is only increased when appending chunks into one
packet and sending it at 1st time, while decreased when it is about to
move into retransmit queue. It means outstanding_bytes value is already
decreased for all chunks in retransmit queue.

However sctp_prsctp_prune_sent is a common function to check the chunks
in both transmitted and retransmit queue, it decrease outstanding_bytes
when moving a chunk into abandoned queue from either of them.

It could cause outstanding_bytes underflow, as it also decreases it's
value for the chunks in retransmit queue.

This patch fixes it by only updating outstanding_bytes for transmitted
queue when pruning queues for prsctp prio policy, the same fix is also
needed in sctp_check_transmitted.

Fixes: 8dbdf1f5b09c ("sctp: implement prsctp PRIO policy")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4db012aa25f7..7029f8b99063 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -377,7 +377,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 		asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
 		streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
 
-		if (!chk->tsn_gap_acked) {
+		if (queue != &asoc->outqueue.retransmit &&
+		    !chk->tsn_gap_acked) {
 			if (chk->transport)
 				chk->transport->flight_size -=
 						sctp_data_size(chk);
@@ -1434,7 +1435,8 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 			/* If this chunk has not been acked, stop
 			 * considering it as 'outstanding'.
 			 */
-			if (!tchunk->tsn_gap_acked) {
+			if (transmitted_queue != &q->retransmit &&
+			    !tchunk->tsn_gap_acked) {
 				if (tchunk->transport)
 					tchunk->transport->flight_size -=
 							sctp_data_size(tchunk);
-- 
cgit v1.2.3-59-g8ed1b


From e5f612969c6f965e3bd1158598e0a3b1c4f389b9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:18:35 +0800
Subject: sctp: abandon the whole msg if one part of a fragmented message is
 abandoned

As rfc3758#section-3.1 demands:

   A3) When a TSN is "abandoned", if it is part of a fragmented message,
       all other TSN's within that fragmented message MUST be abandoned
       at the same time.

Besides, if it couldn't handle this, the rest frags would never get
assembled in peer side.

This patch supports it by adding abandoned flag in sctp_datamsg, when
one chunk is being abandoned, set chunk->msg->abandoned as well. Next
time when checking for abandoned, go checking chunk->msg->abandoned
first.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  3 ++-
 net/sctp/chunk.c           |  7 +++++++
 net/sctp/outqueue.c        | 12 ++++++++----
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 16f949eef52f..2f8f93da5dc2 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -503,7 +503,8 @@ struct sctp_datamsg {
 	/* Did the messenge fail to send? */
 	int send_error;
 	u8 send_failed:1,
-	   can_delay;	    /* should this message be Nagle delayed */
+	   can_delay:1,	/* should this message be Nagle delayed */
+	   abandoned:1;	/* should this message be abandoned */
 };
 
 struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7b261afc47b9..9213805b558d 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -53,6 +53,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
 	msg->send_failed = 0;
 	msg->send_error = 0;
 	msg->can_delay = 1;
+	msg->abandoned = 0;
 	msg->expires_at = 0;
 	INIT_LIST_HEAD(&msg->chunks);
 }
@@ -304,6 +305,9 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 	if (!chunk->asoc->peer.prsctp_capable)
 		return 0;
 
+	if (chunk->msg->abandoned)
+		return 1;
+
 	if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
 	    time_after(jiffies, chunk->msg->expires_at)) {
 		struct sctp_stream_out *streamout =
@@ -316,6 +320,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 			chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 		}
+		chunk->msg->abandoned = 1;
 		return 1;
 	} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
 		   chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
@@ -324,10 +329,12 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 
 		chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
 		streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
+		chunk->msg->abandoned = 1;
 		return 1;
 	} else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
 		   chunk->msg->expires_at &&
 		   time_after(jiffies, chunk->msg->expires_at)) {
+		chunk->msg->abandoned = 1;
 		return 1;
 	}
 	/* PRIO policy is processed by sendmsg, not here */
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7029f8b99063..4ab164b5aad0 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -364,10 +364,12 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 	list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
 		struct sctp_stream_out *streamout;
 
-		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+		if (!chk->msg->abandoned &&
+		    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
 			continue;
 
+		chk->msg->abandoned = 1;
 		list_del_init(&chk->transmitted_list);
 		sctp_insert_list(&asoc->outqueue.abandoned,
 				 &chk->transmitted_list);
@@ -404,10 +406,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 	q->sched->unsched_all(&asoc->stream);
 
 	list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
-		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+		if (!chk->msg->abandoned &&
+		    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
 			continue;
 
+		chk->msg->abandoned = 1;
 		sctp_sched_dequeue_common(q, chk);
 		asoc->sent_cnt_removable--;
 		asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
-- 
cgit v1.2.3-59-g8ed1b


From 779edd7348878a7376c0e3d0f96485c30b5f1b7d Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:18:36 +0800
Subject: sctp: do not abandon the other frags in unsent outq if one msg has
 outstanding frags

Now for the abandoned chunks in unsent outq, it would just free the chunks.
Because no tsn is assigned to them yet, there's no need to send fwd tsn to
peer, unlike for the abandoned chunks in sent outq.

The problem is when parts of the msg have been sent and the other frags
are still in unsent outq, if they are abandoned/dropped, the peer would
never get this msg reassembled.

So these frags in unsent outq can't be dropped if this msg already has
outstanding frags.

This patch does the check in sctp_chunk_abandoned and
sctp_prsctp_prune_unsent.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/chunk.c    | 4 ++++
 net/sctp/outqueue.c | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 9213805b558d..7f8baa48e7c2 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -308,6 +308,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 	if (chunk->msg->abandoned)
 		return 1;
 
+	if (!chunk->has_tsn &&
+	    !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG))
+		return 0;
+
 	if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
 	    time_after(jiffies, chunk->msg->expires_at)) {
 		struct sctp_stream_out *streamout =
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4ab164b5aad0..7d67feeeffc1 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -407,7 +407,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 
 	list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
 		if (!chk->msg->abandoned &&
-		    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		    (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) ||
+		     !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
 		     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
 			continue;
 
-- 
cgit v1.2.3-59-g8ed1b


From cfac7f836a715b91f08c851df915d401a4d52783 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Dec 2017 10:06:56 -0800
Subject: tcp/dccp: block bh before arming time_wait timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maciej Żenczykowski reported some panics in tcp_twsk_destructor()
that might be caused by the following bug.

timewait timer is pinned to the cpu, because we want to transition
timwewait refcount from 0 to 4 in one go, once everything has been
initialized.

At the time commit ed2e92394589 ("tcp/dccp: fix timewait races in timer
handling") was merged, TCP was always running from BH habdler.

After commit 5413d1babe8f ("net: do not block BH while processing
socket backlog") we definitely can run tcp_time_wait() from process
context.

We need to block BH in the critical section so that the pinned timer
has still its purpose.

This bug is more likely to happen under stress and when very small RTO
are used in datacenter flows.

Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Acked-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/minisocks.c     | 6 ++++++
 net/ipv4/tcp_minisocks.c | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index abd07a443219..178bb9833311 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -57,10 +57,16 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 		if (state == DCCP_TIME_WAIT)
 			timeo = DCCP_TIMEWAIT_LEN;
 
+		/* tw_timer is pinned, so we need to make sure BH are disabled
+		 * in following section, otherwise timer handler could run before
+		 * we complete the initialization.
+		 */
+		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
 		/* Linkage updates. */
 		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
 		inet_twsk_put(tw);
+		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e36eff0403f4..b079b619b60c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -310,10 +310,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		if (state == TCP_TIME_WAIT)
 			timeo = TCP_TIMEWAIT_LEN;
 
+		/* tw_timer is pinned, so we need to make sure BH are disabled
+		 * in following section, otherwise timer handler could run before
+		 * we complete the initialization.
+		 */
+		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
 		/* Linkage updates. */
 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 		inet_twsk_put(tw);
+		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
-- 
cgit v1.2.3-59-g8ed1b


From c7799c067c2ae33e348508c8afec354f3257ff25 Mon Sep 17 00:00:00 2001
From: Tommi Rantala <tommi.t.rantala@nokia.com>
Date: Wed, 29 Nov 2017 12:48:42 +0200
Subject: tipc: call tipc_rcv() only if bearer is up in tipc_udp_recv()

Remove the second tipc_rcv() call in tipc_udp_recv(). We have just
checked that the bearer is not up, and calling tipc_rcv() with a bearer
that is not up leads to a TIPC div-by-zero crash in
tipc_node_calculate_timer(). The crash is rare in practice, but can
happen like this:

  We're enabling a bearer, but it's not yet up and fully initialized.
  At the same time we receive a discovery packet, and in tipc_udp_recv()
  we end up calling tipc_rcv() with the not-yet-initialized bearer,
  causing later the div-by-zero crash in tipc_node_calculate_timer().

Jon Maloy explains the impact of removing the second tipc_rcv() call:
  "link setup in the worst case will be delayed until the next arriving
   discovery messages, 1 sec later, and this is an acceptable delay."

As the tipc_rcv() call is removed, just leave the function via the
rcu_out label, so that we will kfree_skb().

[   12.590450] Own node address <1.1.1>, network identity 1
[   12.668088] divide error: 0000 [#1] SMP
[   12.676952] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.14.2-dirty #1
[   12.679225] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
[   12.682095] task: ffff8c2a761edb80 task.stack: ffffa41cc0cac000
[   12.684087] RIP: 0010:tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc]
[   12.686486] RSP: 0018:ffff8c2a7fc838a0 EFLAGS: 00010246
[   12.688451] RAX: 0000000000000000 RBX: ffff8c2a5b382600 RCX: 0000000000000000
[   12.691197] RDX: 0000000000000000 RSI: ffff8c2a5b382600 RDI: ffff8c2a5b382600
[   12.693945] RBP: ffff8c2a7fc838b0 R08: 0000000000000001 R09: 0000000000000001
[   12.696632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8c2a5d8949d8
[   12.699491] R13: ffffffff95ede400 R14: 0000000000000000 R15: ffff8c2a5d894800
[   12.702338] FS:  0000000000000000(0000) GS:ffff8c2a7fc80000(0000) knlGS:0000000000000000
[   12.705099] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   12.706776] CR2: 0000000001bb9440 CR3: 00000000bd009001 CR4: 00000000003606e0
[   12.708847] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   12.711016] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   12.712627] Call Trace:
[   12.713390]  <IRQ>
[   12.714011]  tipc_node_check_dest+0x2e8/0x350 [tipc]
[   12.715286]  tipc_disc_rcv+0x14d/0x1d0 [tipc]
[   12.716370]  tipc_rcv+0x8b0/0xd40 [tipc]
[   12.717396]  ? minmax_running_min+0x2f/0x60
[   12.718248]  ? dst_alloc+0x4c/0xa0
[   12.718964]  ? tcp_ack+0xaf1/0x10b0
[   12.719658]  ? tipc_udp_is_known_peer+0xa0/0xa0 [tipc]
[   12.720634]  tipc_udp_recv+0x71/0x1d0 [tipc]
[   12.721459]  ? dst_alloc+0x4c/0xa0
[   12.722130]  udp_queue_rcv_skb+0x264/0x490
[   12.722924]  __udp4_lib_rcv+0x21e/0x990
[   12.723670]  ? ip_route_input_rcu+0x2dd/0xbf0
[   12.724442]  ? tcp_v4_rcv+0x958/0xa40
[   12.725039]  udp_rcv+0x1a/0x20
[   12.725587]  ip_local_deliver_finish+0x97/0x1d0
[   12.726323]  ip_local_deliver+0xaf/0xc0
[   12.726959]  ? ip_route_input_noref+0x19/0x20
[   12.727689]  ip_rcv_finish+0xdd/0x3b0
[   12.728307]  ip_rcv+0x2ac/0x360
[   12.728839]  __netif_receive_skb_core+0x6fb/0xa90
[   12.729580]  ? udp4_gro_receive+0x1a7/0x2c0
[   12.730274]  __netif_receive_skb+0x1d/0x60
[   12.730953]  ? __netif_receive_skb+0x1d/0x60
[   12.731637]  netif_receive_skb_internal+0x37/0xd0
[   12.732371]  napi_gro_receive+0xc7/0xf0
[   12.732920]  receive_buf+0x3c3/0xd40
[   12.733441]  virtnet_poll+0xb1/0x250
[   12.733944]  net_rx_action+0x23e/0x370
[   12.734476]  __do_softirq+0xc5/0x2f8
[   12.734922]  irq_exit+0xfa/0x100
[   12.735315]  do_IRQ+0x4f/0xd0
[   12.735680]  common_interrupt+0xa2/0xa2
[   12.736126]  </IRQ>
[   12.736416] RIP: 0010:native_safe_halt+0x6/0x10
[   12.736925] RSP: 0018:ffffa41cc0cafe90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff4d
[   12.737756] RAX: 0000000000000000 RBX: ffff8c2a761edb80 RCX: 0000000000000000
[   12.738504] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[   12.739258] RBP: ffffa41cc0cafe90 R08: 0000014b5b9795e5 R09: ffffa41cc12c7e88
[   12.740118] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
[   12.740964] R13: ffff8c2a761edb80 R14: 0000000000000000 R15: 0000000000000000
[   12.741831]  default_idle+0x2a/0x100
[   12.742323]  arch_cpu_idle+0xf/0x20
[   12.742796]  default_idle_call+0x28/0x40
[   12.743312]  do_idle+0x179/0x1f0
[   12.743761]  cpu_startup_entry+0x1d/0x20
[   12.744291]  start_secondary+0x112/0x120
[   12.744816]  secondary_startup_64+0xa5/0xa5
[   12.745367] Code: b9 f4 01 00 00 48 89 c2 48 c1 ea 02 48 3d d3 07 00
00 48 0f 47 d1 49 8b 0c 24 48 39 d1 76 07 49 89 14 24 48 89 d1 31 d2 48
89 df <48> f7 f1 89 c6 e8 81 6e ff ff 5b 41 5c 5d c3 66 90 66 2e 0f 1f
[   12.747527] RIP: tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] RSP: ffff8c2a7fc838a0
[   12.748555] ---[ end trace 1399ab83390650fd ]---
[   12.749296] Kernel panic - not syncing: Fatal exception in interrupt
[   12.750123] Kernel Offset: 0x13200000 from 0xffffffff82000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[   12.751215] Rebooting in 60 seconds..

Fixes: c9b64d492b1f ("tipc: add replicast peer discovery")
Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index ecca64fc6a6f..3deabcab4882 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -371,10 +371,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
 			goto rcu_out;
 	}
 
-	tipc_rcv(sock_net(sk), skb, b);
-	rcu_read_unlock();
-	return 0;
-
 rcu_out:
 	rcu_read_unlock();
 out:
-- 
cgit v1.2.3-59-g8ed1b


From c501256406fb19c306504ee1fe41a4ea208d4245 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 1 Dec 2017 11:09:53 +0000
Subject: rxrpc: Use correct netns source in rxrpc_release_sock()

In rxrpc_release_sock() there may be no rx->local value to access, so we
can't unconditionally follow it to the rxrpc network namespace information
to poke the connection reapers.

Instead, use the socket's namespace pointer to find the namespace.

This unfixed code causes the following static checker warning:

	net/rxrpc/af_rxrpc.c:898 rxrpc_release_sock()
	error: we previously assumed 'rx->local' could be null (see line 887)

Fixes: 3d18cbb7fd0c ("rxrpc: Fix conn expiry timers")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/af_rxrpc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 8f7cf4c042be..dcd818fa837e 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -860,6 +860,7 @@ static void rxrpc_sock_destructor(struct sock *sk)
 static int rxrpc_release_sock(struct sock *sk)
 {
 	struct rxrpc_sock *rx = rxrpc_sk(sk);
+	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
 
 	_enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt));
 
@@ -895,8 +896,8 @@ static int rxrpc_release_sock(struct sock *sk)
 	rxrpc_release_calls_on_socket(rx);
 	flush_workqueue(rxrpc_workqueue);
 	rxrpc_purge_queue(&sk->sk_receive_queue);
-	rxrpc_queue_work(&rx->local->rxnet->service_conn_reaper);
-	rxrpc_queue_work(&rx->local->rxnet->client_conn_reaper);
+	rxrpc_queue_work(&rxnet->service_conn_reaper);
+	rxrpc_queue_work(&rxnet->client_conn_reaper);
 
 	rxrpc_put_local(rx->local);
 	rx->local = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From eeea10b83a139451130df1594f26710c8fa390c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 Dec 2017 09:32:59 -0800
Subject: tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()

James Morris reported kernel stack corruption bug [1] while
running the SELinux testsuite, and bisected to a recent
commit bffa72cf7f9d ("net: sk_buff rbnode reorg")

We believe this commit is fine, but exposes an older bug.

SELinux code runs from tcp_filter() and might send an ICMP,
expecting IP options to be found in skb->cb[] using regular IPCB placement.

We need to defer TCP mangling of skb->cb[] after tcp_filter() calls.

This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very
similar way we added them for IPv6.

[1]
[  339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet
[  339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5
[  339.822505]
[  339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15
[  339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A   01/19/2017
[  339.885060] Call Trace:
[  339.896875]  <IRQ>
[  339.908103]  dump_stack+0x63/0x87
[  339.920645]  panic+0xe8/0x248
[  339.932668]  ? ip_push_pending_frames+0x33/0x40
[  339.946328]  ? icmp_send+0x525/0x530
[  339.958861]  ? kfree_skbmem+0x60/0x70
[  339.971431]  __stack_chk_fail+0x1b/0x20
[  339.984049]  icmp_send+0x525/0x530
[  339.996205]  ? netlbl_skbuff_err+0x36/0x40
[  340.008997]  ? selinux_netlbl_err+0x11/0x20
[  340.021816]  ? selinux_socket_sock_rcv_skb+0x211/0x230
[  340.035529]  ? security_sock_rcv_skb+0x3b/0x50
[  340.048471]  ? sk_filter_trim_cap+0x44/0x1c0
[  340.061246]  ? tcp_v4_inbound_md5_hash+0x69/0x1b0
[  340.074562]  ? tcp_filter+0x2c/0x40
[  340.086400]  ? tcp_v4_rcv+0x820/0xa20
[  340.098329]  ? ip_local_deliver_finish+0x71/0x1a0
[  340.111279]  ? ip_local_deliver+0x6f/0xe0
[  340.123535]  ? ip_rcv_finish+0x3a0/0x3a0
[  340.135523]  ? ip_rcv_finish+0xdb/0x3a0
[  340.147442]  ? ip_rcv+0x27c/0x3c0
[  340.158668]  ? inet_del_offload+0x40/0x40
[  340.170580]  ? __netif_receive_skb_core+0x4ac/0x900
[  340.183285]  ? rcu_accelerate_cbs+0x5b/0x80
[  340.195282]  ? __netif_receive_skb+0x18/0x60
[  340.207288]  ? process_backlog+0x95/0x140
[  340.218948]  ? net_rx_action+0x26c/0x3b0
[  340.230416]  ? __do_softirq+0xc9/0x26a
[  340.241625]  ? do_softirq_own_stack+0x2a/0x40
[  340.253368]  </IRQ>
[  340.262673]  ? do_softirq+0x50/0x60
[  340.273450]  ? __local_bh_enable_ip+0x57/0x60
[  340.285045]  ? ip_finish_output2+0x175/0x350
[  340.296403]  ? ip_finish_output+0x127/0x1d0
[  340.307665]  ? nf_hook_slow+0x3c/0xb0
[  340.318230]  ? ip_output+0x72/0xe0
[  340.328524]  ? ip_fragment.constprop.54+0x80/0x80
[  340.340070]  ? ip_local_out+0x35/0x40
[  340.350497]  ? ip_queue_xmit+0x15c/0x3f0
[  340.361060]  ? __kmalloc_reserve.isra.40+0x31/0x90
[  340.372484]  ? __skb_clone+0x2e/0x130
[  340.382633]  ? tcp_transmit_skb+0x558/0xa10
[  340.393262]  ? tcp_connect+0x938/0xad0
[  340.403370]  ? ktime_get_with_offset+0x4c/0xb0
[  340.414206]  ? tcp_v4_connect+0x457/0x4e0
[  340.424471]  ? __inet_stream_connect+0xb3/0x300
[  340.435195]  ? inet_stream_connect+0x3b/0x60
[  340.445607]  ? SYSC_connect+0xd9/0x110
[  340.455455]  ? __audit_syscall_entry+0xaf/0x100
[  340.466112]  ? syscall_trace_enter+0x1d0/0x2b0
[  340.476636]  ? __audit_syscall_exit+0x209/0x290
[  340.487151]  ? SyS_connect+0xe/0x10
[  340.496453]  ? do_syscall_64+0x67/0x1b0
[  340.506078]  ? entry_SYSCALL64_slow_path+0x25/0x25

Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: James Morris <james.l.morris@oracle.com>
Tested-by: James Morris <james.l.morris@oracle.com>
Tested-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 59 ++++++++++++++++++++++++++++++++++++-----------------
 net/ipv6/tcp_ipv6.c | 10 +++++----
 2 files changed, 46 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c6bc0c4d19c6..77ea45da0fe9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1591,6 +1591,34 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_filter);
 
+static void tcp_v4_restore_cb(struct sk_buff *skb)
+{
+	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
+		sizeof(struct inet_skb_parm));
+}
+
+static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
+			   const struct tcphdr *th)
+{
+	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
+	 * barrier() makes sure compiler wont play fool^Waliasing games.
+	 */
+	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+		sizeof(struct inet_skb_parm));
+	barrier();
+
+	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+				    skb->len - th->doff * 4);
+	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+	TCP_SKB_CB(skb)->sacked	 = 0;
+	TCP_SKB_CB(skb)->has_rxtstamp =
+			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+}
+
 /*
  *	From tcp_input.c
  */
@@ -1631,24 +1659,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 
 	th = (const struct tcphdr *)skb->data;
 	iph = ip_hdr(skb);
-	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
-	 * barrier() makes sure compiler wont play fool^Waliasing games.
-	 */
-	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
-		sizeof(struct inet_skb_parm));
-	barrier();
-
-	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
-	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
-				    skb->len - th->doff * 4);
-	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
-	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
-	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
-	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
-	TCP_SKB_CB(skb)->sacked	 = 0;
-	TCP_SKB_CB(skb)->has_rxtstamp =
-			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
-
 lookup:
 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
 			       th->dest, sdif, &refcounted);
@@ -1679,14 +1689,19 @@ process:
 		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb))
+		if (!tcp_filter(sk, skb)) {
+			th = (const struct tcphdr *)skb->data;
+			iph = ip_hdr(skb);
+			tcp_v4_fill_cb(skb, iph, th);
 			nsk = tcp_check_req(sk, skb, req, false);
+		}
 		if (!nsk) {
 			reqsk_put(req);
 			goto discard_and_relse;
 		}
 		if (nsk == sk) {
 			reqsk_put(req);
+			tcp_v4_restore_cb(skb);
 		} else if (tcp_child_process(sk, nsk, skb)) {
 			tcp_v4_send_reset(nsk, skb);
 			goto discard_and_relse;
@@ -1712,6 +1727,7 @@ process:
 		goto discard_and_relse;
 	th = (const struct tcphdr *)skb->data;
 	iph = ip_hdr(skb);
+	tcp_v4_fill_cb(skb, iph, th);
 
 	skb->dev = NULL;
 
@@ -1742,6 +1758,8 @@ no_tcp_socket:
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 		goto discard_it;
 
+	tcp_v4_fill_cb(skb, iph, th);
+
 	if (tcp_checksum_complete(skb)) {
 csum_error:
 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
@@ -1768,6 +1786,8 @@ do_time_wait:
 		goto discard_it;
 	}
 
+	tcp_v4_fill_cb(skb, iph, th);
+
 	if (tcp_checksum_complete(skb)) {
 		inet_twsk_put(inet_twsk(sk));
 		goto csum_error;
@@ -1784,6 +1804,7 @@ do_time_wait:
 		if (sk2) {
 			inet_twsk_deschedule_put(inet_twsk(sk));
 			sk = sk2;
+			tcp_v4_restore_cb(skb);
 			refcounted = false;
 			goto process;
 		}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index be11dc13aa70..1f04ec0e4a7a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1454,7 +1454,6 @@ process:
 		struct sock *nsk;
 
 		sk = req->rsk_listener;
-		tcp_v6_fill_cb(skb, hdr, th);
 		if (tcp_v6_inbound_md5_hash(sk, skb)) {
 			sk_drops_add(sk, skb);
 			reqsk_put(req);
@@ -1467,8 +1466,12 @@ process:
 		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb))
+		if (!tcp_filter(sk, skb)) {
+			th = (const struct tcphdr *)skb->data;
+			hdr = ipv6_hdr(skb);
+			tcp_v6_fill_cb(skb, hdr, th);
 			nsk = tcp_check_req(sk, skb, req, false);
+		}
 		if (!nsk) {
 			reqsk_put(req);
 			goto discard_and_relse;
@@ -1492,8 +1495,6 @@ process:
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
 
-	tcp_v6_fill_cb(skb, hdr, th);
-
 	if (tcp_v6_inbound_md5_hash(sk, skb))
 		goto discard_and_relse;
 
@@ -1501,6 +1502,7 @@ process:
 		goto discard_and_relse;
 	th = (const struct tcphdr *)skb->data;
 	hdr = ipv6_hdr(skb);
+	tcp_v6_fill_cb(skb, hdr, th);
 
 	skb->dev = NULL;
 
-- 
cgit v1.2.3-59-g8ed1b