tipc: fix changeover issues due to large packet

In conjunction with changing the interfaces' MTU (e.g. especially in the case of a bonding) where the TIPC links are brought up and down in a short time, a couple of issues were detected with the current link changeover mechanism: 1) When one link is up but immediately forced down again, the failover procedure will be carried out in order to failover all the messages in the link's transmq queue onto the other working link. The link and node state is also set to FAILINGOVER as part of the process. The message will be transmited in form of a FAILOVER_MSG, so its size is plus of 40 bytes (= the message header size). There is no problem if the original message size is not larger than the link's MTU - 40, and indeed this is the max size of a normal payload messages. However, in the situation above, because the link has just been up, the messages in the link's transmq are almost SYNCH_MSGs which had been generated by the link synching procedure, then their size might reach the max value already! When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size will exceed the link's MTU. As a result, the messages are dropped silently and the failover procedure will never end up, the link will not be able to exit the FAILINGOVER state, so cannot be re-established. 2) The same scenario above can happen more easily in case the MTU of the links is set differently or when changing. In that case, as long as a large message in the failure link's transmq queue was built and fragmented with its link's MTU > the other link's one, the issue will happen (there is no need of a link synching in advance). 3) The link synching procedure also faces with the same issue but since the link synching is only started upon receipt of a SYNCH_MSG, dropping the message will not result in a state deadlock, but it is not expected as design. The 1) & 3) issues are resolved by the last commit that only a dummy SYNCH_MSG (i.e. without data) is generated at the link synching, so the size of a FAILOVER_MSG if any then will never exceed the link's MTU. For the 2) issue, the only solution is trying to fragment the messages in the failure link's transmq queue according to the working link's MTU so they can be failovered then. A new function is made to accomplish this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the original message size is too large, it will be fragmented & reassembled at the receiving side. Acked-by: Ying Xue <ying.xue@windriver.com> Acked-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Tuong Lien <tuong.t.lien@dektech.com.au> 2019-07-24 08:56:12 +0700
committer: David S. Miller <davem@davemloft.net> 2019-07-25 15:55:47 -0700
commit: 2320bcdae62887555701ea78a46b640ff6b63868 (patch)
tree: 9823f2ab5b41438842517ae8647904c9aa7b943b /net/tipc/link.c
parent: tipc: optimize link synching mechanism (diff)
download: linux-dev-2320bcdae62887555701ea78a46b640ff6b63868.tar.xz
linux-dev-2320bcdae62887555701ea78a46b640ff6b63868.zip
1 files changed, 79 insertions, 14 deletions
diff --git a/net/tipc/link.c b/net/tipc/link.c
index e215b4ba6a4b..2c274777b2dd 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -180,6 +180,7 @@ struct tipc_link {
 
 	/* Fragmentation/reassembly */
 	struct sk_buff *reasm_buf;
+	struct sk_buff *reasm_tnlmsg;
 
 	/* Broadcast */
 	u16 ackers;
@@ -897,8 +898,10 @@ void tipc_link_reset(struct tipc_link *l)
 	l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0;
 	l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0;
 	kfree_skb(l->reasm_buf);
+	kfree_skb(l->reasm_tnlmsg);
 	kfree_skb(l->failover_reasm_skb);
 	l->reasm_buf = NULL;
+	l->reasm_tnlmsg = NULL;
 	l->failover_reasm_skb = NULL;
 	l->rcv_unacked = 0;
 	l->snd_nxt = 1;
@@ -940,6 +943,9 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 	int rc = 0;
 
 	if (unlikely(msg_size(hdr) > mtu)) {
+		pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n",
+			skb_queue_len(list), msg_user(hdr),
+			msg_type(hdr), msg_size(hdr), mtu);
 		skb_queue_purge(list);
 		return -EMSGSIZE;
 	}
@@ -1233,6 +1239,7 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb,
 			     struct sk_buff_head *inputq)
 {
 	struct sk_buff **reasm_skb = &l->failover_reasm_skb;
+	struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg;
 	struct sk_buff_head *fdefq = &l->failover_deferdq;
 	struct tipc_msg *hdr = buf_msg(skb);
 	struct sk_buff *iskb;
@@ -1240,40 +1247,56 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb,
 	int rc = 0;
 	u16 seqno;
 
-	/* SYNCH_MSG */
-	if (msg_type(hdr) == SYNCH_MSG)
-		goto drop;
+	if (msg_type(hdr) == SYNCH_MSG) {
+		kfree_skb(skb);
+		return 0;
+	}
 
-	/* FAILOVER_MSG */
-	if (!tipc_msg_extract(skb, &iskb, &ipos)) {
-		pr_warn_ratelimited("Cannot extract FAILOVER_MSG, defq: %d\n",
-				    skb_queue_len(fdefq));
-		return rc;
+	/* Not a fragment? */
+	if (likely(!msg_nof_fragms(hdr))) {
+		if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) {
+			pr_warn_ratelimited("Unable to extract msg, defq: %d\n",
+					    skb_queue_len(fdefq));
+			return 0;
+		}
+		kfree_skb(skb);
+	} else {
+		/* Set fragment type for buf_append */
+		if (msg_fragm_no(hdr) == 1)
+			msg_set_type(hdr, FIRST_FRAGMENT);
+		else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr))
+			msg_set_type(hdr, FRAGMENT);
+		else
+			msg_set_type(hdr, LAST_FRAGMENT);
+
+		if (!tipc_buf_append(reasm_tnlmsg, &skb)) {
+			/* Successful but non-complete reassembly? */
+			if (*reasm_tnlmsg || link_is_bc_rcvlink(l))
+				return 0;
+			pr_warn_ratelimited("Unable to reassemble tunnel msg\n");
+			return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+		}
+		iskb = skb;
 	}
 
 	do {
 		seqno = buf_seqno(iskb);
-
 		if (unlikely(less(seqno, l->drop_point))) {
 			kfree_skb(iskb);
 			continue;
 		}
-
 		if (unlikely(seqno != l->drop_point)) {
 			__tipc_skb_queue_sorted(fdefq, seqno, iskb);
 			continue;
 		}
 
 		l->drop_point++;
-
 		if (!tipc_data_input(l, iskb, inputq))
 			rc |= tipc_link_input(l, iskb, inputq, reasm_skb);
 		if (unlikely(rc))
 			break;
 	} while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point)));
 
-drop:
-	kfree_skb(skb);
 	return rc;
 }
 
@@ -1663,15 +1686,18 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
 	struct sk_buff *skb, *tnlskb;
 	struct tipc_msg *hdr, tnlhdr;
 	struct sk_buff_head *queue = &l->transmq;
-	struct sk_buff_head tmpxq, tnlq;
+	struct sk_buff_head tmpxq, tnlq, frags;
 	u16 pktlen, pktcnt, seqno = l->snd_nxt;
+	bool pktcnt_need_update = false;
 	u16 syncpt;
+	int rc;
 
 	if (!tnl)
 		return;
 
 	skb_queue_head_init(&tnlq);
 	skb_queue_head_init(&tmpxq);
+	skb_queue_head_init(&frags);
 
 	/* At least one packet required for safe algorithm => add dummy */
 	skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
@@ -1727,6 +1753,39 @@ tnl:
 		if (queue == &l->backlogq)
 			msg_set_seqno(hdr, seqno++);
 		pktlen = msg_size(hdr);
+
+		/* Tunnel link MTU is not large enough? This could be
+		 * due to:
+		 * 1) Link MTU has just changed or set differently;
+		 * 2) Or FAILOVER on the top of a SYNCH message
+		 *
+		 * The 2nd case should not happen if peer supports
+		 * TIPC_TUNNEL_ENHANCED
+		 */
+		if (pktlen > tnl->mtu - INT_H_SIZE) {
+			if (mtyp == FAILOVER_MSG &&
+			    (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) {
+				rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu,
+						       &frags);
+				if (rc) {
+					pr_warn("%sunable to frag msg: rc %d\n",
+						link_co_err, rc);
+					return;
+				}
+				pktcnt += skb_queue_len(&frags) - 1;
+				pktcnt_need_update = true;
+				skb_queue_splice_tail_init(&frags, &tnlq);
+				continue;
+			}
+			/* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED
+			 * => Just warn it and return!
+			 */
+			pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n",
+					    link_co_err, msg_user(hdr),
+					    msg_type(hdr), msg_size(hdr));
+			return;
+		}
+
 		msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
 		tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
 		if (!tnlskb) {
@@ -1742,6 +1801,12 @@ tnl:
 		goto tnl;
 	}
 
+	if (pktcnt_need_update)
+		skb_queue_walk(&tnlq, skb) {
+			hdr = buf_msg(skb);
+			msg_set_msgcnt(hdr, pktcnt);
+		}
+
 	tipc_link_xmit(tnl, &tnlq, xmitq);
 
 	if (mtyp == FAILOVER_MSG) {
author	Tuong Lien <tuong.t.lien@dektech.com.au>	2019-07-24 08:56:12 +0700
committer	David S. Miller <davem@davemloft.net>	2019-07-25 15:55:47 -0700
commit	2320bcdae62887555701ea78a46b640ff6b63868 (patch)
tree	9823f2ab5b41438842517ae8647904c9aa7b943b /net/tipc/link.c
parent	tipc: optimize link synching mechanism (diff)
download	linux-dev-2320bcdae62887555701ea78a46b640ff6b63868.tar.xz linux-dev-2320bcdae62887555701ea78a46b640ff6b63868.zip