From f21e897eccb5a236f4191ecc1b4391eda895d6ed Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:17 -0400 Subject: tipc: improve link congestion algorithm The link congestion algorithm used until now implies two problems. - It is too generous towards lower-level messages in situations of high load by giving "absolute" bandwidth guarantees to the different priority levels. LOW traffic is guaranteed 10%, MEDIUM is guaranted 20%, HIGH is guaranteed 30%, and CRITICAL is guaranteed 40% of the available bandwidth. But, in the absence of higher level traffic, the ratio between two distinct levels becomes unreasonable. E.g. if there is only LOW and MEDIUM traffic on a system, the former is guaranteed 1/3 of the bandwidth, and the latter 2/3. This again means that if there is e.g. one LOW user and 10 MEDIUM users, the former will have 33.3% of the bandwidth, and the others will have to compete for the remainder, i.e. each will end up with 6.7% of the capacity. - Packets of type MSG_BUNDLER are created at SYSTEM importance level, but only after the packets bundled into it have passed the congestion test for their own respective levels. Since bundled packets don't result in incrementing the level counter for their own importance, only occasionally for the SYSTEM level counter, they do in practice obtain SYSTEM level importance. Hence, the current implementation provides a gap in the congestion algorithm that in the worst case may lead to a link reset. We now refine the congestion algorithm as follows: - A message is accepted to the link backlog only if its own level counter, and all superior level counters, permit it. - The importance of a created bundle packet is set according to its contents. A bundle packet created from messges at levels LOW to CRITICAL is given importance level CRITICAL, while a bundle created from a SYSTEM level message is given importance SYSTEM. In the latter case only subsequent SYSTEM level messages are allowed to be bundled into it. This solves the first problem described above, by making the bandwidth guarantee relative to the total number of users at all levels; only the upper limit for each level remains absolute. In the example described above, the single LOW user would use 1/11th of the bandwidth, the same as each of the ten MEDIUM users, but he still has the same guarantee against starvation as the latter ones. The fix also solves the second problem. If the CRITICAL level is filled up by bundle packets of that level, no lower level packets will be accepted any more. Suggested-by: Gergely Kiss Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/tipc/msg.c') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c3e96e815418..ff7362d40cb3 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -365,6 +365,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) return false; if (unlikely(max < (start + msz))) return false; + if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && + (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + return false; skb_put(bskb, pad + msz); skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); @@ -448,6 +451,10 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) bmsg = buf_msg(bskb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); + if (msg_isdata(msg)) + msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); + else + msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); -- cgit v1.2.3-59-g8ed1b From dd3f9e70f59f43a5712eba9cf3ee4f1e6999540c Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:18 -0400 Subject: tipc: add packet sequence number at instant of transmission Currently, the packet sequence number is updated and added to each packet at the moment a packet is added to the link backlog queue. This is wasteful, since it forces the code to traverse the send packet list packet by packet when adding them to the backlog queue. It would be better to just splice the whole packet list into the backlog queue when that is the right action to do. In this commit, we do this change. Also, since the sequence numbers cannot now be assigned to the packets at the moment they are added the backlog queue, we do instead calculate and add them at the moment of transmission, when the backlog queue has to be traversed anyway. We do this in the function tipc_link_push_packet(). Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 6 +----- net/tipc/link.c | 37 ++++++++++++++++++++++++++++--------- net/tipc/msg.c | 44 +++++++++++++++++++++----------------------- net/tipc/msg.h | 6 +++--- net/tipc/node.c | 2 +- 5 files changed, 54 insertions(+), 41 deletions(-) (limited to 'net/tipc/msg.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 842e19f6abf6..4906ca3c0f3a 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -115,12 +115,8 @@ static void bclink_set_last_sent(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *bcl = tn->bcl; - struct sk_buff *skb = skb_peek(&bcl->backlogq); - if (skb) - bcl->silent_intv_cnt = mod(buf_seqno(skb) - 1); - else - bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); + bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); } u32 tipc_bclink_get_last_sent(struct net *net) diff --git a/net/tipc/link.c b/net/tipc/link.c index c1aba697776f..fb2a003c8e6d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -653,7 +653,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct tipc_media_addr *addr = &link->media_addr; struct sk_buff_head *transmq = &link->transmq; struct sk_buff_head *backlogq = &link->backlogq; - struct sk_buff *skb, *tmp; + struct sk_buff *skb, *bskb; /* Match msg importance against this and all higher backlog limits: */ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { @@ -665,32 +665,36 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, return -EMSGSIZE; } /* Prepare each packet for sending, and add to relevant queue: */ - skb_queue_walk_safe(list, skb, tmp) { - __skb_unlink(skb, list); + while (skb_queue_len(list)) { + skb = skb_peek(list); msg = buf_msg(skb); msg_set_seqno(msg, seqno); msg_set_ack(msg, ack); msg_set_bcast_ack(msg, bc_last_in); if (likely(skb_queue_len(transmq) < maxwin)) { + __skb_dequeue(list); __skb_queue_tail(transmq, skb); tipc_bearer_send(net, link->bearer_id, skb, addr); link->rcv_unacked = 0; seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) { + if (tipc_msg_bundle(skb_peek_tail(backlogq), msg, mtu)) { + kfree_skb(__skb_dequeue(list)); link->stats.sent_bundled++; continue; } - if (tipc_msg_make_bundle(&skb, mtu, link->addr)) { + if (tipc_msg_make_bundle(&bskb, msg, mtu, link->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + link->backlog[msg_importance(buf_msg(bskb))].len++; link->stats.sent_bundled++; link->stats.sent_bundles++; - imp = msg_importance(buf_msg(skb)); + continue; } - __skb_queue_tail(backlogq, skb); - link->backlog[imp].len++; - seqno++; + link->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, backlogq); } link->snd_nxt = seqno; return 0; @@ -822,6 +826,7 @@ void tipc_link_push_packets(struct tipc_link *link) { struct sk_buff *skb; struct tipc_msg *msg; + u16 seqno = link->snd_nxt; u16 ack = mod(link->rcv_nxt - 1); while (skb_queue_len(&link->transmq) < link->window) { @@ -831,12 +836,15 @@ void tipc_link_push_packets(struct tipc_link *link) msg = buf_msg(skb); link->backlog[msg_importance(msg)].len--; msg_set_ack(msg, ack); + msg_set_seqno(msg, seqno); + seqno = mod(seqno + 1); msg_set_bcast_ack(msg, link->owner->bclink.last_in); link->rcv_unacked = 0; __skb_queue_tail(&link->transmq, skb); tipc_bearer_send(link->owner->net, link->bearer_id, skb, &link->media_addr); } + link->snd_nxt = seqno; } void tipc_link_reset_all(struct tipc_node *node) @@ -1526,6 +1534,11 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); + + skb_queue_walk(&l_ptr->backlogq, skb) { + msg_set_seqno(buf_msg(skb), l_ptr->snd_nxt); + l_ptr->snd_nxt = mod(l_ptr->snd_nxt + 1); + } skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); tipc_link_purge_backlog(l_ptr); msgcount = skb_queue_len(&l_ptr->transmq); @@ -1586,6 +1599,7 @@ void tipc_link_dup_queue_xmit(struct tipc_link *link, struct tipc_msg tnl_hdr; struct sk_buff_head *queue = &link->transmq; int mcnt; + u16 seqno; tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, SYNCH_MSG, INT_H_SIZE, link->addr); @@ -1617,6 +1631,11 @@ tunnel_queue: } if (queue == &link->backlogq) return; + seqno = link->snd_nxt; + skb_queue_walk(&link->backlogq, skb) { + msg_set_seqno(buf_msg(skb), seqno); + seqno = mod(seqno + 1); + } queue = &link->backlogq; goto tunnel_queue; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ff7362d40cb3..08b4cc7d496d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -331,16 +331,15 @@ error: /** * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @bskb: the buffer to append to ("bundle") - * @skb: buffer to be appended + * @skb: the buffer to append to ("bundle") + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer * Consumes buffer if successful * Returns true if bundling could be performed, otherwise false */ -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) { struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(skb); unsigned int bsz; unsigned int msz = msg_size(msg); u32 start, pad; @@ -348,9 +347,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) if (likely(msg_user(msg) == MSG_FRAGMENTER)) return false; - if (!bskb) + if (!skb) return false; - bmsg = buf_msg(bskb); + bmsg = buf_msg(skb); bsz = msg_size(bmsg); start = align(bsz); pad = start - bsz; @@ -359,9 +358,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; - if (likely(msg_user(bmsg) != MSG_BUNDLER)) + if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) return false; - if (unlikely(skb_tailroom(bskb) < (pad + msz))) + if (unlikely(skb_tailroom(skb) < (pad + msz))) return false; if (unlikely(max < (start + msz))) return false; @@ -369,11 +368,10 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) return false; - skb_put(bskb, pad + msz); - skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); + skb_put(skb, pad + msz); + skb_copy_to_linear_data_offset(skb, start, msg, msz); msg_set_size(bmsg, start + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); - kfree_skb(skb); return true; } @@ -419,18 +417,18 @@ none: /** * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain - * @skb: buffer to be appended and replaced + * @list: the buffer chain, where head is the buffer to replace/append + * @skb: buffer to be created, appended to and returned in case of success + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer, inclusive header * @dnode: destination node for message. (Not always present in header) - * Replaces buffer if successful * Returns true if success, otherwise false */ -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode) { - struct sk_buff *bskb; + struct sk_buff *_skb; struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(*skb); u32 msz = msg_size(msg); u32 max = mtu - INT_H_SIZE; @@ -443,12 +441,12 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) if (msz > (max / 2)) return false; - bskb = tipc_buf_acquire(max); - if (!bskb) + _skb = tipc_buf_acquire(max); + if (!_skb) return false; - skb_trim(bskb, INT_H_SIZE); - bmsg = buf_msg(bskb); + skb_trim(_skb, INT_H_SIZE); + bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); if (msg_isdata(msg)) @@ -458,8 +456,8 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(bskb, *skb, mtu); - *skb = bskb; + tipc_msg_bundle(_skb, msg, mtu); + *skb = _skb; return true; } diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 6caf16c475e0..19c45fb66238 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -776,9 +776,9 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu); - -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode); +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); diff --git a/net/tipc/node.c b/net/tipc/node.c index eb3856bb8c5a..0b1d61a5f853 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2014, Ericsson AB + * Copyright (c) 2000-2006, 2012-2015, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * -- cgit v1.2.3-59-g8ed1b