From 61e84623ace35ce48975e8f90bbbac7557c43d61 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 7 Oct 2016 22:04:33 -0400 Subject: net: centralize net_device min/max MTU checking While looking into an MTU issue with sfc, I started noticing that almost every NIC driver with an ndo_change_mtu function implemented almost exactly the same range checks, and in many cases, that was the only practical thing their ndo_change_mtu function was doing. Quite a few drivers have either 68, 64, 60 or 46 as their minimum MTU value checked, and then various sizes from 1500 to 65535 for their maximum MTU value. We can remove a whole lot of redundant code here if we simply store min_mtu and max_mtu in net_device, and check against those in net/core/dev.c's dev_set_mtu(). In theory, there should be zero functional change with this patch, it just puts the infrastructure in place. Subsequent patches will attempt to start using said infrastructure, with theoretically zero change in functionality. CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 136ae6bbe81e..fbdf923af4d3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1506,6 +1506,8 @@ enum netdev_priv_flags { * @if_port: Selectable AUI, TP, ... * @dma: DMA channel * @mtu: Interface MTU value + * @min_mtu: Interface Minimum MTU value + * @max_mtu: Interface Maximum MTU value * @type: Interface hardware type * @hard_header_len: Maximum hardware header length. 
* @@ -1726,6 +1728,8 @@ struct net_device { unsigned char dma; unsigned int mtu; + unsigned int min_mtu; + unsigned int max_mtu; unsigned short type; unsigned short hard_header_len; -- cgit v1.3-7-g2ca7 From cf53b1da73bdf940f1523ec5a7d375d7056c759c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 11 Oct 2016 13:04:09 -0700 Subject: Revert "net: Add driver helper functions to determine checksum offloadability" This reverts commit 6ae23ad36253a8033c5714c52b691b84456487c5. The code has been in kernel since 4.4 but there are no in tree code that uses. Unused code is broken code, remove it. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 78 -------------------------- net/core/dev.c | 136 ---------------------------------------------- 2 files changed, 214 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fbdf923af4d3..bf341b65ca5e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2653,71 +2653,6 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, remcsum_unadjust((__sum16 *)ptr, grc->delta); } -struct skb_csum_offl_spec { - __u16 ipv4_okay:1, - ipv6_okay:1, - encap_okay:1, - ip_options_okay:1, - ext_hdrs_okay:1, - tcp_okay:1, - udp_okay:1, - sctp_okay:1, - vlan_okay:1, - no_encapped_ipv6:1, - no_not_encapped:1; -}; - -bool __skb_csum_offload_chk(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec, - bool *csum_encapped, - bool csum_help); - -static inline bool skb_csum_offload_chk(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec, - bool *csum_encapped, - bool csum_help) -{ - if (skb->ip_summed != CHECKSUM_PARTIAL) - return false; - - return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help); -} - -static inline bool skb_csum_offload_chk_help(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec) -{ - bool csum_encapped; - - return skb_csum_offload_chk(skb, spec, 
&csum_encapped, true); -} - -static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb) -{ - static const struct skb_csum_offl_spec csum_offl_spec = { - .ipv4_okay = 1, - .ip_options_okay = 1, - .ipv6_okay = 1, - .vlan_okay = 1, - .tcp_okay = 1, - .udp_okay = 1, - }; - - return skb_csum_offload_chk_help(skb, &csum_offl_spec); -} - -static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb) -{ - static const struct skb_csum_offl_spec csum_offl_spec = { - .ipv4_okay = 1, - .ip_options_okay = 1, - .tcp_okay = 1, - .udp_okay = 1, - .vlan_okay = 1, - }; - - return skb_csum_offload_chk_help(skb, &csum_offl_spec); -} - static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -3961,19 +3896,6 @@ static inline bool can_checksum_protocol(netdev_features_t features, } } -/* Map an ethertype into IP protocol if possible */ -static inline int eproto_to_ipproto(int eproto) -{ - switch (eproto) { - case htons(ETH_P_IP): - return IPPROTO_IP; - case htons(ETH_P_IPV6): - return IPPROTO_IPV6; - default: - return -1; - } -} - #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev); #else diff --git a/net/core/dev.c b/net/core/dev.c index f376639e8774..6498cc2ba8f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -139,7 +139,6 @@ #include #include #include -#include #include #include "net-sysfs.h" @@ -2492,141 +2491,6 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/* skb_csum_offload_check - Driver helper function to determine if a device - * with limited checksum offload capabilities is able to offload the checksum - * for a given packet. - * - * Arguments: - * skb - sk_buff for the packet in question - * spec - contains the description of what device can offload - * csum_encapped - returns true if the checksum being offloaded is - * encpasulated. That is it is checksum for the transport header - * in the inner headers. 
- * checksum_help - when set indicates that helper function should - * call skb_checksum_help if offload checks fail - * - * Returns: - * true: Packet has passed the checksum checks and should be offloadable to - * the device (a driver may still need to check for additional - * restrictions of its device) - * false: Checksum is not offloadable. If checksum_help was set then - * skb_checksum_help was called to resolve checksum for non-GSO - * packets and when IP protocol is not SCTP - */ -bool __skb_csum_offload_chk(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec, - bool *csum_encapped, - bool csum_help) -{ - struct iphdr *iph; - struct ipv6hdr *ipv6; - void *nhdr; - int protocol; - u8 ip_proto; - - if (skb->protocol == htons(ETH_P_8021Q) || - skb->protocol == htons(ETH_P_8021AD)) { - if (!spec->vlan_okay) - goto need_help; - } - - /* We check whether the checksum refers to a transport layer checksum in - * the outermost header or an encapsulated transport layer checksum that - * corresponds to the inner headers of the skb. If the checksum is for - * something else in the packet we need help. 
- */ - if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { - /* Non-encapsulated checksum */ - protocol = eproto_to_ipproto(vlan_get_protocol(skb)); - nhdr = skb_network_header(skb); - *csum_encapped = false; - if (spec->no_not_encapped) - goto need_help; - } else if (skb->encapsulation && spec->encap_okay && - skb_checksum_start_offset(skb) == - skb_inner_transport_offset(skb)) { - /* Encapsulated checksum */ - *csum_encapped = true; - switch (skb->inner_protocol_type) { - case ENCAP_TYPE_ETHER: - protocol = eproto_to_ipproto(skb->inner_protocol); - break; - case ENCAP_TYPE_IPPROTO: - protocol = skb->inner_protocol; - break; - } - nhdr = skb_inner_network_header(skb); - } else { - goto need_help; - } - - switch (protocol) { - case IPPROTO_IP: - if (!spec->ipv4_okay) - goto need_help; - iph = nhdr; - ip_proto = iph->protocol; - if (iph->ihl != 5 && !spec->ip_options_okay) - goto need_help; - break; - case IPPROTO_IPV6: - if (!spec->ipv6_okay) - goto need_help; - if (spec->no_encapped_ipv6 && *csum_encapped) - goto need_help; - ipv6 = nhdr; - nhdr += sizeof(*ipv6); - ip_proto = ipv6->nexthdr; - break; - default: - goto need_help; - } - -ip_proto_again: - switch (ip_proto) { - case IPPROTO_TCP: - if (!spec->tcp_okay || - skb->csum_offset != offsetof(struct tcphdr, check)) - goto need_help; - break; - case IPPROTO_UDP: - if (!spec->udp_okay || - skb->csum_offset != offsetof(struct udphdr, check)) - goto need_help; - break; - case IPPROTO_SCTP: - if (!spec->sctp_okay || - skb->csum_offset != offsetof(struct sctphdr, checksum)) - goto cant_help; - break; - case NEXTHDR_HOP: - case NEXTHDR_ROUTING: - case NEXTHDR_DEST: { - u8 *opthdr = nhdr; - - if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) - goto need_help; - - ip_proto = opthdr[0]; - nhdr += (opthdr[1] + 1) << 3; - - goto ip_proto_again; - } - default: - goto need_help; - } - - /* Passed the tests for offloading checksum */ - return true; - -need_help: - if (csum_help && 
!skb_shinfo(skb)->gso_size) - skb_checksum_help(skb); -cant_help: - return false; -} -EXPORT_SYMBOL(__skb_csum_offload_chk); - __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; -- cgit v1.3-7-g2ca7 From 1a3f060c1a47dba4e12ac21ce62b57666b9c4e95 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Oct 2016 19:15:44 -0700 Subject: net: Introduce new api for walking upper and lower devices This patch introduces netdev_walk_all_upper_dev_rcu, netdev_walk_all_lower_dev and netdev_walk_all_lower_dev_rcu. These functions recursively walk the adj_list of devices to determine all upper and lower devices. The functions take a callback function that is invoked for each device in the list. If the callback returns non-0, the walk is terminated and the functions return that code back to callers. v3 - simplified netdev_has_upper_dev_all_rcu and __netdev_has_upper_dev and removed typecast as suggested by Stephen v2 - fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev version. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 17 +++++ net/core/dev.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf341b65ca5e..a5902d995907 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3778,6 +3778,14 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, updev; \ updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *upper_dev, + void *data), + void *data); + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, @@ -3821,6 +3829,15 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, ldev; \ ldev = netdev_all_lower_get_next_rcu(dev, &(iter))) +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *lower_dev, + void *data), + void *data); +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *lower_dev, + void *data), + void *data); + void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index f67fd16615bb..fc48337cfab8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5155,6 +5155,31 @@ bool netdev_has_upper_dev(struct net_device *dev, } EXPORT_SYMBOL(netdev_has_upper_dev); +/** + * netdev_has_upper_dev_all - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. 
Note that this checks the entire upper device chain. + * The caller must hold rcu lock. + */ + +static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +{ + struct net_device *dev = data; + + return upper_dev == dev; +} + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device @@ -5255,6 +5280,51 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *udev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.upper, + udev = netdev_next_upper_dev_rcu(dev, &iter); + udev; + udev = netdev_next_upper_dev_rcu(dev, &iter)) { + /* first is the upper device itself */ + ret = fn(udev, data); + if (ret) + return ret; + + /* then look at all of its upper devices */ + ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); + /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list @@ -5361,6 +5431,49 @@ struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list } EXPORT_SYMBOL(netdev_all_lower_get_next); +static struct 
net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} + +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev(dev, &iter); + ldev; + ldev = netdev_next_lower_dev(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev(ldev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + /** * netdev_all_lower_get_next_rcu - Get the next device from all * lower neighbour list, RCU variant @@ -5382,6 +5495,48 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); +static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev_rcu(dev, &iter); + ldev; + ldev = netdev_next_lower_dev_rcu(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev_rcu(ldev, 
fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); + /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU -- cgit v1.3-7-g2ca7 From f1170fd462c67c4ae2f20734566d94e0f8f62f69 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Oct 2016 19:15:51 -0700 Subject: net: Remove all_adj_list and its references Only direct adjacencies are maintained. All upper or lower devices can be learned via the new walk API which recursively walks the adj_list for upper devices or lower devices. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 25 ------ net/core/dev.c | 223 ++++------------------------------------------ 2 files changed, 18 insertions(+), 230 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a5902d995907..458c87631e7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,7 +1456,6 @@ enum netdev_priv_flags { * @ptype_specific: Device-specific, protocol-specific packet handlers * * @adj_list: Directly linked devices, like slaves for bonding - * @all_adj_list: All linked devices, *including* neighbours * @features: Currently active device features * @hw_features: User-changeable features * @@ -1675,11 +1674,6 @@ struct net_device { struct list_head lower; } adj_list; - struct { - struct list_head upper; - struct list_head lower; - } all_adj_list; - netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; @@ -3771,13 +3765,6 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, updev; \ updev = netdev_upper_get_next_dev_rcu(dev, &(iter))) -/* iterate through upper list, must be called under RCU read lock */ -#define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \ - for (iter = &(dev)->all_adj_list.upper, \ - updev = netdev_all_upper_get_next_dev_rcu(dev, 
&(iter)); \ - updev; \ - updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) - int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *upper_dev, void *data), @@ -3817,18 +3804,6 @@ struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, struct list_head **iter); -#define netdev_for_each_all_lower_dev(dev, ldev, iter) \ - for (iter = (dev)->all_adj_list.lower.next, \ - ldev = netdev_all_lower_get_next(dev, &(iter)); \ - ldev; \ - ldev = netdev_all_lower_get_next(dev, &(iter))) - -#define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \ - for (iter = (dev)->all_adj_list.lower.next, \ - ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \ - ldev; \ - ldev = netdev_all_lower_get_next_rcu(dev, &(iter))) - int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, void *data), diff --git a/net/core/dev.c b/net/core/dev.c index fc48337cfab8..a9fe14908b44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5137,6 +5137,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } +static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +{ + struct net_device *dev = data; + + return upper_dev == dev; +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -5151,7 +5158,8 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); + return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -5165,13 +5173,6 @@ EXPORT_SYMBOL(netdev_has_upper_dev); * The caller must hold rcu lock. 
*/ -static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) -{ - struct net_device *dev = data; - - return upper_dev == dev; -} - bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { @@ -5191,7 +5192,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->all_adj_list.upper); + return !list_empty(&dev->adj_list.upper); } /** @@ -5254,32 +5255,6 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -/** - * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next device from the dev's upper list, starting from iter - * position. The caller must hold RCU read lock. - */ -struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) -{ - struct netdev_adjacent *upper; - - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - - upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - - if (&upper->list == &dev->all_adj_list.upper) - return NULL; - - *iter = &upper->list; - - return upper->dev; -} -EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); - static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -5406,31 +5381,6 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) } EXPORT_SYMBOL(netdev_lower_get_next); -/** - * netdev_all_lower_get_next - Get the next device from all lower neighbour list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RTNL lock or - * its own locking that guarantees that the neighbour all lower - * list will remain unchanged. 
- */ -struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) -{ - struct netdev_adjacent *lower; - - lower = list_entry(*iter, struct netdev_adjacent, list); - - if (&lower->list == &dev->all_adj_list.lower) - return NULL; - - *iter = lower->list.next; - - return lower->dev; -} -EXPORT_SYMBOL(netdev_all_lower_get_next); - static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { @@ -5474,27 +5424,6 @@ int netdev_walk_all_lower_dev(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); -/** - * netdev_all_lower_get_next_rcu - Get the next device from all - * lower neighbour list, RCU variant - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RCU read lock. - */ -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter) -{ - struct netdev_adjacent *lower; - - lower = list_first_or_null_rcu(&dev->all_adj_list.lower, - struct netdev_adjacent, list); - - return lower ? 
lower->dev : NULL; -} -EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); - static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -5722,15 +5651,6 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev, return 0; } -static int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) -{ - return __netdev_adjacent_dev_link_lists(dev, upper_dev, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower, - NULL, false); -} - static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, @@ -5741,40 +5661,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } -static void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev, - u16 ref_nr) -{ - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower); -} - static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { - int ret = __netdev_adjacent_dev_link(dev, upper_dev); - - if (ret) - return ret; - - ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, - &dev->adj_list.upper, - &upper_dev->adj_list.lower, - private, master); - if (ret) { - __netdev_adjacent_dev_unlink(dev, upper_dev, 1); - return ret; - } - - return 0; + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { - __netdev_adjacent_dev_unlink(dev, upper_dev, 1); __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); @@ -5785,7 +5684,6 @@ static int __netdev_upper_dev_link(struct net_device *dev, void *upper_priv, void *upper_info) { 
struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; ASSERT_RTNL(); @@ -5794,10 +5692,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) + if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) + if (netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -5819,80 +5717,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - /* Now that we linked these devs, make all the upper_dev's - * all_adj_list.upper visible to every dev's all_adj_list.lower an - * versa, and don't forget the devices itself. All of these - * links are non-neighbours. - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - pr_debug("Interlinking %s with %s, non-neighbour\n", - i->dev->name, j->dev->name); - ret = __netdev_adjacent_dev_link(i->dev, j->dev); - if (ret) - goto rollback_mesh; - } - } - - /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - pr_debug("linking %s's upper device %s with %s\n", - upper_dev->name, i->dev->name, dev->name); - ret = __netdev_adjacent_dev_link(dev, i->dev); - if (ret) - goto rollback_upper_mesh; - } - - /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - pr_debug("linking %s's lower device %s with %s\n", dev->name, - i->dev->name, upper_dev->name); - ret = __netdev_adjacent_dev_link(i->dev, upper_dev); - if (ret) - goto rollback_lower_mesh; - } - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) - goto rollback_lower_mesh; + goto 
rollback; return 0; -rollback_lower_mesh: - to_i = i; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); - } - - i = NULL; - -rollback_upper_mesh: - to_i = i; - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); - } - - i = j = NULL; - -rollback_mesh: - to_i = i; - to_j = j; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - if (i == to_i && j == to_j) - break; - __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); - } - if (i == to_i) - break; - } - +rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; @@ -5949,7 +5782,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j; ASSERT_RTNL(); changeupper_info.upper_dev = upper_dev; @@ -5961,23 +5793,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - /* Here is the tricky part. We must remove all dev's lower - * devices from all upper_dev's upper devices and vice - * versa, to maintain the graph relationship. 
- */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); - - /* remove also the devices itself from lower/upper device - * list - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); - - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); } @@ -7679,8 +7494,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); - INIT_LIST_HEAD(&dev->all_adj_list.upper); - INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); #ifdef CONFIG_NET_SCHED -- cgit v1.3-7-g2ca7 From 9cf1f6a8c4cbb7836b838b51b3b02ddf32c6c6a0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 28 Oct 2016 11:43:20 -0400 Subject: net: Move functions for configuring traffic classes out of inline headers The functions for configuring the traffic class to queue mappings have other effects that need to be addressed. Instead of trying to export a bunch of new functions just relocate the functions so that we can instrument them directly with the functionality they will need. Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 31 +++---------------------------- net/core/dev.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 28 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20ce8df115ac..e05ab3bd48d2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1920,34 +1920,9 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) return 0; } -static inline -void netdev_reset_tc(struct net_device *dev) -{ - dev->num_tc = 0; - memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); - memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); -} - -static inline -int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) -{ - if (tc >= dev->num_tc) - return -EINVAL; - - dev->tc_to_txq[tc].count = count; - dev->tc_to_txq[tc].offset = offset; - return 0; -} - -static inline -int netdev_set_num_tc(struct net_device *dev, u8 num_tc) -{ - if (num_tc > TC_MAX_QUEUE) - return -EINVAL; - - dev->num_tc = num_tc; - return 0; -} +void netdev_reset_tc(struct net_device *dev); +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); +int netdev_set_num_tc(struct net_device *dev, u8 num_tc); static inline int netdev_get_num_tc(struct net_device *dev) diff --git a/net/core/dev.c b/net/core/dev.c index 8341dadf5e94..2d54be912136 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2173,6 +2173,35 @@ error: EXPORT_SYMBOL(netif_set_xps_queue); #endif +void netdev_reset_tc(struct net_device *dev) +{ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); 
+ +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. -- cgit v1.3-7-g2ca7 From 8d059b0f6f5b1d3acf829454e1087818ad660058 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 28 Oct 2016 11:43:49 -0400 Subject: net: Add sysfs value to determine queue traffic class Add a sysfs attribute for a Tx queue that allows us to determine the traffic class for a given queue. This will allow us to more easily determine this in the future. It is needed as XPS will take the traffic class for a group of queues into account in order to avoid pulling traffic from one traffic class into another. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 17 +++++++++++++++++ net/core/net-sysfs.c | 20 +++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e05ab3bd48d2..d91a41860614 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1920,6 +1920,7 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) return 0; } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq); void netdev_reset_tc(struct net_device *dev); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); int netdev_set_num_tc(struct net_device *dev, u8 num_tc); diff --git a/net/core/dev.c b/net/core/dev.c index 2d54be912136..db0fdbbcd9b8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,6 +1948,23 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct 
netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + return -1; + } + + return 0; +} + #ifdef CONFIG_XPS static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d4fe28606ff5..38bd9b933195 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1024,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } -#ifdef CONFIG_XPS static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; @@ -1036,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } +static ssize_t show_traffic_class(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + struct net_device *dev = queue->dev; + int index = get_netdev_queue_index(queue); + int tc = netdev_txq_to_tc(dev, index); + + if (tc < 0) + return -EINVAL; + + return sprintf(buf, "%u\n", tc); +} + +#ifdef CONFIG_XPS static ssize_t show_tx_maxrate(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) @@ -1078,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate = static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); +static struct netdev_queue_attribute queue_traffic_class = + __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL); + #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. 
@@ -1263,6 +1280,7 @@ static struct netdev_queue_attribute xps_cpus_attribute = static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, + &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &queue_tx_maxrate.attr, -- cgit v1.3-7-g2ca7 From 184c449f91fef521042970cca46bd5cdfc0e3a37 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 28 Oct 2016 11:50:13 -0400 Subject: net: Add support for XPS with QoS via traffic classes This patch adds support for setting and using XPS when QoS via traffic classes is enabled. With this change we will factor in the priority and traffic class mapping of the packet and use that information to correctly select the queue. This allows us to define a set of queues for a given traffic class via mqprio and then configure the XPS mapping for those queues so that the traffic flows can avoid head-of-line blocking between the individual CPUs if so desired. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +- net/core/dev.c | 117 ++++++++++++++++++++++++++++++++-------------- net/core/net-sysfs.c | 31 +++++++----- 3 files changed, 105 insertions(+), 47 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d91a41860614..66fd61c681d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -732,8 +732,8 @@ struct xps_dev_maps { struct rcu_head rcu; struct xps_map __rcu *cpu_map[0]; }; -#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ - (nr_cpu_ids * sizeof(struct xps_map *))) +#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ + (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 diff --git a/net/core/dev.c b/net/core/dev.c index 108a6adce185..f23e28668f32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2002,14 +2002,22 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct 
xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int i, j; + int num_tc = dev->num_tc ? : 1; + bool active = false; + int tci; - for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, cpu, j)) - break; + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; + + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, cpu, j)) + break; + } + + active |= i < 0; } - return i < 0; + return active; } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, @@ -2086,20 +2094,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + int i, cpu, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); - int cpu, numa_node_id = -2; bool active = false; + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + + maps_sz = XPS_DEV_MAPS_SIZE(num_tc); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; + mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps); /* allocate memory for queue storage */ - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mask)) - continue; - + for_each_cpu_and(cpu, cpu_online_mask, mask) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2107,25 +2123,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + tci = cpu * num_tc + tc; + map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[tci]) : NULL; map = expand_xps_map(map, cpu, index); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; for_each_possible_cpu(cpu) { + /* copy maps belonging to foreign traffic classes */ + for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } + + /* We need to explicitly update tci as prevous loop + * could break out early if dev_maps is NULL. + */ + tci = cpu * num_tc + tc; + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { /* add queue to CPU maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(new_dev_maps->cpu_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2139,26 +2168,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[cpu]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } + /* copy maps belonging to foreign traffic classes */ + for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } } rcu_assign_pointer(dev->xps_maps, new_dev_maps); /* Cleanup old maps */ - if (dev_maps) { - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (!dev_maps) + goto out_no_old_maps; + + for_each_possible_cpu(cpu) { + for (i = num_tc, tci = cpu * num_tc; i--; 
tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->cpu_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } - - kfree_rcu(dev_maps, rcu); } + kfree_rcu(dev_maps, rcu); + +out_no_old_maps: dev_maps = new_dev_maps; active = true; @@ -2173,11 +2212,12 @@ out_no_new_maps: /* removes queue from unused CPUs */ for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) - continue; - - if (remove_xps_queue(dev_maps, cpu, index)) - active = true; + for (i = tc, tci = cpu * num_tc; i--; tci++) + active |= remove_xps_queue(dev_maps, tci, index); + if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + active |= remove_xps_queue(dev_maps, tci, index); + for (i = num_tc - tc, tci++; --i; tci++) + active |= remove_xps_queue(dev_maps, tci, index); } /* free map if not active */ @@ -2193,11 +2233,14 @@ out_no_maps: error: /* remove any maps that we added */ for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; - if (new_map && new_map != map) - kfree(new_map); + for (i = num_tc, tci = cpu * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = dev_maps ? 
+ xmap_dereference(dev_maps->cpu_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } } mutex_unlock(&xps_map_mutex); @@ -3158,8 +3201,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); + unsigned int tci = skb->sender_cpu - 1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->cpu_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 38bd9b933195..b0c04cf4851d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1210,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { struct net_device *dev = queue->dev; + int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; - int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - for_each_possible_cpu(i) { - struct xps_map *map = - rcu_dereference(dev_maps->cpu_map[i]); - if (map) { - int j; - for (j = 0; j < map->len; j++) { - if (map->queues[j] == index) { - cpumask_set_cpu(i, mask); - break; - } + for_each_possible_cpu(cpu) { + int i, tci = cpu * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->cpu_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + cpumask_set_cpu(cpu, mask); + break; } } } -- cgit v1.3-7-g2ca7 From 149d6ad83663b4820ca09c9d40b1eea7f5c22c2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 
Nov 2016 11:07:28 -0800 Subject: net: napi_hash_add() is no longer exported There are no more users except from net/core/dev.c napi_hash_add() can now be static. Signed-off-by: Eric Dumazet Cc: Michael Chan Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ----------- net/core/dev.c | 3 +-- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 66fd61c681d9..d64135a0ab71 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -467,17 +467,6 @@ static inline void napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_hash_add - add a NAPI to global hashtable - * @napi: NAPI context - * - * Generate a new napi_id and store a @napi under it in napi_hash. - * Used for busy polling (CONFIG_NET_RX_BUSY_POLL). - * Note: This is normally automatically done from netif_napi_add(), - * so might disappear in a future Linux version. 
- */ -void napi_hash_add(struct napi_struct *napi); - /** * napi_hash_del - remove a NAPI from global table * @napi: NAPI context diff --git a/net/core/dev.c b/net/core/dev.c index c9837fa08dfc..7385c1a152fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5017,7 +5017,7 @@ EXPORT_SYMBOL(sk_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ -void napi_hash_add(struct napi_struct *napi) +static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) @@ -5037,7 +5037,6 @@ void napi_hash_add(struct napi_struct *napi) spin_unlock(&napi_hash_lock); } -EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi -- cgit v1.3-7-g2ca7 From 217f6974368188fd8bd7804bf5a036aa5762c5e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:11 -0800 Subject: net: busy-poll: allow preemption in sk_busy_loop() After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"), sk_busy_loop() needs a bit of care : softirqs might be delayed since we do not allow preemption yet. This patch adds preemption points in sk_busy_loop(), and makes sure no unnecessary cache line dirtying or atomic operations are done while looping. A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED, so that sk_busy_loop() does not have to grab it again. Similarly, netpoll_poll_lock() is done one time. This gives about 10 to 20 % improvement in various busy polling tests, especially when many threads are busy polling in configurations with large number of NIC queues. This should allow experimenting with bigger delays without hurting overall latencies. Tested: On a 40Gb mlx4 NIC, 32 RX/TX queues. 
echo 70 >/proc/sys/net/core/busy_read for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done Before: After: 1: 90072 92819 2: 157289 184007 3: 235772 213504 4: 344074 357513 5: 394755 458267 6: 461151 487819 7: 549116 625963 8: 544423 716219 9: 720460 738446 10: 794686 837612 11: 915998 923960 12: 937507 925107 13: 1019677 971506 14: 1046831 1113650 15: 1114154 1148902 16: 1105221 1179263 17: 1266552 1299585 18: 1258454 1383817 19: 1341453 1312194 20: 1363557 1488487 21: 1387979 1501004 22: 1417552 1601683 23: 1550049 1642002 24: 1568876 1601915 25: 1560239 1683607 26: 1640207 1745211 27: 1706540 1723574 28: 1638518 1722036 29: 1734309 1757447 30: 1782007 1855436 31: 1724806 1888539 32: 1717716 1944297 33: 1778716 1869118 34: 1805738 1983466 35: 1815694 2020758 36: 1893059 2035632 37: 1843406 2034653 38: 1888830 2086580 39: 1972827 2143567 40: 1877729 2181851 Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 +++++ net/core/dev.c | 102 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 92 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 86bacf6a64f0..e71de66e3792 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -334,6 +334,16 @@ enum { NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ +}; + +enum { + NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), + NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), + NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), }; enum gro_result { diff --git a/net/core/dev.c b/net/core/dev.c index 6deba68ad9e4..369dcc8efc01 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + /* Some drivers call us directly, instead of calling + * napi_complete_done(). + */ + if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) + return; + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); @@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done) unsigned long flags; /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. 
*/ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) return; if (n->gro_list) { @@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) + #define BUSY_POLL_BUDGET 8 + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +{ + int rc; + + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, BUSY_POLL_BUDGET); + netpoll_poll_unlock(have_poll_lock); + if (rc == BUSY_POLL_BUDGET) + __napi_schedule(napi); + local_bh_enable(); + if (local_softirq_pending()) + do_softirq(); +} + bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); int (*busy_poll)(struct napi_struct *dev); + void *have_poll_lock = NULL; struct napi_struct *napi; - int rc = false; + int rc; + +restart: + rc = false; + napi_poll = NULL; rcu_read_lock(); @@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock) /* Note: ndo_busy_poll method is optional in linux-4.5 */ busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - do { + preempt_disable(); + for (;;) { rc = 0; local_bh_disable(); if (busy_poll) { rc = busy_poll(napi); - } else if (napi_schedule_prep(napi)) { - void *have = netpoll_poll_lock(napi); - - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { - rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); - if (rc == BUSY_POLL_BUDGET) { - napi_complete_done(napi, rc); - napi_schedule(napi); - } - } - netpoll_poll_unlock(have); + goto count; } + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying 
napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto count; + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto count; + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + rc = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); +count: if (rc > 0) __NET_ADD_STATS(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); @@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); + if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || + busy_loop_timeout(end_time)) + break; + if (unlikely(need_resched())) { + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rc = !skb_queue_empty(&sk->sk_receive_queue); + if (rc || busy_loop_timeout(end_time)) + return rc; + goto restart; + } + cpu_relax_lowlatency(); + } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); -- cgit v1.3-7-g2ca7 From 364b6055738b4c752c30ccaaf25c624e69d76195 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:13 -0800 Subject: net: busy-poll: return busypolling status to drivers NAPI drivers use napi_complete_done() or napi_complete() when they drained RX ring and right before re-enabling device interrupts. In busy polling, we can avoid interrupts being delivered since we are polling RX ring in a controlled loop. Drivers can choose to use napi_complete_done() return value to reduce interrupts overhead while busy polling is active. This is optional, legacy drivers should work fine even if not updated. 
Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- net/core/dev.c | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e71de66e3792..bcddf951ccee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -463,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi) return false; } -void __napi_complete(struct napi_struct *n); -void napi_complete_done(struct napi_struct *n, int work_done); +bool __napi_complete(struct napi_struct *n); +bool napi_complete_done(struct napi_struct *n, int work_done); /** * napi_complete - NAPI processing complete * @n: NAPI context * * Mark NAPI processing as complete. * Consider using napi_complete_done() instead. + * Return false if device should avoid rearming interrupts. */ -static inline void napi_complete(struct napi_struct *n) +static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 369dcc8efc01..edba9efeb2e9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4898,7 +4898,7 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -void __napi_complete(struct napi_struct *n) +bool __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); @@ -4906,15 +4906,16 @@ void __napi_complete(struct napi_struct *n) * napi_complete_done(). 
*/ if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) - return; + return false; list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + return true; } EXPORT_SYMBOL(__napi_complete); -void napi_complete_done(struct napi_struct *n, int work_done) +bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4926,7 +4927,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) - return; + return false; if (n->gro_list) { unsigned long timeout = 0; @@ -4948,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) __napi_complete(n); local_irq_restore(flags); } + return true; } EXPORT_SYMBOL(napi_complete_done); -- cgit v1.3-7-g2ca7 From 89c4b442b78bdba388337cc746fe63caba85f46c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Nov 2016 14:54:50 -0800 Subject: netpoll: more efficient locking Callers of netpoll_poll_lock() own NAPI_STATE_SCHED Callers of netpoll_poll_unlock() have BH blocked between the NAPI_STATE_SCHED being cleared and poll_lock is released. We can avoid the spinlock which has no contention, and use cmpxchg() on poll_owner which we need to set anyway. This removes a possible lockdep violation after the cited commit, since sk_busy_loop() re-enables BH before calling busy_poll_stop() Fixes: 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - include/linux/netpoll.h | 13 +++++++------ net/core/dev.c | 1 - net/core/netpoll.c | 6 +++--- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bcddf951ccee..e84800edd249 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -316,7 +316,6 @@ struct napi_struct { unsigned int gro_count; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL - spinlock_t poll_lock; int poll_owner; #endif struct net_device *dev; diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b25ee9ffdbe6..1828900c9411 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -78,8 +78,11 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi) struct net_device *dev = napi->dev; if (dev && dev->npinfo) { - spin_lock(&napi->poll_lock); - napi->poll_owner = smp_processor_id(); + int owner = smp_processor_id(); + + while (cmpxchg(&napi->poll_owner, -1, owner) != -1) + cpu_relax(); + return napi; } return NULL; @@ -89,10 +92,8 @@ static inline void netpoll_poll_unlock(void *have) { struct napi_struct *napi = have; - if (napi) { - napi->poll_owner = -1; - spin_unlock(&napi->poll_lock); - } + if (napi) + smp_store_release(&napi->poll_owner, -1); } static inline bool netpoll_tx_running(struct net_device *dev) diff --git a/net/core/dev.c b/net/core/dev.c index edba9efeb2e9..f71b34ab57a5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5143,7 +5143,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 53599bd0c82d..9424673009c1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -171,12 +171,12 @@ static void 
poll_one_napi(struct napi_struct *napi) static void poll_napi(struct net_device *dev) { struct napi_struct *napi; + int cpu = smp_processor_id(); list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && - spin_trylock(&napi->poll_lock)) { + if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); - spin_unlock(&napi->poll_lock); + smp_store_release(&napi->poll_owner, -1); } } } -- cgit v1.3-7-g2ca7 From 3df5b3c67546fb05266766b6abaf71563f82efe4 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 22 Nov 2016 23:09:54 +0200 Subject: net: Add net-device param to the get offloaded stats ndo Some drivers would need to check few internal matters for that. To be used in downstream mlx5 commit. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- include/linux/netdevice.h | 4 ++-- net/core/rtnetlink.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 4a1f9d5f7c03..e0d7d5adbaee 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -857,7 +857,7 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, return 0; } -static bool mlxsw_sp_port_has_offload_stats(int attr_id) +static bool mlxsw_sp_port_has_offload_stats(const struct net_device *dev, int attr_id) { switch (attr_id) { case IFLA_OFFLOAD_XSTATS_CPU_HIT: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e84800edd249..ae32a27523f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,7 +925,7 @@ struct netdev_xdp { * 3. Update dev->stats asynchronously and atomically, and define * neither operation. 
* - * bool (*ndo_has_offload_stats)(int attr_id) + * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id) * Return true if this device supports offload stats of this attr_id. * * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, @@ -1165,7 +1165,7 @@ struct net_device_ops { struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); - bool (*ndo_has_offload_stats)(int attr_id); + bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id); int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, void *attr_data); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a99917b5de33..ef8a96010816 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3671,7 +3671,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, if (!size) continue; - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; attr = nla_reserve_64bit(skb, attr_id, size, @@ -3712,7 +3712,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev) for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; size = rtnl_get_offload_stats_attr_size(attr_id); nla_size += nla_total_size_64bit(size); -- cgit v1.3-7-g2ca7 From 5a717f4f8f2830f297b5511022481bdc27b9d576 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 24 Nov 2016 07:04:08 +0200 Subject: netdevice: fix sparse warning for HARD_TX_LOCK sparse warns about context imbalance in any code that uses HARD_TX_LOCK/UNLOCK - this is because it's unable to determine that flags don't change so lock and unlock are paired. Seems easy enough to fix by adding __acquire/__release calls. 
With this patch af_packet.c is now sparse-clean. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ff57cd2eba3b..4ffcd874cc20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3462,6 +3462,17 @@ static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) txq->xmit_lock_owner = cpu; } +static inline bool __netif_tx_acquire(struct netdev_queue *txq) +{ + __acquire(&txq->_xmit_lock); + return true; +} + +static inline void __netif_tx_release(struct netdev_queue *txq) +{ + __release(&txq->_xmit_lock); +} + static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); @@ -3563,17 +3574,21 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) #define HARD_TX_LOCK(dev, txq, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ __netif_tx_lock(txq, cpu); \ + } else { \ + __netif_tx_acquire(txq); \ } \ } #define HARD_TX_TRYLOCK(dev, txq) \ (((dev->features & NETIF_F_LLTX) == 0) ? \ __netif_tx_trylock(txq) : \ - true ) + __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ __netif_tx_unlock(txq); \ + } else { \ + __netif_tx_release(txq); \ } \ } -- cgit v1.3-7-g2ca7 From 85de8576a0b14aecc99136cfbf90e367fa2142cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 28 Nov 2016 23:16:54 +0100 Subject: bpf, xdp: allow to pass flags to dev_change_xdp_fd Add an IFLA_XDP_FLAGS attribute that can be passed for setting up XDP along with IFLA_XDP_FD, which eventually allows user space to implement typical add/replace/delete logic for programs. Right now, calling into dev_change_xdp_fd() will always replace previous programs. 
When passed XDP_FLAGS_UPDATE_IF_NOEXIST, we can handle this more gracefully when requested by returning -EBUSY in case we try to attach a new program, but we find that another one is already attached. This will be used by upcoming front-end for iproute2 as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/uapi/linux/if_link.h | 4 ++++ net/core/dev.c | 20 ++++++++++++++++++-- net/core/rtnetlink.c | 14 +++++++++++++- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4ffcd874cc20..3755317cc6a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3253,7 +3253,7 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, int fd); +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 92b2d4928bf1..6b13e591abc9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -876,10 +876,14 @@ enum { /* XDP section */ +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) + enum { IFLA_XDP_UNSPEC, IFLA_XDP_FD, IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, __IFLA_XDP_MAX, }; diff --git a/net/core/dev.c b/net/core/dev.c index 048b46b7c92a..bffb5253e778 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6692,26 +6692,42 @@ EXPORT_SYMBOL(dev_change_proto_down); * dev_change_xdp_fd - set or clear a bpf program for a device rx path * 
@dev: device * @fd: new program fd or negative value to clear + * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd) +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp = {}; + struct netdev_xdp xdp; int err; + ASSERT_RTNL(); + if (!ops->ndo_xdp) return -EOPNOTSUPP; if (fd >= 0) { + if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + err = ops->ndo_xdp(dev, &xdp); + if (err < 0) + return err; + if (xdp.prog_attached) + return -EBUSY; + } + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } + memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4e60525ea586..bd85570e6e4b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1505,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -2164,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy); @@ -2174,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + } + if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, 
- nla_get_s32(xdp[IFLA_XDP_FD])); + nla_get_s32(xdp[IFLA_XDP_FD]), + xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.3-7-g2ca7 From 7091d8c7055d7310339435ae3af2fb490a92524d Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 1 Dec 2016 14:06:37 +0200 Subject: net/sched: cls_flower: Add offload support using egress Hardware device In order to support hardware offloading when the device given by the tc rule is different from the underlying hardware device, extract the mirred (egress) device from the tc action when a filter is added, using the new tc_action_ops, get_dev(). Flower caches the information about the mirred device and uses it for calling ndo_setup_tc in filter change, update stats and delete. Calling ndo_setup_tc of the mirred (egress) device instead of the ingress device will allow a resolution between the software ingress device and the underlying hardware device. The resolution will take place inside the offloading driver using the 'egress_dev' flag added to the tc_to_netdev struct, which is provided to the offloading driver. Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 2 ++ net/sched/cls_api.c | 24 ++++++++++++++++++++++++ net/sched/cls_flower.c | 41 ++++++++++++++++++++++++----------------- 4 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3755317cc6a9..1ff5ea6e1221 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -802,6 +802,7 @@ struct tc_to_netdev { struct tc_cls_matchall_offload *cls_mall; struct tc_cls_bpf_offload *cls_bpf; }; + bool egress_dev; }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 45ad9aab9bba..f0a051480c6c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); +int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, + struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b05d4a2155b0..3fbba79a4ef0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -682,6 +682,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); +int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, + struct net_device **hw_dev) +{ +#ifdef CONFIG_NET_CLS_ACT + const struct tc_action *a; + LIST_HEAD(actions); + + if (tc_no_actions(exts)) + return -EINVAL; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (a->ops->get_dev) { + a->ops->get_dev(a, dev_net(dev), hw_dev); + break; + } + } + if (*hw_dev) + return 0; +#endif + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(tcf_exts_get_dev); + static int __init 
tc_filter_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 13b349f426a7..1cacfa5c95f3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -78,6 +78,8 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; + struct tc_to_netdev tc; + struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -203,9 +205,9 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct net_device *dev = f->hw_dev; + struct tc_to_netdev *tc = &f->tc; if (!tc_can_offload(dev, tp)) return; @@ -213,10 +215,10 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) offload.command = TC_CLSFLOWER_DESTROY; offload.cookie = (unsigned long)f; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -226,11 +228,17 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct tc_to_netdev *tc = &f->tc; int err; - if (!tc_can_offload(dev, tp)) - return tc_skip_sw(f->flags) ? -EINVAL : 0; + if (!tc_can_offload(dev, tp)) { + if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev)) + return tc_skip_sw(f->flags) ? 
-EINVAL : 0; + dev = f->hw_dev; + tc->egress_dev = true; + } else { + f->hw_dev = dev; + } offload.command = TC_CLSFLOWER_REPLACE; offload.cookie = (unsigned long)f; @@ -239,23 +247,22 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, offload.key = &f->key; offload.exts = &f->exts; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, - &tc); + tc); if (tc_skip_sw(f->flags)) return err; - return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct net_device *dev = f->hw_dev; + struct tc_to_netdev *tc = &f->tc; if (!tc_can_offload(dev, tp)) return; @@ -264,10 +271,10 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) offload.cookie = (unsigned long)f; offload.exts = &f->exts; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) -- cgit v1.3-7-g2ca7 From 13bfff25c081f4e060af761c4082b5a96f756810 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Dec 2016 08:29:10 -0800 Subject: net: rfs: add a jump label RFS is not commonly used, so add a jump label to avoid some conditionals in fast path. Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + include/net/sock.h | 25 ++++++++++++++----------- net/core/dev.c | 2 ++ net/core/sysctl_net_core.c | 5 ++++- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ff5ea6e1221..994f7423a74b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -192,6 +192,7 @@ struct net_device_stats { #ifdef CONFIG_RPS #include extern struct static_key rps_needed; +extern struct static_key rfs_needed; #endif struct neighbour; diff --git a/include/net/sock.h b/include/net/sock.h index 1749e38d0301..2729e77950b7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - /* Reading sk->sk_rxhash might incur an expensive cache line miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) - sock_rps_record_flow_hash(sk->sk_rxhash); + if (static_key_false(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. 
+ */ + if (sk->sk_state == TCP_ESTABLISHED) + sock_rps_record_flow_hash(sk->sk_rxhash); + } #endif } diff --git a/net/core/dev.c b/net/core/dev.c index bffb5253e778..1d33ce03365f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3447,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); +struct static_key rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0df2aa652530..2a46e4009f62 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - if (sock_table) + if (sock_table) { static_key_slow_inc(&rps_needed); + static_key_slow_inc(&rfs_needed); + } if (orig_sock_table) { static_key_slow_dec(&rps_needed); + static_key_slow_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } -- cgit v1.3-7-g2ca7