From d0522f1cd25edb796548f91e04766fa3cbc3b6df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Nov 2018 12:51:14 -0800 Subject: net: Add extack argument to rtnl_create_link Add extack arg to rtnl_create_link and add messages for invalid number of Tx or Rx queues. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33d9227a8b80..f787b7640d49 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2885,9 +2885,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2905,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -3163,7 +3169,7 @@ replay: } dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); + name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; -- cgit v1.2.3-59-g8ed1b From 68d57f3b1d1a87b4022a0a7463126d7ea172bda1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Nov 2018 12:51:16 -0800 Subject: rtnetlink: Add more extack messages to rtnl_newlink Add extack arg to the nla_parse_nested calls in rtnl_newlink, and add messages for unknown device type and link network namespace id. In particular, it improves the failure message when the wrong link type is used. From $ ip li add bond1 type bonding RTNETLINK answers: Operation not supported to $ ip li add bond1 type bonding Error: Unknown device type. (The module name is bonding but the link type is bond.) Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f787b7640d49..86f2d9cbdae3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3054,7 +3054,7 @@ replay: if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); + ops->policy, extack); if (err < 0) return err; data = attr; @@ -3076,7 +3076,7 @@ replay: m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, - NULL); + extack); if (err < 0) return err; slave_data = slave_attr; @@ -3140,6 +3140,7 @@ replay: goto replay; } #endif + NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } @@ -3160,6 +3161,7 @@ replay: link_net = get_net_ns_by_id(dest_net, id); if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; goto out; } -- cgit v1.2.3-59-g8ed1b From 6da5b0f027a825df2aebc1927a27bda185dc03d4 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 7 Nov 2018 15:36:04 +0000 Subject: net: ensure unbound datagram socket to be chosen when not in a VRF Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must be performed when there is no device match. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif was set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash. Signed-off-by: Mike Manning Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- include/net/udp.h | 11 +++++++++++ net/core/sock.c | 2 ++ net/ipv4/udp.c | 15 ++++++--------- net/ipv6/datagram.c | 10 +++++++--- net/ipv6/udp.c | 14 +++++--------- 5 files changed, 31 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/include/net/udp.h b/include/net/udp.h index 9e82cb391dea..a496e441645e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } +static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, + bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); +#endif +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/core/sock.c b/net/core/sock.c index 080a880a1761..7b304e454a38 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..cf73c9194bb6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..bde08aa549f3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, case IPV6_2292PKTINFO: { struct net_device *dev = NULL; + int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; @@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + src_idx = src_info->ipi6_ifindex; - if (src_info->ipi6_ifindex) { + if (src_idx) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_idx != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || + !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; - fl6->flowi6_oif = src_info->ipi6_ifindex; + fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d2d97d07ef27..0559adc2f357 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -117,6 +117,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -144,15 +145,10 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score++; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); + if (!dev_match) + return -1; + score++; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; -- cgit v1.2.3-59-g8ed1b From 9b319148cb34ecccacff09eca87765c87d5e19ff Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Wed, 7 Nov 2018 18:07:03 +0100 Subject: net/vlan: include the shift in skb_vlan_tag_get_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- net/core/flow_dissector.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 941da4bf3929..b14bf87999aa 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -81,7 +81,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) #define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) -#define skb_vlan_tag_get_prio(__skb) ((__skb)->vlan_tci & VLAN_PRIO_MASK) +#define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 676f3ad629f9..56d1e9b73142 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -952,8 +952,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; -- cgit v1.2.3-59-g8ed1b From 50254256f382c56bde87d970f3d0d02fdb76ec70 Mon Sep 17 00:00:00 2001 From: David Barmann Date: Thu, 8 Nov 2018 08:13:35 -0600 Subject: sock: Reset dst when changing sk_mark via setsockopt When setting the SO_MARK socket option, if the mark changes, the dst needs to be reset so that a new route lookup is performed. This fixes the case where an application wants to change routing by setting a new sk_mark. If this is done after some packets have already been sent, the dst is cached and has no effect. Signed-off-by: David Barmann Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 7b304e454a38..6d7e189e3cd9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -952,10 +952,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: -- cgit v1.2.3-59-g8ed1b From e7946760de5852f32c4e52ce47f37e85346981b9 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 8 Nov 2018 22:27:54 +0200 Subject: net: core: dev_addr_lists: add auxiliary func to handle reference address updates In order to avoid all table update, and only remove or add new address, the auxiliary function exists, named __hw_addr_sync_dev(). It allows end driver do nothing when nothing changed and add/rm when concrete address is firstly added or lastly removed. But it doesn't include cases when an address of real device or vlan was reused by other vlans or vlan/macval devices. For handaling events when address was reused/unreused the patch adds new auxiliary routine - __hw_addr_ref_sync_dev(). It allows to do nothing when nothing was changed and do updates only for an address being added/reused/deleted/unreused. Thus, clone address changes for vlans can be mirrored in the table. The function is exclusive with __hw_addr_sync_dev(). It's responsibility of the end driver to identify address vlan device, if it needs so. Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++++ net/core/dev_addr_lists.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 857f8abf7b91..487fa5e0e165 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4068,6 +4068,16 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)); +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)); +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)); void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..81a8cd4ea3bd 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -277,6 +277,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, } EXPORT_SYMBOL(__hw_addr_sync_dev); +/** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from -- cgit v1.2.3-59-g8ed1b From 49f8e8329c3c05e78a75112ce006c41d24eaad0d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 8 Nov 2018 14:05:42 -0800 Subject: net: move __skb_checksum_complete*() to skbuff.c __skb_checksum_complete_head() and __skb_checksum_complete() are both declared in skbuff.h, they fit better in skbuff.c than datagram.c. Cc: Stefano Brivio Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 43 ------------------------------------------- net/core/skbuff.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..07983b90d2bd 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -728,49 +728,6 @@ fault: return -EFAULT; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete); - /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b4ee5c8b928f..5cb4b3440153 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2645,6 +2645,49 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + /* skb->csum holds pseudo checksum */ + sum = csum_fold(csum_add(skb->csum, csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( -- cgit v1.2.3-59-g8ed1b From b1817524c028a5a5284f21358185c74790001e0e Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 9 Nov 2018 00:18:02 +0100 Subject: net/core: use __vlan_hwaccel helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes assumptions about VLAN_TAG_PRESENT bit. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- net/core/skbuff.c | 2 +- net/sched/act_vlan.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0ffcbdd55fa9..bf7e0a471186 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4889,7 +4889,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5386,7 +5386,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5652,7 +5654,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; skb->encapsulation = 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5cb4b3440153..396fcb3baad0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5166,7 +5166,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ba677d54a7af..93fdaf707313 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); -- cgit v1.2.3-59-g8ed1b From a5a3a828cd00788a78da686c57c6d1f66191d8af Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 7 Nov 2018 16:12:01 -0800 Subject: bpf: add perf event notificaton support for sock_ops This patch allows eBPF programs that use sock_ops to send perf based event notifications using bpf_perf_event_output(). Our main use case for this is the following: We would like to monitor some subset of TCP sockets in user-space, (the monitoring application would define 4-tuples it wants to monitor) using TCP_INFO stats to analyze reported problems. The idea is to use those stats to see where the bottlenecks are likely to be ("is it application-limited?" or "is there evidence of BufferBloat in the path?" etc). Today we can do this by periodically polling for tcp_info, but this could be made more efficient if the kernel would asynchronously notify the application via tcp_info when some "interesting" thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc) are reached. And to make this effective, it is better if we could apply the threshold check *before* constructing the tcp_info netlink notification, so that we don't waste resources constructing notifications that will be discarded by the filter. This work solves the problem by adding perf event based notification support for sock_ops. The eBPF program can thus be designed to apply any desired filters to the bpf_sock_ops and trigger a perf event notification based on the evaluation from the filter. The user space component can use these perf event notifications to either read any state managed by the eBPF program, or issue a TCP_INFO netlink call if desired. Signed-off-by: Sowmini Varadhan Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..ba97a6bee6f9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3908,6 +3908,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5240,6 +5260,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From c8123ead13a5c92dc5fd15c0fdfe88eef41e6ac1 Mon Sep 17 00:00:00 2001 From: Nitin Hande Date: Sun, 28 Oct 2018 21:02:45 -0700 Subject: bpf: Extend the sk_lookup() helper to XDP hookpoint. This patch proposes to extend the sk_lookup() BPF API to the XDP hookpoint. The sk_lookup() helper supports a lookup on incoming packet to find the corresponding socket that will receive this packet. Current support for this BPF API is at the tc hookpoint. This patch will extend this API at XDP hookpoint. A XDP program can map the incoming packet to the 5-tuple parameter and invoke the API to find the corresponding socket structure. Signed-off-by: Nitin Hande Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 4 ++ net/core/filter.c | 105 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 90 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 852dc17ab47a..47d606d744cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2201,6 +2201,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) * Description @@ -2233,6 +2235,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. * * int bpf_sk_release(struct bpf_sock *sk) * Description diff --git a/net/core/filter.c b/net/core/filter.c index ba97a6bee6f9..53d50fb75ea1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4845,38 +4845,32 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, &refcounted); @@ -4885,7 +4879,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4902,31 +4896,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); + sdif = inet6_sdif(skb); + if (netns_id) { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } else { net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } if (sk) @@ -4935,6 +4931,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4984,6 +4999,50 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5234,6 +5293,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From 76c6d988aeb3c15d57ea0c245a3b5f27802c1fbe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Nov 2018 18:27:16 +0800 Subject: sctp: add sock_reuseport for the sock in __sctp_hash_endpoint This is a part of sk_reuseport support for sctp. It defines a helper sctp_bind_addrs_check() to check if the bind_addrs in two socks are matched. It will add sock_reuseport if they are completely matched, and return err if they are partly matched, and alloc sock_reuseport if all socks are not matched at all. It will work until sk_reuseport support is added in sctp_get_port_local() in the next patch. v1->v2: - use 'laddr->valid && laddr2->valid' check instead as Marcelo pointed in sctp_bind_addrs_check(). Acked-by: Neil Horman Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- include/net/sctp/structs.h | 2 ++ net/core/sock_reuseport.c | 1 + net/sctp/bind_addr.c | 28 ++++++++++++++++++++++ net/sctp/input.c | 60 +++++++++++++++++++++++++++++++++++++++------- net/sctp/socket.c | 3 +-- 6 files changed, 85 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 9a3b48a35e90..cdf2e80abc44 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc, */ int sctp_rcv(struct sk_buff *skb); int sctp_v4_err(struct sk_buff *skb, u32 info); -void sctp_hash_endpoint(struct sctp_endpoint *); +int sctp_hash_endpoint(struct sctp_endpoint *ep); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a11f93790476..15d017f33a46 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1190,6 +1190,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *, struct sctp_sock *, struct sctp_sock *); int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr); +int sctp_bind_addrs_check(struct sctp_sock *sp, + struct sctp_sock *sp2, int cnt2); union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 7df3704982f5..ebf28adba789 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp, return match; } +int sctp_bind_addrs_check(struct sctp_sock *sp, + struct sctp_sock *sp2, int cnt2) +{ + struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; + struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; + struct sctp_sockaddr_entry *laddr, *laddr2; + bool exist = false; + int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { + if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && + laddr->valid && laddr2->valid) { + exist = true; + goto next; + } + } + cnt = 0; + break; +next: + cnt++; + } + rcu_read_unlock(); + + return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); +} + /* Does the address 'addr' conflict with any addresses in * the bp. */ diff --git a/net/sctp/input.c b/net/sctp/input.c index 00f995e37795..d7a649d240e5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -724,43 +724,87 @@ discard: } /* Insert endpoint into the hash table. */ -static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); - struct sctp_ep_common *epb; + struct sock *sk = ep->base.sk; + struct net *net = sock_net(sk); struct sctp_hashbucket *head; + struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (sk->sk_reuseport) { + bool any = sctp_is_ep_boundall(sk); + struct sctp_ep_common *epb2; + struct list_head *list; + int cnt = 0, err = 1; + + list_for_each(list, &ep->base.bind_addr.address_list) + cnt++; + + sctp_for_each_hentry(epb2, &head->chain) { + struct sock *sk2 = epb2->sk; + + if (!net_eq(sock_net(sk2), net) || sk2 == sk || + !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !sk2->sk_reuseport) + continue; + + err = sctp_bind_addrs_check(sctp_sk(sk2), + sctp_sk(sk), cnt); + if (!err) { + err = reuseport_add_sock(sk, sk2, any); + if (err) + return err; + break; + } else if (err < 0) { + return err; + } + } + + if (err) { + err = reuseport_alloc(sk, any); + if (err) + return err; + } + } + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); write_unlock(&head->lock); + return 0; } /* Add an endpoint to the hash. Local BH-safe. */ -void sctp_hash_endpoint(struct sctp_endpoint *ep) +int sctp_hash_endpoint(struct sctp_endpoint *ep) { + int err; + local_bh_disable(); - __sctp_hash_endpoint(ep); + err = __sctp_hash_endpoint(ep); local_bh_enable(); + + return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); + struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); + epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + write_lock(&head->lock); hlist_del_init(&epb->node); write_unlock(&head->lock); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 739f3e50120d..2e955f1dbe3f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7852,8 +7852,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } sk->sk_max_ack_backlog = backlog; - sctp_hash_endpoint(ep); - return 0; + return sctp_hash_endpoint(ep); } /* -- cgit v1.2.3-59-g8ed1b From 7fe50ac83f4319c18ed7c634d85cad16bd0bf509 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Nov 2018 14:47:18 -0800 Subject: net: dump more useful information in netdev_rx_csum_fault() Currently netdev_rx_csum_fault() only shows a device name, we need more information about the skb for debugging csum failures. Sample output: ens3: hw csum failure dev features: 0x0000000000014b89 skb len=84 data_len=0 pkt_type=0 gso_size=0 gso_type=0 nr_frags=0 ip_summed=0 csum=0 csum_complete_sw=0 csum_valid=0 csum_level=0 Note, I use pr_err() just to be consistent with the existing one. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- net/core/datagram.c | 2 +- net/core/dev.c | 11 +++++++++-- net/core/skbuff.c | 4 ++-- net/sunrpc/socklib.c | 2 +- 5 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 97b4233120e4..917ae7b6263e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4343,9 +4343,10 @@ static inline bool can_checksum_protocol(netdev_features_t features, } #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev); +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb); #else -static inline void netdev_rx_csum_fault(struct net_device *dev) +static inline void netdev_rx_csum_fault(struct net_device *dev, + struct sk_buff *skb) { } #endif diff --git a/net/core/datagram.c b/net/core/datagram.c index 07983b90d2bd..4bf62b1afa3b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -767,7 +767,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index bf7e0a471186..5927f6a7c301 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3091,10 +3091,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : ""); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -5781,7 +5788,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 396fcb3baad0..fcb1155a00ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2653,7 +2653,7 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) skb->csum_valid = !sum; @@ -2673,7 +2673,7 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) { diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 9062967575c4..7e55cfc69697 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -175,7 +175,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) -- cgit v1.2.3-59-g8ed1b From 982c17b9e3c27389a8d214333c686dab0e95cf63 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 13 Nov 2018 09:16:52 +0800 Subject: net: remove BUG_ON from __pskb_pull_tail if list is NULL pointer, and the following access of list will trigger panic, which is same as BUG_ON Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fcb1155a00ec..f95ab41c9fb9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1925,8 +1925,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; -- cgit v1.2.3-59-g8ed1b From cac6cc2f5ac710334ae0f6bba5630d791c253574 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:00 -0800 Subject: bpf: Fix IPv6 dport byte order in bpf_sk_lookup_udp Lookup functions in sk_lookup have different expectations about byte order of provided arguments. Specifically __inet_lookup, __udp4_lib_lookup and __udp6_lib_lookup expect dport to be in network byte order and do ntohs(dport) internally. At the same time __inet6_lookup expects dport to be in host byte order and correspondingly name the argument hnum. sk_lookup works correctly with __inet_lookup, __udp4_lib_lookup and __inet6_lookup with regard to dport. But in __udp6_lib_lookup case it uses host instead of expected network byte order. It makes result returned by bpf_sk_lookup_udp for IPv6 incorrect. The patch fixes byte order of dport passed to __udp6_lib_lookup. Originally sk_lookup properly handled UDPv6, but not TCPv6. 5ef0ae84f02a fixes TCPv6 but breaks UDPv6. Fixes: 5ef0ae84f02a ("bpf: Fix IPv6 dport byte-order in bpf_sk_lookup") Signed-off-by: Andrey Ignatov Acked-by: Joe Stringer Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 53d50fb75ea1..f4ae933edf61 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4867,17 +4867,16 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, &udp_table, NULL); #endif -- cgit v1.2.3-59-g8ed1b From 6c49e65e0d462963b4fac97ebd87014342167027 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:01 -0800 Subject: bpf: Support socket lookup in CGROUP_SOCK_ADDR progs Make bpf_sk_lookup_tcp, bpf_sk_lookup_udp and bpf_sk_release helpers available in programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR. Such programs operate on sockets and have access to socket and struct sockaddr passed by user to system calls such as sys_bind, sys_connect, sys_sendmsg. It's useful to be able to lookup other sockets from these programs. E.g. sys_connect may lookup IP:port endpoint and if there is a server socket bound to that endpoint ("server" can be defined by saddr & sport being zero), redirect client connection to it by rewriting IP:port in sockaddr passed to sys_connect. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f4ae933edf61..f6ca38a7d433 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5042,6 +5042,43 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5148,6 +5185,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From 9c2122559709fe35f493634bc61065b8c0ecb2ad Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 10 Nov 2018 19:58:35 +0100 Subject: net/bpf: split VLAN_PRESENT bit handling from VLAN_TCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/filter.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..c151b906df53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,21 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ +#ifdef VLAN_TAG_PRESENT + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); +#endif + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -6140,19 +6139,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } +#ifdef VLAN_TAG_PRESENT + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); +#endif break; case offsetof(struct __sk_buff, cb[0]) ... -- cgit v1.2.3-59-g8ed1b From 0c4b2d370514cb4f3454dd3b18f031d2651fab73 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 10 Nov 2018 19:58:36 +0100 Subject: net: remove VLAN_TAG_PRESENT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace VLAN_TAG_PRESENT with single bit flag and free up VLAN.CFI overload. Now VLAN.CFI is visible in networking stack and can be passed around intact. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- arch/mips/net/bpf_jit.c | 3 --- arch/powerpc/net/bpf_jit_comp.c | 3 --- arch/sparc/net/bpf_jit_comp_32.c | 4 ---- include/linux/if_vlan.h | 11 ++++++----- include/linux/skbuff.h | 16 +++++++++------- lib/test_bpf.c | 14 ++++++++------ net/core/filter.c | 6 ------ 7 files changed, 23 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index de4c6372ad9a..3a0e34f4e615 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1164,9 +1164,6 @@ jmp_cmp: vlan_tci) != 2); off = offsetof(struct sk_buff, vlan_tci); emit_half_load_unsigned(r_A, r_skb, off, ctx); -#ifdef VLAN_TAG_PRESENT - emit_andi(r_A, r_A, (u16)~VLAN_TAG_PRESENT, ctx); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: ctx->flags |= SEEN_SKB | SEEN_A; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index dc4a2f54e829..91d223cf512b 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -383,9 +383,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, vlan_tci)); -#ifdef VLAN_TAG_PRESENT - PPC_ANDI(r_A, r_A, ~VLAN_TAG_PRESENT); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: PPC_LBZ_OFFS(r_A, r_skb, PKT_VLAN_PRESENT_OFFSET()); diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index 48f3c04dd179..84cc8f7f83e9 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -553,10 +553,6 @@ void bpf_jit_compile(struct bpf_prog *fp) break; case BPF_ANC | SKF_AD_VLAN_TAG: emit_skb_load16(vlan_tci, r_A); -#ifdef VLAN_TAG_PRESENT - emit_loadimm(~VLAN_TAG_PRESENT, r_TMP); - emit_and(r_A, r_TMP, r_A); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: __emit_skb_load8(__pkt_vlan_present_offset, r_A); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 1be5230921b5..7a541eadf78e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -66,7 +66,6 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ -#define VLAN_TAG_PRESENT VLAN_CFI_MASK #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 @@ -78,8 +77,8 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) @@ -480,7 +479,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_tci = 0; + skb->vlan_present = 0; } /** @@ -492,6 +491,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { + dst->vlan_present = src->vlan_present; dst->vlan_proto = src->vlan_proto; dst->vlan_tci = src->vlan_tci; } @@ -526,7 +526,8 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; - skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci; + skb->vlan_tci = vlan_tci; + skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 99f38779332c..b9aa0d1b21cf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -777,6 +777,14 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_VLAN_PRESENT_BIT 7 +#else +#define PKT_VLAN_PRESENT_BIT 0 +#endif +#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) + __u8 __pkt_vlan_present_offset[0]; + __u8 vlan_present:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; @@ -784,8 +792,8 @@ struct sk_buff { #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif - __u8 ipvs_property:1; + __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV @@ -816,12 +824,6 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; -#define PKT_VLAN_PRESENT_BIT 4 // CFI (12-th bit) in TCI -#ifdef __BIG_ENDIAN -#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, vlan_tci) -#else -#define PKT_VLAN_PRESENT_OFFSET() (offsetof(struct sk_buff, vlan_tci) + 1) -#endif __be16 vlan_proto; __u16 vlan_tci; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index aa22bcaec1dc..f3e570722a7e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -39,6 +39,7 @@ #define SKB_HASH 0x1234aaab #define SKB_QUEUE_MAP 123 #define SKB_VLAN_TCI 0xffff +#define SKB_VLAN_PRESENT 1 #define SKB_DEV_IFINDEX 577 #define SKB_DEV_TYPE 588 @@ -725,8 +726,8 @@ static struct bpf_test tests[] = { CLASSIC, { }, { - { 1, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT }, - { 10, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT } + { 1, SKB_VLAN_TCI }, + { 10, SKB_VLAN_TCI } }, }, { @@ -739,8 +740,8 @@ static struct bpf_test tests[] = { CLASSIC, { }, { - { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, - { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } + { 1, SKB_VLAN_PRESENT }, + { 10, SKB_VLAN_PRESENT } }, }, { @@ -5289,8 +5290,8 @@ static struct bpf_test tests[] = { #endif { }, { - { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, - { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } + { 1, SKB_VLAN_PRESENT }, + { 10, SKB_VLAN_PRESENT } }, .fill_helper = bpf_fill_maxinsns6, .expected_errcode = -ENOTSUPP, @@ -6493,6 +6494,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_present = SKB_VLAN_PRESENT; skb->vlan_proto = htons(ETH_P_IP); dev_net_set(&dev, &init_net); skb->dev = &dev; diff --git a/net/core/filter.c b/net/core/filter.c index c151b906df53..10acbc00ff6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -301,9 +301,6 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); -#endif break; case SKF_AD_VLAN_TAG_PRESENT: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); @@ -6152,9 +6149,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); -#endif break; case offsetof(struct __sk_buff, cb[0]) ... -- cgit v1.2.3-59-g8ed1b From 7f600f14dfac4ba4aee6283a415cdad2925d7791 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Nov 2018 18:05:24 -0800 Subject: net: remove unused skb_send_sock() Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - net/core/skbuff.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b9aa0d1b21cf..a2e8297a5b00 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3335,7 +3335,6 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f95ab41c9fb9..a1be7f19d998 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2364,19 +2364,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.3-59-g8ed1b From 6f9a50691055618b1042ead4d84f80755d1b9315 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 19 Nov 2018 16:11:07 +0000 Subject: net: skb_scrub_packet(): Scrub offload_fwd_mark When a packet is trapped and the corresponding SKB marked as already-forwarded, it retains this marking even after it is forwarded across veth links into another bridge. There, since it ingresses the bridge over veth, which doesn't have offload_fwd_mark, it triggers a warning in nbp_switchdev_frame_mark(). Then nbp_switchdev_allowed_egress() decides not to allow egress from this bridge through another veth, because the SKB is already marked, and the mark (of 0) of course matches. Thus the packet is incorrectly blocked. Solve by resetting offload_fwd_mark() in skb_scrub_packet(). That function is called from tunnels and also from veth, and thus catches the cases where traffic is forwarded between bridges and transformed in a way that invalidates the marking. Signed-off-by: Petr Machata Suggested-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a1be7f19d998..9a8a72cefe9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4882,6 +4882,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset(skb); nf_reset_trace(skb); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_mr_fwd_mark = 0; +#endif + if (!xnet) return; -- cgit v1.2.3-59-g8ed1b From f11216b24219ab26d8d159fbfa12dff886b16e32 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Thu, 22 Nov 2018 14:39:16 -0500 Subject: bpf: add skb->tstamp r/w access from tc clsact and cg skb progs This could be used to rate limit egress traffic in concert with a qdisc which supports Earliest Departure Time, such as FQ. Write access from cg skb progs only with CAP_SYS_ADMIN, since the value will be used by downstream qdiscs. It might make sense to relax this. Changes v1 -> v2: - allow access from cg skb, write only with CAP_SYS_ADMIN Signed-off-by: Vlad Dumitrescu Acked-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 29 +++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/test_verifier.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c1554aa07465..23e2031a43d4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2468,6 +2468,7 @@ struct __sk_buff { __u32 data_meta; struct bpf_flow_keys *flow_keys; + __u64 tstamp; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index f6ca38a7d433..65dc13aeca7c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5573,6 +5573,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(struct bpf_flow_keys *)) return false; break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5600,6 +5604,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5638,6 +5643,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5665,6 +5674,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5874,6 +5884,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -6093,6 +6104,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6179,6 +6191,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6488,6 +6501,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c1554aa07465..23e2031a43d4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2468,6 +2468,7 @@ struct __sk_buff { __u32 data_meta; struct bpf_flow_keys *flow_keys; + __u64 tstamp; }; struct bpf_tunnel_key { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 54d16fbdef8b..537a8f91af02 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2446,6 +2446,10 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, tc_index)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tstamp)), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, tstamp)), BPF_EXIT_INSN(), }, .errstr_unpriv = "", @@ -5297,6 +5301,31 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R2 leaks addr into helper function", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, }, + { + "write tstamp from CGROUP_SKB", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, tstamp)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "invalid bpf_context access off=152 size=8", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "read tstamp from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tstamp)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, { "multiple registers share map_lookup_elem result", .insns = { -- cgit v1.2.3-59-g8ed1b From 42519ede4fde2a50919215933e0f02c8342aba0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Nov 2018 11:39:28 -0800 Subject: net-gro: use ffs() to speedup napi_gro_flush() We very often have few flows/chains to look at, and we might increase GRO_HASH_BUCKETS to 32 or 64 in the future. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f2bfd2eda7b2..d83582623cd7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5364,11 +5364,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); -- cgit v1.2.3-59-g8ed1b From 4bffc669d6248d655aeb985a0e51bfaaf21c8b40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 25 Nov 2018 08:26:23 -0800 Subject: net: remove unsafe skb_insert() I do not see how one can effectively use skb_insert() without holding some kind of lock. Otherwise other cpus could have changed the list right before we have a chance of acquiring list->lock. Only existing user is in drivers/infiniband/hw/nes/nes_mgt.c and this one probably meant to use __skb_insert() since it appears nesqp->pau_list is protected by nesqp->pau_lock. This looks like nesqp->pau_lock could be removed, since nesqp->pau_list.lock could be used instead. Signed-off-by: Eric Dumazet Cc: Faisal Latif Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-rdma Signed-off-by: David S. Miller --- drivers/infiniband/hw/nes/nes_mgt.c | 4 ++-- include/linux/skbuff.h | 2 -- net/core/skbuff.c | 22 ---------------------- 3 files changed, 2 insertions(+), 26 deletions(-) (limited to 'net/core') diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index fc0c191014e9..cc4dce5c3e5f 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -551,14 +551,14 @@ static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct ne /* Queue skb by sequence number */ if (skb_queue_len(&nesqp->pau_list) == 0) { - skb_queue_head(&nesqp->pau_list, skb); + __skb_queue_head(&nesqp->pau_list, skb); } else { skb_queue_walk(&nesqp->pau_list, tmpskb) { cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; if (before(seqnum, cb->seqnum)) break; } - skb_insert(tmpskb, skb, &nesqp->pau_list); + __skb_insert(skb, tmpskb->prev, tmpskb, &nesqp->pau_list); } if (nesqp->pau_state == PAU_READY) process_it = true; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f17a7452ac7b..73902acf2b71 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1749,8 +1749,6 @@ static inline void skb_queue_head_init_class(struct sk_buff_head *list, * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, - struct sk_buff_head *list); static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9a8a72cefe9b..02cd7ae3d0fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2990,28 +2990,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) -- cgit v1.2.3-59-g8ed1b From d8f3e978bd30014081c599456285ca1d58c2aae1 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 26 Nov 2018 13:42:41 -0800 Subject: bpf: Avoid unnecessary instruction in convert_bpf_ld_abs() 'offset' is constant and if it is zero, no need to subtract it from BPF_REG_TMP. Signed-off-by: David S. Miller Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index aa274679965d..f50ea971f7a9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -463,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { -- cgit v1.2.3-59-g8ed1b From a428afe82f98d2ffb31c981671630df1fa25906f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 24 Nov 2018 04:34:20 +0200 Subject: net: bridge: add support for user-controlled bool options We have been adding many new bridge options, a big number of which are boolean but still take up netlink attribute ids and waste space in the skb. Recently we discussed learning from link-local packets[1] and decided yet another new boolean option will be needed, thus introducing this API to save some bridge nl space. The API supports changing the value of multiple boolean options at once via the br_boolopt_multi struct which has an optmask (which options to set, bit per opt) and optval (options' new values). Future boolean options will only be added to the br_boolopt_id enum and then will have to be handled in br_boolopt_toggle/get. The API will automatically add the ability to change and export them via netlink, sysfs can use the single boolopt function versions to do the same. The behaviour with failing/succeeding is the same as with normal netlink option changing. If an option requires mapping to internal kernel flag or needs special configuration to be enabled then it should be handled in br_boolopt_toggle. It should also be able to retrieve an option's current state via br_boolopt_get. v2: WARN_ON() on unsupported option as that shouldn't be possible and also will help catch people who add new options without handling them for both set and get. Pass down extack so if an option desires it could set it on error and be more user-friendly. [1] https://www.spinics.net/lists/netdev/msg532698.html Signed-off-by: Nikolay Aleksandrov Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 18 +++++++++++ include/uapi/linux/if_link.h | 1 + net/bridge/br.c | 71 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_netlink.c | 17 +++++++++- net/bridge/br_private.h | 8 +++++ net/core/rtnetlink.c | 2 +- 6 files changed, 115 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e41eda3c71f1..6dc02c03bdf8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -292,4 +292,22 @@ struct br_mcast_stats { __u64 mcast_bytes[BR_MCAST_DIR_SIZE]; __u64 mcast_packets[BR_MCAST_DIR_SIZE]; }; + +/* bridge boolean options + * IMPORTANT: if adding a new option do not forget to handle + * it in br_boolopt_toggle/get and bridge sysfs + */ +enum br_boolopt_id { + BR_BOOLOPT_MAX +}; + +/* struct br_boolopt_multi - change multiple bridge boolean options + * + * @optval: new option values (bit per option) + * @optmask: options to change (bit per option) + */ +struct br_boolopt_multi { + __u32 optval; + __u32 optmask; +}; #endif /* _UAPI_LINUX_IF_BRIDGE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f42c069d81db..d6533828123a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -288,6 +288,7 @@ enum { IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, __IFLA_BR_MAX, }; diff --git a/net/bridge/br.c b/net/bridge/br.c index 360ad66c21e9..c527160c1975 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -175,6 +175,77 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* br_boolopt_toggle - change user-controlled boolean option + * + * @br: bridge device + * @opt: id of the option to change + * @on: new option value + * @extack: extack for error messages + * + * Changes the value of the respective boolean option to @on taking care of + * any internal option value mapping and configuration. + */ +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack) +{ + switch (opt) { + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) +{ + switch (opt) { + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack) +{ + unsigned long bitmap = bm->optmask; + int err = 0; + int opt_id; + + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { + bool on = !!(bm->optval & BIT(opt_id)); + + err = br_boolopt_toggle(br, opt_id, on, extack); + if (err) { + br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", + opt_id, br_boolopt_get(br, opt_id), on, err); + break; + } + } + + return err; +} + +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm) +{ + u32 optval = 0; + int opt_id; + + for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) + optval |= (br_boolopt_get(br, opt_id) << opt_id); + + bm->optval = optval; + bm->optmask = 0; +} + +/* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3345f1984542..13cd50326af2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1035,6 +1035,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, + [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_boolopt_multi) }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1296,6 +1298,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } #endif + if (data[IFLA_BR_MULTI_BOOLOPT]) { + struct br_boolopt_multi *bm; + + bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); + err = br_boolopt_multi_toggle(br, bm, extack); + if (err) + return err; + } + return 0; } @@ -1374,6 +1385,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif + nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } @@ -1387,6 +1399,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); + struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); @@ -1403,6 +1416,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; + br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || @@ -1420,7 +1434,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || - nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || + nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index bc2653738fc3..6d4c208fbf08 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -507,6 +507,14 @@ static inline int br_opt_get(const struct net_bridge *br, return test_bit(opt, &br->options); } +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack); +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack); +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); /* br_device.c */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 86f2d9cbdae3..a498bb41c9aa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include #include -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { -- cgit v1.2.3-59-g8ed1b From 74be39ebba36e98a973ddb914fb41dc9e5129e36 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:02 +0100 Subject: netns: remove net arg from rtnl_net_fill() This argument is not used anymore. Fixes: cab3c8ec8d57 ("netns: always provide the id to rtnl_net_fill()") Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..52b9620e3457 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -739,7 +739,7 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) + int cmd, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; @@ -801,7 +801,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + RTM_NEWNSID, id); if (err < 0) goto err_out; @@ -816,7 +816,6 @@ out: } struct rtnl_net_dump_cb { - struct net *net; struct sk_buff *skb; struct netlink_callback *cb; int idx; @@ -833,7 +832,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + RTM_NEWNSID, id); if (ret < 0) return ret; @@ -846,7 +845,6 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, .skb = skb, .cb = cb, .idx = 0, @@ -876,7 +874,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, id); if (err < 0) goto err_out; -- cgit v1.2.3-59-g8ed1b From a0732ad14d40ee7562c8c6e04b01af6134c82831 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:03 +0100 Subject: netns: introduce 'struct net_fill_args' This is a preparatory work. To avoid having to much arguments for the function rtnl_net_fill(), a new structure is defined. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 52b9620e3457..f8a5966b086c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -738,20 +738,28 @@ static int rtnl_net_get_size(void) ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +775,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; struct nlattr *nla; struct sk_buff *msg; struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -799,9 +812,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, id); + fillargs.nsid = peernet2id(net, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -817,7 +829,7 @@ out: struct rtnl_net_dump_cb { struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -830,9 +842,8 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, id); + net_cb->fillargs.nsid = id; + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -846,7 +857,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; @@ -867,6 +883,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -874,7 +894,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; -- cgit v1.2.3-59-g8ed1b From cff478b9d9ccaee0de0e02700c63addf007b5d3c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:04 +0100 Subject: netns: add support of NETNSA_TARGET_NSID Like it was done for link and address, add the ability to perform get/dump in another netns by specifying a target nsid attribute. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/net_namespace.h | 1 + net/core/net_namespace.c | 86 +++++++++++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h index 0187c74d8889..0ed9dd61d32a 100644 --- a/include/uapi/linux/net_namespace.h +++ b/include/uapi/linux/net_namespace.h @@ -16,6 +16,7 @@ enum { NETNSA_NSID, NETNSA_PID, NETNSA_FD, + NETNSA_TARGET_NSID, __NETNSA_MAX, }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f8a5966b086c..885c54197e31 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -780,9 +781,10 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; + struct net *peer, *target = net; + bool put_target = false; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, @@ -806,13 +808,27 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + put_target = true; + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - fillargs.nsid = peernet2id(net, peer); + fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -823,15 +839,19 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (put_target) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { + struct net *tgt_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; + bool put_tgt_net; }; static int rtnl_net_dumpid_one(int id, void *peer, void *data) @@ -852,10 +872,50 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->tgt_net = net; + net_cb->put_tgt_net = true; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { + .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, @@ -866,19 +926,23 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.put_tgt_net) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) -- cgit v1.2.3-59-g8ed1b From 3a4f68bf660414801781fd06506a9c75c2d936e5 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:05 +0100 Subject: netns: enable to specify a nsid for a get request Combined with NETNSA_TARGET_NSID, it enables to "translate" a nsid from one netns to a nsid of another netns. This is useful when using NETLINK_F_LISTEN_ALL_NSID because it helps the user to interpret a nsid received from an other netns. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 885c54197e31..dd25fb22ad45 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -797,6 +797,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 288f06a001eb6265122c620295b68a0dd53d1482 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:06 +0100 Subject: netns: enable to dump full nsid translation table Like the previous patch, the goal is to ease to convert nsids from one netns to another netns. A new attribute (NETNSA_CURRENT_NSID) is added to the kernel answer when NETNSA_TARGET_NSID is provided, thus the user can easily convert nsids. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/net_namespace.h | 1 + net/core/net_namespace.c | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h index 0ed9dd61d32a..9f9956809565 100644 --- a/include/uapi/linux/net_namespace.h +++ b/include/uapi/linux/net_namespace.h @@ -17,6 +17,7 @@ enum { NETNSA_PID, NETNSA_FD, NETNSA_TARGET_NSID, + NETNSA_CURRENT_NSID, __NETNSA_MAX, }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index dd25fb22ad45..05b23b285058 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -736,6 +736,7 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } @@ -745,6 +746,8 @@ struct net_fill_args { int flags; int cmd; int nsid; + bool add_ref; + int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) @@ -763,6 +766,10 @@ static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -782,7 +789,6 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; - bool put_target = false; struct nlattr *nla; struct sk_buff *msg; int err; @@ -824,7 +830,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err = PTR_ERR(target); goto out; } - put_target = true; + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); @@ -844,7 +851,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: - if (put_target) + if (fillargs.add_ref) put_net(target); put_net(peer); return err; @@ -852,11 +859,11 @@ out: struct rtnl_net_dump_cb { struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; - bool put_tgt_net; }; static int rtnl_net_dumpid_one(int id, void *peer, void *data) @@ -868,6 +875,8 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) goto cont; net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -904,8 +913,9 @@ static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, "Invalid target network namespace id"); return PTR_ERR(net); } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; - net_cb->put_tgt_net = true; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, @@ -940,12 +950,22 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) } spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; end: - if (net_cb.put_tgt_net) + if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err < 0 ? err : skb->len; } -- cgit v1.2.3-59-g8ed1b From 7246d8ed4dcce23f7509949a77be15fa9f0e3d28 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 26 Nov 2018 14:16:17 -0800 Subject: bpf: helper to pop data from messages This adds a BPF SK_MSG program helper so that we can pop data from a msg. We use this to pop metadata from a previous push data call. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 16 ++++- net/core/filter.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 17 ++++- net/tls/tls_sw.c | 11 ++- 4 files changed, 209 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23e2031a43d4..597afdbc1ab9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2268,6 +2268,19 @@ union bpf_attr { * * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * Description + * Will remove *pop* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of msg + * payload and/or *pop* value being to large. + * + * Return + * 0 on success, or a negative erro in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2360,7 +2373,8 @@ union bpf_attr { FN(map_push_elem), \ FN(map_pop_elem), \ FN(map_peek_elem), \ - FN(msg_push_data), + FN(msg_push_data), \ + FN(msg_pop_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index f50ea971f7a9..bd0df75dc7b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2425,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -5098,6 +5266,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5394,6 +5563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..a47c1cdf90fc 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. + */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +361,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7b1af8b59cd2..d4ecc66464e6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -687,6 +687,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sock *sk_redir; struct tls_rec *rec; int err = 0, send; + u32 delta = 0; bool enospc; psock = sk_psock_get(sk); @@ -694,8 +695,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (delta < msg->sg.size) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; @@ -743,7 +750,7 @@ more_data: msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); - *copied -= send; + *copied -= (send + delta); err = -EACCES; } -- cgit v1.2.3-59-g8ed1b From 1464193107da8041e05341388964733bbba3be27 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 26 Nov 2018 09:31:26 -0800 Subject: net: explain __skb_checksum_complete() with comments Cc: Herbert Xu Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 1 + net/core/skbuff.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f69b2fcdee40..abe50c424b29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5791,6 +5791,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 02cd7ae3d0fb..3c814565ed7c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2635,6 +2635,7 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) @@ -2646,6 +2647,15 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_checksum_complete_head); +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ __sum16 __skb_checksum_complete(struct sk_buff *skb) { __wsum csum; @@ -2653,8 +2663,14 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) csum = skb_checksum(skb, 0, skb->len, 0); - /* skb->csum holds pseudo checksum */ sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) -- cgit v1.2.3-59-g8ed1b From b0e3f1bdf9e7140fd1151af575f468b5827a61e1 Mon Sep 17 00:00:00 2001 From: Geneviève Bastien Date: Tue, 27 Nov 2018 12:52:39 -0500 Subject: net: Add trace events for all receive exit points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace events are already present for the receive entry points, to indicate how the reception entered the stack. This patch adds the corresponding exit trace events that will bound the reception such that all events occurring between the entry and the exit can be considered as part of the reception context. This greatly helps for dependency and root cause analyses. Without this, it is not possible with tracepoint instrumentation to determine whether a sched_wakeup event following a netif_receive_skb event is the result of the packet reception or a simple coincidence after further processing by the thread. It is possible using other mechanisms like kretprobes, but considering the "entry" points are already present, it would be good to add the matching exit events. In addition to linking packets with wakeups, the entry/exit event pair can also be used to perform network stack latency analyses. Signed-off-by: Geneviève Bastien CC: Mathieu Desnoyers CC: Steven Rostedt CC: Ingo Molnar CC: David S. Miller Reviewed-by: Steven Rostedt (VMware) (tracing side) Signed-off-by: David S. Miller --- include/trace/events/net.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 35 ++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 00aa72ce0e7c..1efd7d9b25fe 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -244,6 +244,65 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, TP_ARGS(skb) ); +DECLARE_EVENT_CLASS(net_dev_rx_exit_template, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field(int, ret) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret=%d", __entry->ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + +DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, + + TP_PROTO(int ret), + + TP_ARGS(ret) +); + #endif /* _TRACE_NET_H */ /* This part must be outside protection */ diff --git a/net/core/dev.c b/net/core/dev.c index abe50c424b29..04a6b7100aac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4527,9 +4527,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4544,6 +4549,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -5229,9 +5235,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5251,9 +5262,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5645,12 +5659,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5768,6 +5787,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5775,7 +5795,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); -- cgit v1.2.3-59-g8ed1b From 420d031822737ef570df6834c0eae26f33988f20 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Nov 2018 22:32:30 -0800 Subject: rtnetlink: remove a level of indentation in rtnl_newlink() rtnl_newlink() used to create VLAs based on link kind. Since commit ccf8dbcd062a ("rtnetlink: Remove VLA usage") statically sized array is created on the stack, so there is no more use for a separate code block that used to be the VLA's live range. While at it christmas tree the variables. Note that there is a goto-based retry so to be on the safe side the variables can no longer be initialized in place. It doesn't seem to matter, logically, but why make the code harder to read.. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 313 +++++++++++++++++++++++++-------------------------- 1 file changed, 154 insertions(+), 159 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a498bb41c9aa..ee2caa2fd940 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2974,17 +2974,22 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3040,195 +3045,185 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ops->setup) + return -EOPNOTSUPP; - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3-59-g8ed1b From a293974590cfdc2d59c559a54d62a5ecb648104b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Nov 2018 22:32:31 -0800 Subject: rtnetlink: avoid frame size warning in rtnl_newlink() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard kernel compilation produces the following warning: net/core/rtnetlink.c: In function ‘rtnl_newlink’: net/core/rtnetlink.c:3232:1: warning: the frame size of 1288 bytes is larger than 1024 bytes [-Wframe-larger-than=] } ^ This should not really be an issue, as rtnl_newlink() stack is generally quite shallow. Fix the warning by allocating attributes with kmalloc() in a wrapper and passing it down to rtnl_newlink(), avoiding complexities on error paths. Alternatively we could kmalloc() some structure within rtnl_newlink(), slave attributes look like a good candidate. In practice it adds to already rather high complexity and length of the function. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ee2caa2fd940..98876cd1e36c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2971,14 +2971,13 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; const struct rtnl_link_ops *m_ops = NULL; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -3226,6 +3225,21 @@ out_unregister: goto out; } +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; +} + static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { -- cgit v1.2.3-59-g8ed1b From e3da08d057002f9d0831949d51666c3e15dc6b29 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Sun, 2 Dec 2018 20:18:19 -0500 Subject: bpf: allow BPF read access to qdisc pkt_len The pkt_len field in qdisc_skb_cb stores the skb length as it will appear on the wire after segmentation. For byte accounting, this value is more accurate than skb->len. It is computed on entry to the TC layer, so only valid there. Allow read access to this field from BPF tc classifier and action programs. The implementation is analogous to tc_classid, aside from restricting to read access. To distinguish it from skb->len and self-describe export as wire_len. Changes v1->v2 - Rename pkt_len to wire_len Signed-off-by: Petar Penkov Signed-off-by: Vlad Dumitrescu Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 16 +++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/test_verifier.c | 32 +++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8050caea7495..0183b8e70a9e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2497,6 +2497,7 @@ struct __sk_buff { __u32 data_meta; struct bpf_flow_keys *flow_keys; __u64 tstamp; + __u32 wire_len; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index bd0df75dc7b6..3d54af4c363d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5773,6 +5773,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5797,6 +5798,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5843,6 +5845,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6273,6 +6276,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6360,6 +6364,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6685,6 +6690,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tstamp, 8, target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8050caea7495..0183b8e70a9e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2497,6 +2497,7 @@ struct __sk_buff { __u32 data_meta; struct bpf_flow_keys *flow_keys; __u64 tstamp; + __u32 wire_len; }; struct bpf_tunnel_key { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index c3b038f26ece..b4b4a3f93639 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -14033,6 +14033,38 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "check wire_len is not readable by sockets", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, wire_len)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check wire_len is readable by tc classifier", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, wire_len)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "check wire_len is not writable by tc classifier", + .insns = { + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, wire_len)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "invalid bpf_context access", + .errstr_unpriv = "R1 leaks addr", + .result = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3-59-g8ed1b From 846e980a87fc30075517d6d979548294d5461bdb Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Mon, 3 Dec 2018 07:58:59 +0000 Subject: devlink: Add 'fw_load_policy' generic parameter Many drivers load the device's firmware image during the initialization flow either from the flash or from the disk. Currently this option is not controlled by the user and the driver decides from where to load the firmware image. 'fw_load_policy' gives the ability to control this option which allows the user to choose between different loading policies supported by the driver. This parameter can be useful while testing and/or debugging the device. For example, testing a firmware bug fix. Signed-off-by: Shalom Toledo Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink-params.txt | 9 +++++++++ include/net/devlink.h | 4 ++++ include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 5 +++++ 4 files changed, 23 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt index ae444ffe73ac..2d26434ddcf8 100644 --- a/Documentation/networking/devlink-params.txt +++ b/Documentation/networking/devlink-params.txt @@ -40,3 +40,12 @@ msix_vec_per_pf_min [DEVICE, GENERIC] for the device initialization. Value is same across all physical functions (PFs) in the device. Type: u32 + +fw_load_policy [DEVICE, GENERIC] + Controls the device's firmware loading policy. + Valid values: + * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER (0) + Load firmware version preferred by the driver. + * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH (1) + Load firmware currently stored in flash. + Type: u8 diff --git a/include/net/devlink.h b/include/net/devlink.h index 45db0c79462d..67f4293bc970 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -365,6 +365,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -392,6 +393,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min" #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME "fw_load_policy" +#define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE DEVLINK_PARAM_TYPE_U8 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 79407bbd296d..6e52d3660654 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -163,6 +163,11 @@ enum devlink_param_cmode { DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1 }; +enum devlink_param_fw_load_policy_value { + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3-59-g8ed1b From b5947e5d1e710c35ea281247bd27e6975250285c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:39 -0500 Subject: udp: msg_zerocopy Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and interpret flag MSG_ZEROCOPY. This patch was previously part of the zerocopy RFC patchsets. Zerocopy is not effective at small MTU. With segmentation offload building larger datagrams, the benefit of page flipping outweights the cost of generating a completion notification. tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on test patch and making skb_orphan_frags_rx same as skb_orphan_frags: ipv4 udp -t 1 tx=191312 (11938 MB) txc=0 zc=n rx=191312 (11938 MB) ipv4 udp -z -t 1 tx=304507 (19002 MB) txc=304507 zc=y rx=304507 (19002 MB) ok ipv6 udp -t 1 tx=174485 (10888 MB) txc=0 zc=n rx=174485 (10888 MB) ipv6 udp -z -t 1 tx=294801 (18396 MB) txc=294801 zc=y rx=294801 (18396 MB) ok Changes v1 -> v2 - Fixup reverse christmas tree violation v2 -> v3 - Split refcount avoidance optimization into separate patch - Fix refcount leak on error in fragmented case (thanks to Paolo Abeni for pointing this one out!) - Fix refcount inc on zero - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data. This is needed since commit 5cf4a8532c99 ("tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp. Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 6 ++++++ net/core/sock.c | 5 ++++- net/ipv4/ip_output.c | 23 ++++++++++++++++++++++- net/ipv6/ip6_output.c | 23 ++++++++++++++++++++++- 5 files changed, 55 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 73902acf2b71..04f52e719571 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3c814565ed7c..1350901c5cb8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) diff --git a/net/core/sock.c b/net/core/sock.c index 6d7e189e3cd9..f5bb89785e47 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1018,7 +1018,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5dbec21856f4..6f843aff628c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + cork->length += length; /* So, what's going on in the loop below? @@ -1006,6 +1020,7 @@ alloc_new_skb: cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes. @@ -1068,7 +1083,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1098,6 +1113,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1105,11 +1124,13 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 827a3f5ff3bb..7df04d20a91f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk, { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1322,6 +1323,19 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1445,6 +1459,7 @@ alloc_new_skb: cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes @@ -1506,7 +1521,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1536,6 +1551,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1543,11 +1562,13 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3-59-g8ed1b From 52900d22288e7d45846037e1db277c665bbc40db Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:40 -0500 Subject: udp: elide zerocopy operation in hot path With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info. Release of its last reference triggers a completion notification. The TCP stack in tcp_sendmsg_locked holds an extra ref independent of the skbs, because it can build, send and free skbs within its loop, possibly reaching refcount zero and freeing the ubuf_info too soon. The UDP stack currently also takes this extra ref, but does not need it as all skbs are sent after return from __ip(6)_append_data. Avoid the extra refcount_inc and refcount_dec_and_test, and generally the sock_zerocopy_put in the common path, by passing the initial reference to the first skb. This approach is taken instead of initializing the refcount to 0, as that would generate error "refcount_t: increment on 0" on the next skb_zcopy_set. Changes v3 -> v4 - Move skb_zcopy_set below the only kfree_skb that might cause a premature uarg destroy before skb_zerocopy_put_abort - Move the entire skb_shinfo assignment block, to keep that cacheline access in one place Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++---- net/core/skbuff.c | 9 +++++---- net/ipv4/ip_output.c | 22 +++++++++++----------- net/ipv4/tcp.c | 2 +- net/ipv6/ip6_output.c | 22 +++++++++++----------- 5 files changed, 36 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04f52e719571..75d50ab7997c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -481,7 +481,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg) } void sock_zerocopy_put(struct ubuf_info *uarg); -void sock_zerocopy_put_abort(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); @@ -1326,10 +1326,14 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } -static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, + bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { - sock_zerocopy_get(uarg); + if (unlikely(have_ref && *have_ref)) + *have_ref = false; + else + sock_zerocopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; } @@ -1374,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb) struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, false); skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1350901c5cb8..c78ce114537e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6f843aff628c..78f028bdad30 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk, uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; + extra_uref = true; if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, &extra_uref); } } @@ -1015,13 +1016,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - skb_zcopy_set(skb, uarg); - /* * Find where to start putting bytes. */ @@ -1054,6 +1048,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1124,13 +1125,12 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); - sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 215e4d3b3616..dc68c408bba0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7df04d20a91f..ec8c235ea891 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1327,12 +1327,13 @@ emsgsize: uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; + extra_uref = true; if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, &extra_uref); } } @@ -1454,13 +1455,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - skb_zcopy_set(skb, uarg); - /* * Find where to start putting bytes */ @@ -1492,6 +1486,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1562,13 +1563,12 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); - sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3-59-g8ed1b From 875e8939953483d856de226b72d14c6a000f9457 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Dec 2018 08:15:10 +0000 Subject: skbuff: Rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark' Commit abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field") added the 'offload_mr_fwd_mark' field to indicate that a packet has already undergone L3 multicast routing by a capable device. The field is used to prevent the kernel from forwarding a packet through a netdev through which the device has already forwarded the packet. Currently, no unicast packet is routed by both the device and the kernel, but this is about to change by subsequent patches and we need to be able to mark such packets, so that they will no be forwarded twice. Instead of adding yet another field to 'struct sk_buff', we can just rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark', as a packet either has a multicast or a unicast destination IP. While at it, add a comment about both 'offload_fwd_mark' and 'offload_l3_fwd_mark'. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++----- include/linux/skbuff.h | 4 +++- net/core/skbuff.c | 2 +- net/ipv4/ipmr.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index c293ff1eed63..920085fbbf2a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3554,10 +3554,10 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port, return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); } -static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb, +static void mlxsw_sp_rx_listener_l3_mark_func(struct sk_buff *skb, u8 local_port, void *priv) { - skb->offload_mr_fwd_mark = 1; + skb->offload_l3_fwd_mark = 1; skb->offload_fwd_mark = 1; return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); } @@ -3605,8 +3605,8 @@ out: MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) -#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ - MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \ +#define MLXSW_SP_RXL_L3_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_listener_l3_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) #define MLXSW_SP_EVENTL(_func, _trap_id) \ @@ -3683,7 +3683,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false), MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false), MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), - MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), + MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), /* NVE traps */ MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, ARP, false), }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 75d50ab7997c..b1831a5ca173 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -616,6 +616,8 @@ typedef unsigned char *sk_buff_data_t; * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs + * @offload_fwd_mark: Packet was L2-forwarded in hardware + * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @tc_redirected: packet was redirected by a tc action @@ -799,7 +801,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; - __u8 offload_mr_fwd_mark:1; + __u8 offload_l3_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c78ce114537e..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..5cbc749a50aa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; -- cgit v1.2.3-59-g8ed1b From a74f0fa082b76c6a76cba5672f36218518bfdc09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Dec 2018 07:58:17 -0800 Subject: tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12 as a step to enable bigger tcp sndbuf limits. It works reasonably well, but the following happens : Once the limit is reached, TCP stack generates an [E]POLLOUT event for every incoming ACK packet. This causes a high number of context switches. This patch implements the strategy David Miller added in sock_def_write_space() : - If TCP socket has a notsent_lowat constraint of X bytes, allow sendmsg() to fill up to X bytes, but send [E]POLLOUT only if number of notsent bytes is below X/2 This considerably reduces TCP_NOTSENT_LOWAT overhead, while allowing to keep the pipe full. Tested: 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM A:/# cat /proc/sys/net/ipv4/tcp_wmem 4096 262144 64000000 A:/# super_netperf 100 -H B -l 1000 -- -K bbr & A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/ A:/# vmstat 5 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0 2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0 0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0 1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0 2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0 A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow A:/# vmstat 5 5 # check that context switches have not inflated too much. procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0 0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0 1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0 1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0 1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0 Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/sock.h | 20 +++++++++++++++----- include/net/tcp.h | 8 ++++++-- net/core/stream.c | 2 +- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index f665d74ae509..df390a3e23fe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1110,7 +1110,7 @@ struct proto { unsigned int inuse_idx; #endif - bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); @@ -1192,19 +1192,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ -static inline bool sk_stream_memory_free(const struct sock *sk) +static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (sk->sk_wmem_queued >= sk->sk_sndbuf) return false; return sk->sk_prot->stream_memory_free ? - sk->sk_prot->stream_memory_free(sk) : true; + sk->sk_prot->stream_memory_free(sk, wake) : true; } -static inline bool sk_stream_is_writeable(const struct sock *sk) +static inline bool sk_stream_memory_free(const struct sock *sk) +{ + return __sk_stream_memory_free(sk, 0); +} + +static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && - sk_stream_memory_free(sk); + __sk_stream_memory_free(sk, wake); +} + +static inline bool sk_stream_is_writeable(const struct sock *sk) +{ + return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, diff --git a/include/net/tcp.h b/include/net/tcp.h index 0681afc62354..e0a65c067662 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1870,12 +1870,16 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } -static inline bool tcp_stream_memory_free(const struct sock *sk) +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). + */ +static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = tp->write_seq - tp->snd_nxt; - return notsent_bytes < tcp_notsent_lowat(tp); + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } #ifdef CONFIG_PROC_FS diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); -- cgit v1.2.3-59-g8ed1b From 7a35a50df5a34e6eaee1b1f4f913b84879068fee Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 5 Dec 2018 20:02:29 -0800 Subject: neighbor: Add extack messages for add and delete commands Add extack messages for failures in neigh_add and neigh_delete. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..6d479b5562be 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1137,8 +1137,9 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { u8 old; int err; @@ -1155,8 +1156,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1193,8 +1196,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1307,6 +1312,12 @@ out: return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1678,8 +1689,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1707,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1726,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1744,8 +1758,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,16 +1771,21 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1780,8 +1801,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { @@ -1817,8 +1840,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); neigh_release(neigh); out: -- cgit v1.2.3-59-g8ed1b From 00f54e68924eaf075f3f24be18557899d347bc4a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:36 +0000 Subject: net: core: dev: Add extack argument to dev_open() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. One prominent API through which the notification is invoked is dev_open(). Therefore extend dev_open() with and extra extack argument and update all users. Most of the calls end up just encoding NULL, but bond and team drivers have the extack readily available. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c | 2 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 2 +- drivers/net/ethernet/sfc/ethtool.c | 2 +- drivers/net/ethernet/sfc/falcon/ethtool.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 4 ++-- drivers/net/net_failover.c | 8 ++++---- drivers/net/team/team.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_main.c | 2 +- drivers/s390/net/qeth_l2_main.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 2 +- drivers/staging/unisys/visornic/visornic_main.c | 2 +- include/linux/netdevice.h | 2 +- net/bluetooth/6lowpan.c | 2 +- net/core/dev.c | 5 +++-- net/core/netpoll.c | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6mr.c | 2 +- 23 files changed, 30 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 333387f1f1fe..6b34dbefa7dd 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1538,7 +1538,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, slave_dev->flags |= IFF_SLAVE; /* open the slave since the application closed it */ - res = dev_open(slave_dev); + res = dev_open(slave_dev, extack); if (res) { netdev_dbg(bond_dev, "Opening slave %s failed\n", slave_dev->name); goto err_restore_mac; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c index a5fd71692c8b..43b42615ad84 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c @@ -525,7 +525,7 @@ static int aq_set_ringparam(struct net_device *ndev, } } if (ndev_running) - err = dev_open(ndev); + err = dev_open(ndev, NULL); err_exit: return err; diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index f42f7a6e1559..ebd5c2cf1efe 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -241,7 +241,7 @@ static int enic_set_ringparam(struct net_device *netdev, } enic_init_vnic_resources(enic); if (running) { - err = dev_open(netdev); + err = dev_open(netdev, NULL); if (err) goto err_out; } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index 774beda040a1..8e9b95871d30 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -624,7 +624,7 @@ static void hns_nic_self_test(struct net_device *ndev, clear_bit(NIC_STATE_TESTING, &priv->state); if (if_running) - (void)dev_open(ndev); + (void)dev_open(ndev, NULL); } /* Online tests aren't run; pass by default */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 4563638367ac..e678b6939da3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -821,7 +821,7 @@ static int hns3_set_ringparam(struct net_device *ndev, } if (if_running) - ret = dev_open(ndev); + ret = dev_open(ndev, NULL); return ret; } diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 3143588ffd77..600d7b895cf2 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -539,7 +539,7 @@ static void efx_ethtool_self_test(struct net_device *net_dev, /* We need rx buffers and interrupts. */ already_up = (efx->net_dev->flags & IFF_UP); if (!already_up) { - rc = dev_open(efx->net_dev); + rc = dev_open(efx->net_dev, NULL); if (rc) { netif_err(efx, drv, efx->net_dev, "failed opening device.\n"); diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c index 1ccdb7a82e2a..72cedec945c1 100644 --- a/drivers/net/ethernet/sfc/falcon/ethtool.c +++ b/drivers/net/ethernet/sfc/falcon/ethtool.c @@ -517,7 +517,7 @@ static void ef4_ethtool_self_test(struct net_device *net_dev, /* We need rx buffers and interrupts. */ already_up = (efx->net_dev->flags & IFF_UP); if (!already_up) { - rc = dev_open(efx->net_dev); + rc = dev_open(efx->net_dev, NULL); if (rc) { netif_err(efx, drv, efx->net_dev, "failed opening device.\n"); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c728ed1375b2..d20496f0ebd0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4082,7 +4082,7 @@ static void stmmac_reset_subtask(struct stmmac_priv *priv) set_bit(STMMAC_DOWN, &priv->state); dev_close(priv->dev); - dev_open(priv->dev); + dev_open(priv->dev, NULL); clear_bit(STMMAC_DOWN, &priv->state); clear_bit(STMMAC_RESETING, &priv->state); rtnl_unlock(); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 85936ed9e952..c65620adab52 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -137,7 +137,7 @@ static int netvsc_open(struct net_device *net) * slave as up. If open fails, then slave will be * still be offline (and not used). */ - ret = dev_open(vf_netdev); + ret = dev_open(vf_netdev, NULL); if (ret) netdev_warn(net, "unable to open slave: %s: %d\n", @@ -2002,7 +2002,7 @@ static void __netvsc_vf_setup(struct net_device *ndev, netif_addr_unlock_bh(ndev); if (netif_running(ndev)) { - ret = dev_open(vf_netdev); + ret = dev_open(vf_netdev, NULL); if (ret) netdev_warn(vf_netdev, "unable to open: %d\n", ret); diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index e964d312f4ca..ed1166adaa2f 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -40,14 +40,14 @@ static int net_failover_open(struct net_device *dev) primary_dev = rtnl_dereference(nfo_info->primary_dev); if (primary_dev) { - err = dev_open(primary_dev); + err = dev_open(primary_dev, NULL); if (err) goto err_primary_open; } standby_dev = rtnl_dereference(nfo_info->standby_dev); if (standby_dev) { - err = dev_open(standby_dev); + err = dev_open(standby_dev, NULL); if (err) goto err_standby_open; } @@ -517,7 +517,7 @@ static int net_failover_slave_register(struct net_device *slave_dev, dev_hold(slave_dev); if (netif_running(failover_dev)) { - err = dev_open(slave_dev); + err = dev_open(slave_dev, NULL); if (err && (err != -EBUSY)) { netdev_err(failover_dev, "Opening slave %s failed err:%d\n", slave_dev->name, err); @@ -680,7 +680,7 @@ static int net_failover_slave_name_change(struct net_device *slave_dev, /* We need to bring up the slave after the rename by udev in case * open failed with EBUSY when it was registered. */ - dev_open(slave_dev); + dev_open(slave_dev, NULL); return 0; } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 364f514d56d8..93576e0240dd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1212,7 +1212,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev, goto err_port_enter; } - err = dev_open(port_dev); + err = dev_open(port_dev, extack); if (err) { netdev_dbg(dev, "Device %s opening failed\n", portname); diff --git a/drivers/net/wireless/intersil/hostap/hostap_main.c b/drivers/net/wireless/intersil/hostap/hostap_main.c index 012930d35434..b0e7c0a0617e 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_main.c +++ b/drivers/net/wireless/intersil/hostap/hostap_main.c @@ -690,7 +690,7 @@ static int prism2_open(struct net_device *dev) /* Master radio interface is needed for all operation, so open * it automatically when any virtual net_device is opened. */ local->master_dev_auto_open = 1; - dev_open(local->dev); + dev_open(local->dev, NULL); } netif_device_attach(dev); diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 2836231c1c5d..f108d4b44605 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1007,7 +1007,7 @@ static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode) qeth_l2_set_rx_mode(card->dev); } else { rtnl_lock(); - dev_open(card->dev); + dev_open(card->dev, NULL); rtnl_unlock(); } } diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index eca68da39d05..42a7cdc59b76 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2417,7 +2417,7 @@ static int __qeth_l3_set_online(struct ccwgroup_device *gdev, int recovery_mode) __qeth_l3_open(card->dev); qeth_l3_set_rx_mode(card->dev); } else { - dev_open(card->dev); + dev_open(card->dev, NULL); } rtnl_unlock(); } diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index 4fa37d6e598b..daabaceeea52 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -1172,7 +1172,7 @@ static int ethsw_open(struct ethsw_core *ethsw) for (i = 0; i < ethsw->sw_attr.num_ifs; i++) { port_priv = ethsw->ports[i]; - err = dev_open(port_priv->netdev); + err = dev_open(port_priv->netdev, NULL); if (err) { netdev_err(port_priv->netdev, "dev_open err %d\n", err); return err; diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c index 3647b8f1ed28..5eeb4b93b45b 100644 --- a/drivers/staging/unisys/visornic/visornic_main.c +++ b/drivers/staging/unisys/visornic/visornic_main.c @@ -2095,7 +2095,7 @@ static int visornic_resume(struct visor_device *dev, mod_timer(&devdata->irq_poll_timer, msecs_to_jiffies(2)); rtnl_lock(); - dev_open(netdev); + dev_open(netdev, NULL); rtnl_unlock(); complete_func(dev, 0); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94fb2e12f117..d79be3055f5f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2605,7 +2605,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); -int dev_open(struct net_device *dev); +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 828e87fe8027..9d79c7de234a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -607,7 +607,7 @@ static void ifup(struct net_device *netdev) int err; rtnl_lock(); - err = dev_open(netdev); + err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index 04a6b7100aac..b801c1aafd70 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1406,7 +1406,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,7 +1417,7 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..36a2b63ffd6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5cbc749a50aa..ea04e38f56e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -506,7 +506,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); } @@ -589,7 +589,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 045597b9a7c0..521e471f1cf9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2820,7 +2820,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) dev = __dev_get_by_name(net, p.name); if (!dev) goto err_exit; - err = dev_open(dev); + err = dev_open(dev, NULL); } } #endif diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e2ea691e42c6..8c63494400c4 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -655,7 +655,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) return NULL; } - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); -- cgit v1.2.3-59-g8ed1b From 567c5e13be5cc74d24f5eb54cf353c2e2277189b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:42 +0000 Subject: net: core: dev: Add extack argument to dev_change_flags() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. One prominent API through which the notification is invoked is dev_change_flags(). Therefore extend dev_change_flags() with and extra extack argument and update all users. Most of the calls end up just encoding NULL, but several sites (VLAN, ipvlan, VRF, rtnetlink) do have extack available. Since the function declaration line is changed anyway, name the other function arguments to placate checkpatch. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 +++--- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 12 ++++++++---- drivers/net/vrf.c | 4 ++-- include/linux/netdevice.h | 3 ++- net/8021q/vlan.c | 4 +++- net/core/dev.c | 4 +++- net/core/dev_ioctl.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 3 ++- net/ipv4/devinet.c | 2 +- net/ipv4/ipconfig.c | 6 +++--- net/openvswitch/vport-geneve.c | 2 +- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- 15 files changed, 33 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8710214594d8..6214d8c0d546 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -167,7 +167,7 @@ int ipoib_open(struct net_device *dev) if (flags & IFF_UP) continue; - dev_change_flags(cpriv->dev, flags | IFF_UP); + dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } @@ -207,7 +207,7 @@ static int ipoib_stop(struct net_device *dev) if (!(flags & IFF_UP)) continue; - dev_change_flags(cpriv->dev, flags & ~IFF_UP); + dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } @@ -1823,7 +1823,7 @@ static void ipoib_parent_unregister_pre(struct net_device *ndev) * running ensures the it will not add more work. */ rtnl_lock(); - dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); rtnl_unlock(); /* ipoib_event() cannot be running once this returns */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c65620adab52..18b5584d6377 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1993,7 +1993,7 @@ static void __netvsc_vf_setup(struct net_device *ndev, "unable to change mtu to %u\n", ndev->mtu); /* set multicast etc flags on VF */ - dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE); + dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL); /* sync address list from ndev to VF */ netif_addr_lock_bh(ndev); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 14f1cbd3b96f..c3d3e458f541 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -85,10 +85,12 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, flags = ipvlan->dev->flags; if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { err = dev_change_flags(ipvlan->dev, - flags | IFF_NOARP); + flags | IFF_NOARP, + extack); } else { err = dev_change_flags(ipvlan->dev, - flags & ~IFF_NOARP); + flags & ~IFF_NOARP, + extack); } if (unlikely(err)) goto fail; @@ -117,9 +119,11 @@ fail: flags = ipvlan->dev->flags; if (port->mode == IPVLAN_MODE_L3 || port->mode == IPVLAN_MODE_L3S) - dev_change_flags(ipvlan->dev, flags | IFF_NOARP); + dev_change_flags(ipvlan->dev, flags | IFF_NOARP, + NULL); else - dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP); + dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, + NULL); } return err; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 1e9f2dc0de07..95909e262ba4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -756,9 +756,9 @@ static void cycle_netdev(struct net_device *dev, if (!netif_running(dev)) return; - ret = dev_change_flags(dev, flags & ~IFF_UP); + ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) - ret = dev_change_flags(dev, flags); + ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79be3055f5f..18cf464450ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3612,7 +3612,8 @@ int dev_ifconf(struct net *net, struct ifconf *, int); int dev_ethtool(struct net *net, struct ifreq *); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *, unsigned int flags); -int dev_change_flags(struct net_device *, unsigned int); +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); int dev_change_name(struct net_device *, const char *); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index aef1a977279c..dc4411165e43 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -358,6 +358,7 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; @@ -460,7 +461,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs | IFF_UP); + dev_change_flags(vlandev, flgs | IFF_UP, + extack); netif_stacked_transfer_operstate(dev, vlandev); } break; diff --git a/net/core/dev.c b/net/core/dev.c index b801c1aafd70..8bba6f98b545 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7595,11 +7595,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..da273ec3cc57 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 98876cd1e36c..4c9e4e187600 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a34602ae27de..5b9b6d497f71 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1100,7 +1100,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr->ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4c..55757764c381 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) continue; - if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); } @@ -238,7 +238,7 @@ static int __init ic_open_devs(void) if (ic_proto_enabled && !able) continue; oflags = dev->flags; - if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) { pr_err("IP-Config: Failed to open %s\n", dev->name); continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void) dev = d->dev; if (d != ic_dev && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); - dev_change_flags(dev, d->flags); + dev_change_flags(dev, d->flags, NULL); } kfree(d); } diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 5aaf3babfc3f..acb6077b7478 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 0e72d95b0e8f..c38a62464b85 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -68,7 +68,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 7e6301b2ec4d..8f16f11f7ad3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -131,7 +131,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); -- cgit v1.2.3-59-g8ed1b From 6d0403216d030e5623de3911168fceeaac2e14d6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:43 +0000 Subject: net: core: dev: Add extack argument to __dev_change_flags() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. The last missing API is __dev_change_flags(). Therefore extend __dev_change_flags() with and extra extack argument and update the two existing users. Since the function declaration line is changed anyway, name the struct net_device argument to placate checkpatch. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 5 +++-- net/core/rtnetlink.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 18cf464450ee..fc6ba71513be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3611,7 +3611,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, int dev_ifconf(struct net *net, struct ifconf *, int); int dev_ethtool(struct net *net, struct ifreq *); unsigned int dev_get_flags(const struct net_device *); -int __dev_change_flags(struct net_device *, unsigned int flags); +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, diff --git a/net/core/dev.c b/net/core/dev.c index 8bba6f98b545..b37e320def13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7498,7 +7498,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7606,7 +7607,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4c9e4e187600..91a0f7477f8e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2871,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } -- cgit v1.2.3-59-g8ed1b From 263726053400b9c6671df8e87d3db9728199da13 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:45 +0000 Subject: net: core: dev: Add call_netdevice_notifiers_extack() In order to propagate extack through NETDEV_PRE_UP, add a new function call_netdevice_notifiers_extack() that primes the extack field of the notifier info. Convert call_netdevice_notifiers() to a simple wrapper around the new function that passes NULL for extack. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b37e320def13..4b033af8e6cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -162,6 +162,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1734,6 +1737,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1745,11 +1760,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); -- cgit v1.2.3-59-g8ed1b From 40c900aa1ff580afe941ff77f327f004546f0ce7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:47 +0000 Subject: net: core: dev: Attach extack to NETDEV_PRE_UP Drivers may need to validate configuration of a device that's about to be upped. Should the validation fail, there's currently no way to communicate details of the failure to the user, beyond an error number. To mend that, change __dev_open() to take an extack argument and pass it from __dev_change_flags() and dev_open(), where it was propagated in the previous patches. Change __dev_open() to call call_netdevice_notifiers_extack() so that the passed-in extack is attached to the NETDEV_PRE_UP notifier. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4b033af8e6cd..068b60db35ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1364,7 +1364,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1380,7 +1380,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1427,7 +1427,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -7547,7 +7547,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { -- cgit v1.2.3-59-g8ed1b From 58956317c8de52009d1a38a721474c24aef74fe7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 7 Dec 2018 12:24:57 -0800 Subject: neighbor: Improve garbage collection The existing garbage collection algorithm has a number of problems: 1. The gc algorithm will not evict PERMANENT entries as those entries are managed by userspace, yet the existing algorithm walks the entire hash table which means it always considers PERMANENT entries when looking for entries to evict. In some use cases (e.g., EVPN) there can be tens of thousands of PERMANENT entries leading to wasted CPU cycles when gc kicks in. As an example, with 32k permanent entries, neigh_alloc has been observed taking more than 4 msec per invocation. 2. Currently, when the number of neighbor entries hits gc_thresh2 and the last flush for the table was more than 5 seconds ago gc kicks in walks the entire hash table evicting *all* entries not in PERMANENT or REACHABLE state and not marked as externally learned. There is no discriminator on when the neigh entry was created or if it just moved from REACHABLE to another NUD_VALID state (e.g., NUD_STALE). It is possible for entries to be created or for established neighbor entries to be moved to STALE (e.g., an external node sends an ARP request) right before the 5 second window lapses: -----|---------x|----------|----- t-5 t t+5 If that happens those entries are evicted during gc causing unnecessary thrashing on neighbor entries and userspace caches trying to track them. Further, this contradicts the description of gc_thresh2 which says "Entries older than 5 seconds will be cleared". One workaround is to make gc_thresh2 == gc_thresh3 but that negates the whole point of having separate thresholds. 3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries when gc_thresh2 is exceeded is over kill and contributes to trashing especially during startup. This patch addresses these problems as follows: 1. Use of a separate list_head to track entries that can be garbage collected along with a separate counter. PERMANENT entries are not added to this list. The gc_thresh parameters are only compared to the new counter, not the total entries in the table. The forced_gc function is updated to only walk this new gc_list looking for entries to evict. 2. Entries are added to the list head at the tail and removed from the front. 3. Entries are only evicted if they were last updated more than 5 seconds ago, adhering to the original intent of gc_thresh2. 4. Forced gc is stopped once the number of gc_entries drops below gc_thresh2. 5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped when allocating a new neighbor for a PERMANENT entry. By extension this means there are no explicit limits on the number of PERMANENT entries that can be created, but this is no different than FIB entries or FDB entries. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 +- include/net/neighbour.h | 3 + net/core/neighbour.c | 119 +++++++++++++++++++++++---------- 3 files changed, 90 insertions(+), 36 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index af2a69439b93..acdfb5d2bcaa 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -108,8 +108,8 @@ neigh/default/gc_thresh2 - INTEGER Default: 512 neigh/default/gc_thresh3 - INTEGER - Maximum number of neighbor entries allowed. Increase this - when using large numbers of interfaces and when communicating + Maximum number of non-PERMANENT neighbor entries allowed. Increase + this when using large numbers of interfaces and when communicating with large numbers of directly-connected peers. Default: 1024 diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f58b384aa6c9..6c13072910ab 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -154,6 +154,7 @@ struct neighbour { struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; + struct list_head gc_list; struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; @@ -214,6 +215,8 @@ struct neigh_table { struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; + atomic_t gc_entries; + struct list_head gc_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6d479b5562be..c3b58712e98b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_change_state(struct neighbour *n, u8 new) +{ + bool on_gc_list = !list_empty(&n->gc_list); + bool new_is_perm = new & NUD_PERMANENT; + + n->nud_state = new; + + /* remove from the gc list if new state is permanent; + * add to the gc list if new state is not permanent + */ + if (new_is_perm && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!new_is_perm && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +} static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) @@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + u8 flags = NTF_EXT_LEARNED; + struct neighbour *n, *tmp; + u8 state = NUD_PERMANENT; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if (!(n->nud_state & state) && !(n->flags & flags) && + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool permanent) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (permanent) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + else + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!permanent) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool permanent, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -591,6 +633,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1167,7 +1215,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + neigh_change_state(neigh, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1246,7 +1294,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + neigh_change_state(neigh, new); notify = 1; } @@ -1582,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1813,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_state & NUD_PERMANENT, + true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -2654,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); -- cgit v1.2.3-59-g8ed1b From 0fbe82e628c817e292ff588cd5847fc935e025f2 Mon Sep 17 00:00:00 2001 From: yupeng Date: Wed, 5 Dec 2018 18:56:28 -0800 Subject: net: call sk_dst_reset when set SO_DONTROUTE after set SO_DONTROUTE to 1, the IP layer should not route packets if the dest IP address is not in link scope. But if the socket has cached the dst_entry, such packets would be routed until the sk_dst_cache expires. So we should clean the sk_dst_cache when a user set SO_DONTROUTE option. Below are server/client python scripts which could reprodue this issue: server side code: ========================================================================== import socket import struct import time s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(('0.0.0.0', 9000)) s.listen(1) sock, addr = s.accept() sock.setsockopt(socket.SOL_SOCKET, socket.SO_DONTROUTE, struct.pack('i', 1)) while True: sock.send(b'foo') time.sleep(1) ========================================================================== client side code: ========================================================================== import socket import time s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect(('server_address', 9000)) while True: data = s.recv(1024) print(data) ========================================================================== Signed-off-by: yupeng Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index f5bb89785e47..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -700,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); -- cgit v1.2.3-59-g8ed1b From 8cc196d6ef86bbab01dbf16f6c596cf56aa0839e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 10 Dec 2018 13:54:07 -0800 Subject: neighbor: gc_list changes should be protected by table lock Adding and removing neighbor entries to / from the gc_list need to be done while holding the table lock; a couple of places were missed in the original patch. Move the list_add_tail in neigh_alloc to ___neigh_create where the lock is already obtained. Since neighbor entries should rarely be moved to/from PERMANENT state, add lock/unlock around the gc_list changes in neigh_change_state rather than extending the lock hold around all neighbor updates. Fixes: 58956317c8de ("neighbor: Improve garbage collection") Reported-by: Andrei Vagin Reported-by: syzbot+6cc2fd1d3bdd2e007363@syzkaller.appspotmail.com Reported-by: syzbot+35e87b87c00f386b041f@syzkaller.appspotmail.com Reported-by: syzbot+b354d1fb59091ea73c37@syzkaller.appspotmail.com Reported-by: syzbot+3ddead5619658537909b@syzkaller.appspotmail.com Reported-by: syzbot+424d47d5c456ce8b2bbe@syzkaller.appspotmail.com Reported-by: syzbot+e4d42eb35f6a27b0a628@syzkaller.appspotmail.com Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c3b58712e98b..03fdc5ae66b0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -138,11 +138,17 @@ static void neigh_change_state(struct neighbour *n, u8 new) * add to the gc list if new state is not permanent */ if (new_is_perm && on_gc_list) { + write_lock_bh(&n->tbl->lock); list_del_init(&n->gc_list); + write_unlock_bh(&n->tbl->lock); + atomic_dec(&n->tbl->gc_entries); } else if (!new_is_perm && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ + write_lock_bh(&n->tbl->lock); list_add_tail(&n->gc_list, &n->tbl->gc_list); + write_unlock_bh(&n->tbl->lock); + atomic_inc(&n->tbl->gc_entries); } } @@ -390,11 +396,7 @@ do_alloc: n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; - - if (!permanent) - list_add_tail(&n->gc_list, &n->tbl->gc_list); - else - INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->gc_list); atomic_inc(&tbl->entries); out: @@ -616,6 +618,9 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, } n->dead = 0; + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, -- cgit v1.2.3-59-g8ed1b From 2fd527b72bb6f95dfe8a1902e998cb76390c431e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 12 Dec 2018 17:02:48 +0000 Subject: net: ndo_bridge_setlink: Add extack Drivers may not be able to implement a VLAN addition or reconfiguration. In those cases it's desirable to explain to the user that it was rejected (and why). To that end, add extack argument to ndo_bridge_setlink. Adapt all users to that change. Following patches will use the new argument in the bridge driver. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 +++- drivers/net/ethernet/intel/ice/ice_main.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- include/linux/netdevice.h | 5 +++-- net/bridge/br_netlink.c | 3 ++- net/bridge/br_private.h | 3 ++- net/core/rtnetlink.c | 6 ++++-- 9 files changed, 20 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f4f29939ba72..8a2e9cdd38ee 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9618,7 +9618,7 @@ static int bnxt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, } static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 flags) + u16 flags, struct netlink_ext_ack *extack) { struct bnxt *bp = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 245abf0d19c0..852f5bfe5f6d 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4955,7 +4955,7 @@ fw_exit: } static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 flags) + u16 flags, struct netlink_ext_ack *extack) { struct be_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6d5b13f69dec..fbb21ac06c98 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11685,6 +11685,7 @@ static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], * @dev: the netdev being configured * @nlh: RTNL message * @flags: bridge flags + * @extack: netlink extended ack * * Inserts a new hardware bridge if not already created and * enables the bridging mode requested (VEB or VEPA). If the @@ -11697,7 +11698,8 @@ static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], **/ static int i40e_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 flags) + u16 flags, + struct netlink_ext_ack *extack) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e45e57499d91..f9f0d470412b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3624,6 +3624,7 @@ static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) * @dev: the netdev being configured * @nlh: RTNL message * @flags: bridge setlink flags + * @extack: netlink extended ack * * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is * hooked up to. Iterates through the PF VSI list and sets the loopback mode (if @@ -3632,7 +3633,7 @@ static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) */ static int ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 __always_unused flags) + u16 __always_unused flags, struct netlink_ext_ack *extack) { struct ice_netdev_priv *np = netdev_priv(dev); struct ice_pf *pf = np->vsi->back; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 49a4ea38eb07..f1e40734c975 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9979,7 +9979,8 @@ static int ixgbe_configure_bridge_mode(struct ixgbe_adapter *adapter, } static int ixgbe_ndo_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) + struct nlmsghdr *nlh, u16 flags, + struct netlink_ext_ack *extack) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fc6ba71513be..36ca5f50f822 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * entries to skb and update idx with the number of entries. * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, - * u16 flags) + * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask, * int nlflags) @@ -1390,7 +1390,8 @@ struct net_device_ops { int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, - u16 flags); + u16 flags, + struct netlink_ext_ack *extack); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ff2c10d47529..f9be70b26091 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -850,7 +850,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) } /* Change state and parameters on port. */ -int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) +int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, + struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5719b4d3e466..090dfacdc438 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1138,7 +1138,8 @@ int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); -int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); +int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, + struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c9c0407a7ee0..3b6e551f9e69 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4332,7 +4332,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -4344,7 +4345,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, - flags); + flags, + extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; -- cgit v1.2.3-59-g8ed1b From 3a37a9636cf3a1af2621a33f7eef8a2a3da81030 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:30 +0000 Subject: net: dev: Add extack argument to dev_set_mac_address() A follow-up patch will add a notifier type NETDEV_PRE_CHANGEADDR, which allows vetoing of MAC address changes. One prominent path to that notification is through dev_set_mac_address(). Therefore give this function an extack argument, so that it can be packed together with the notification. Thus a textual reason for rejection (or a warning) can be communicated back to the user. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/bonding/bond_alb.c | 9 +++++---- drivers/net/bonding/bond_main.c | 15 ++++++++------- drivers/net/hyperv/netvsc_drv.c | 4 ++-- drivers/net/macvlan.c | 4 ++-- drivers/net/tap.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/tun.c | 2 +- drivers/usb/gadget/function/u_ether.c | 2 +- include/linux/netdevice.h | 3 ++- net/core/dev.c | 4 +++- net/core/dev_ioctl.c | 2 +- net/core/rtnetlink.c | 2 +- net/ieee802154/nl-phy.c | 2 +- 13 files changed, 29 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index e82108c917a6..9431127bbc60 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1031,7 +1031,7 @@ static int alb_set_slave_mac_addr(struct slave *slave, u8 addr[], */ memcpy(ss.__data, addr, len); ss.ss_family = dev->type; - if (dev_set_mac_address(dev, (struct sockaddr *)&ss)) { + if (dev_set_mac_address(dev, (struct sockaddr *)&ss, NULL)) { netdev_err(slave->bond->dev, "dev_set_mac_address of dev %s failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open\n", dev->name); return -EOPNOTSUPP; @@ -1250,7 +1250,7 @@ static int alb_set_mac_address(struct bonding *bond, void *addr) bond_hw_addr_copy(tmp_addr, slave->dev->dev_addr, slave->dev->addr_len); - res = dev_set_mac_address(slave->dev, addr); + res = dev_set_mac_address(slave->dev, addr, NULL); /* restore net_device's hw address */ bond_hw_addr_copy(slave->dev->dev_addr, tmp_addr, @@ -1273,7 +1273,7 @@ unwind: bond_hw_addr_copy(tmp_addr, rollback_slave->dev->dev_addr, rollback_slave->dev->addr_len); dev_set_mac_address(rollback_slave->dev, - (struct sockaddr *)&ss); + (struct sockaddr *)&ss, NULL); bond_hw_addr_copy(rollback_slave->dev->dev_addr, tmp_addr, rollback_slave->dev->addr_len); } @@ -1732,7 +1732,8 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave bond->dev->addr_len); ss.ss_family = bond->dev->type; /* we don't care if it can't change its mac, best effort */ - dev_set_mac_address(new_slave->dev, (struct sockaddr *)&ss); + dev_set_mac_address(new_slave->dev, (struct sockaddr *)&ss, + NULL); bond_hw_addr_copy(new_slave->dev->dev_addr, tmp_addr, new_slave->dev->addr_len); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6b34dbefa7dd..06039be63034 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -680,7 +680,7 @@ static void bond_do_fail_over_mac(struct bonding *bond, } rv = dev_set_mac_address(new_active->dev, - (struct sockaddr *)&ss); + (struct sockaddr *)&ss, NULL); if (rv) { netdev_err(bond->dev, "Error %d setting MAC of slave %s\n", -rv, new_active->dev->name); @@ -695,7 +695,7 @@ static void bond_do_fail_over_mac(struct bonding *bond, ss.ss_family = old_active->dev->type; rv = dev_set_mac_address(old_active->dev, - (struct sockaddr *)&ss); + (struct sockaddr *)&ss, NULL); if (rv) netdev_err(bond->dev, "Error %d setting MAC of slave %s\n", -rv, new_active->dev->name); @@ -1527,7 +1527,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, */ memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); ss.ss_family = slave_dev->type; - res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); + res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, + extack); if (res) { netdev_dbg(bond_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; @@ -1818,7 +1819,7 @@ err_restore_mac: bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, new_slave->dev->addr_len); ss.ss_family = slave_dev->type; - dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); + dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); } err_restore_mtu: @@ -1999,7 +2000,7 @@ static int __bond_release_one(struct net_device *bond_dev, bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, slave->dev->addr_len); ss.ss_family = slave_dev->type; - dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); + dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); } if (unregister) @@ -3732,7 +3733,7 @@ static int bond_set_mac_address(struct net_device *bond_dev, void *addr) bond_for_each_slave(bond, slave, iter) { netdev_dbg(bond_dev, "slave %p %s\n", slave, slave->dev->name); - res = dev_set_mac_address(slave->dev, addr); + res = dev_set_mac_address(slave->dev, addr, NULL); if (res) { /* TODO: consider downing the slave * and retry ? @@ -3761,7 +3762,7 @@ unwind: break; tmp_res = dev_set_mac_address(rollback_slave->dev, - (struct sockaddr *)&tmp_ss); + (struct sockaddr *)&tmp_ss, NULL); if (tmp_res) { netdev_dbg(bond_dev, "unwind err %d dev %s\n", tmp_res, rollback_slave->dev->name); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 18b5584d6377..91ed15ea5883 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1247,7 +1247,7 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) return -ENODEV; if (vf_netdev) { - err = dev_set_mac_address(vf_netdev, addr); + err = dev_set_mac_address(vf_netdev, addr, NULL); if (err) return err; } @@ -1258,7 +1258,7 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) } else if (vf_netdev) { /* rollback change on VF */ memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN); - dev_set_mac_address(vf_netdev, addr); + dev_set_mac_address(vf_netdev, addr, NULL); } return err; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 0da3d36b283b..fc726ce4c164 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -744,7 +744,7 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); - return dev_set_mac_address(vlan->lowerdev, addr); + return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->sa_data)) @@ -1213,7 +1213,7 @@ static void macvlan_port_destroy(struct net_device *dev) sa.sa_family = port->dev->type; memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); - dev_set_mac_address(port->dev, &sa); + dev_set_mac_address(port->dev, &sa, NULL); } kfree(port); diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f03004f37eca..443b2694130c 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1113,7 +1113,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, rtnl_unlock(); return -ENOLINK; } - ret = dev_set_mac_address(tap->dev, &sa); + ret = dev_set_mac_address(tap->dev, &sa, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 93576e0240dd..afd9d25d1992 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -59,7 +59,7 @@ static int __set_port_dev_addr(struct net_device *port_dev, memcpy(addr.__data, dev_addr, port_dev->addr_len); addr.ss_family = port_dev->type; - return dev_set_mac_address(port_dev, (struct sockaddr *)&addr); + return dev_set_mac_address(port_dev, (struct sockaddr *)&addr, NULL); } static int team_port_set_orig_dev_addr(struct team_port *port) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ea528248d7d0..72577aa35b06 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3202,7 +3202,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n", ifr.ifr_hwaddr.sa_data); - ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); + ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 0f026d445e31..737bd77a575d 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -879,7 +879,7 @@ int gether_register_netdev(struct net_device *net) sa.sa_family = net->type; memcpy(sa.sa_data, dev->dev_mac, ETH_ALEN); rtnl_lock(); - status = dev_set_mac_address(net, &sa); + status = dev_set_mac_address(net, &sa, NULL); rtnl_unlock(); if (status) pr_warn("cannot set self ethernet address: %d\n", status); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 36ca5f50f822..d89875ec21ac 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3628,7 +3628,8 @@ int dev_set_mtu_ext(struct net_device *dev, int mtu, int dev_set_mtu(struct net_device *, int); int dev_change_tx_queue_len(struct net_device *, unsigned long); void dev_set_group(struct net_device *, int); -int dev_set_mac_address(struct net_device *, struct sockaddr *); +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); diff --git a/net/core/dev.c b/net/core/dev.c index 754284873355..7250a3a73fa4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7759,10 +7759,12 @@ EXPORT_SYMBOL(dev_set_group); * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address + * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index da273ec3cc57..31380fd5a4e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3b6e551f9e69..f8bdb8adab2c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa, extack); kfree(sa); if (err) goto errout; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index b231e40f006a..0c25c0bcc4da 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -242,7 +242,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); - rc = dev_set_mac_address(dev, &addr); + rc = dev_set_mac_address(dev, &addr, NULL); rtnl_unlock(); if (rc) goto dev_unregister; -- cgit v1.2.3-59-g8ed1b From 1570415f0810fce085066fb39827397452c3965a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:33 +0000 Subject: net: dev: Add NETDEV_PRE_CHANGEADDR The NETDEV_CHANGEADDR notification is emitted after a device address changes. Extending this message to allow vetoing is certainly possible, but several other notification types have instead adopted a simple two-stage approach: first a "pre" notification is sent to make sure all interested parties are OK with a change that's about to be done. Then the change is done, and afterwards a "post" notification is sent. This dual approach is easier to use: when the change is vetoed, nothing has changed yet, and it's therefore unnecessary to roll anything back. Therefore adopt it for NETDEV_CHANGEADDR as well. To that end, add NETDEV_PRE_CHANGEADDR and an info structure to go along with it. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++++- net/core/dev.c | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d89875ec21ac..1d5ad053ccf7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2450,7 +2450,8 @@ enum netdev_cmd { NETDEV_REGISTER, NETDEV_UNREGISTER, NETDEV_CHANGEMTU, /* notify after mtu change happened */ - NETDEV_CHANGEADDR, + NETDEV_CHANGEADDR, /* notify after the address change */ + NETDEV_PRE_CHANGEADDR, /* notify before the address change */ NETDEV_GOING_DOWN, NETDEV_CHANGENAME, NETDEV_FEAT_CHANGE, @@ -2512,6 +2513,11 @@ struct netdev_notifier_changelowerstate_info { void *lower_state_info; /* is lower dev state */ }; +struct netdev_notifier_pre_changeaddr_info { + struct netdev_notifier_info info; /* must be first */ + const unsigned char *dev_addr; +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/net/core/dev.c b/net/core/dev.c index 7250a3a73fa4..01497b7d1bdf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1589,6 +1589,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) } #undef N return "UNKNOWN_NETDEV_EVENT"; -- cgit v1.2.3-59-g8ed1b From d59cdf9475ad84d1f57cab1d162cf289702cfb15 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:35 +0000 Subject: net: dev: Issue NETDEV_PRE_CHANGEADDR When a device address is about to be changed, or an address added to the list of device HW addresses, it is necessary to ensure that all interested parties can support the address. Therefore, send the NETDEV_PRE_CHANGEADDR notification, and if anyone bails on it, do not change the address. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 24 ++++++++++++++++++++++++ net/core/dev_addr_lists.c | 3 +++ 3 files changed, 29 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d5ad053ccf7..811632d4d8b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3634,6 +3634,8 @@ int dev_set_mtu_ext(struct net_device *dev, int mtu, int dev_set_mtu(struct net_device *, int); int dev_change_tx_queue_len(struct net_device *, unsigned long); void dev_set_group(struct net_device *, int); +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_change_carrier(struct net_device *, bool new_carrier); diff --git a/net/core/dev.c b/net/core/dev.c index 01497b7d1bdf..ed9aa4a91f1f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7756,6 +7756,27 @@ void dev_set_group(struct net_device *dev, int new_group) } EXPORT_SYMBOL(dev_set_group); +/** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + /** * dev_set_mac_address - Change Media Access Control Address * @dev: device @@ -7776,6 +7797,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 81a8cd4ea3bd..a6723b306717 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -498,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -- cgit v1.2.3-59-g8ed1b From 9c29a2f55ec05cc8b525ee3b2d75d3cd37911123 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:21 -0700 Subject: neighbor: Fix locking order for gc_list changes Lock checker noted an inverted lock order between neigh_change_state (neighbor lock then table lock) and neigh_periodic_work (table lock and then neighbor lock) resulting in: [ 121.057652] ====================================================== [ 121.058740] WARNING: possible circular locking dependency detected [ 121.059861] 4.20.0-rc6+ #43 Not tainted [ 121.060546] ------------------------------------------------------ [ 121.061630] kworker/0:2/65 is trying to acquire lock: [ 121.062519] (____ptrval____) (&n->lock){++--}, at: neigh_periodic_work+0x237/0x324 [ 121.063894] [ 121.063894] but task is already holding lock: [ 121.064920] (____ptrval____) (&tbl->lock){+.-.}, at: neigh_periodic_work+0x194/0x324 [ 121.066274] [ 121.066274] which lock already depends on the new lock. [ 121.066274] [ 121.067693] [ 121.067693] the existing dependency chain (in reverse order) is: ... Fix by renaming neigh_change_state to neigh_update_gc_list, changing it to only manage whether an entry should be on the gc_list and taking locks in the same order as neigh_periodic_work. Invoke at the end of neigh_update only if diff between old or new states has the PERMANENT flag set. Fixes: 8cc196d6ef86 ("neighbor: gc_list changes should be protected by table lock") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 03fdc5ae66b0..010784123bc1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -127,30 +127,30 @@ static void neigh_mark_dead(struct neighbour *n) } } -static void neigh_change_state(struct neighbour *n, u8 new) +static void neigh_update_gc_list(struct neighbour *n) { - bool on_gc_list = !list_empty(&n->gc_list); - bool new_is_perm = new & NUD_PERMANENT; + bool on_gc_list, new_is_perm; - n->nud_state = new; + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); /* remove from the gc list if new state is permanent; * add to the gc list if new state is not permanent */ + new_is_perm = n->nud_state & NUD_PERMANENT; + on_gc_list = !list_empty(&n->gc_list); + if (new_is_perm && on_gc_list) { - write_lock_bh(&n->tbl->lock); list_del_init(&n->gc_list); - write_unlock_bh(&n->tbl->lock); - atomic_dec(&n->tbl->gc_entries); } else if (!new_is_perm && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ - write_lock_bh(&n->tbl->lock); list_add_tail(&n->gc_list, &n->tbl->gc_list); - write_unlock_bh(&n->tbl->lock); - atomic_inc(&n->tbl->gc_entries); } + + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); } static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, @@ -1220,7 +1220,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh_change_state(neigh, new); + neigh->nud_state = new; err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1299,7 +1299,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh_change_state(neigh, new); + neigh->nud_state = new; notify = 1; } @@ -1360,6 +1360,9 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); + if ((new ^ old) & NUD_PERMANENT) + neigh_update_gc_list(neigh); + if (notify) neigh_update_notify(neigh, nlmsg_pid); -- cgit v1.2.3-59-g8ed1b From 758a7f0b32ab890831d321145c3fe72bb85c0350 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:22 -0700 Subject: neighbor: Fix state check in neigh_forced_gc PERMANENT entries are not on the gc_list so the state check is now redundant. Also, the move to not purge entries until after 5 seconds should not apply to FAILED entries; those can be removed immediately to make way for newer ones. This restores the previous logic prior to the gc_list. Fixes: 58956317c8de ("neighbor: Improve garbage collection") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 010784123bc1..acaa1a64150d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -204,7 +204,6 @@ static int neigh_forced_gc(struct neigh_table *tbl) unsigned long tref = jiffies - 5 * HZ; u8 flags = NTF_EXT_LEARNED; struct neighbour *n, *tmp; - u8 state = NUD_PERMANENT; int shrunk = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); @@ -216,8 +215,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) bool remove = false; write_lock(&n->lock); - if (!(n->nud_state & state) && !(n->flags & flags) && - time_after(tref, n->updated)) + if ((n->nud_state == NUD_FAILED) || + (!(n->flags & flags) && time_after(tref, n->updated))) remove = true; write_unlock(&n->lock); -- cgit v1.2.3-59-g8ed1b From 7e6f182bec7debb420a2c12ae0ea1813645a7ac4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:23 -0700 Subject: neighbor: Remove state and flags arguments to neigh_del neigh_del now only has 1 caller, and the state and flags arguments are both 0. Remove them and simplify neigh_del. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index acaa1a64150d..bb6f9ca7a3ce 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,14 +153,13 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } -static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, - struct neighbour __rcu **np, struct neigh_table *tbl) +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && - !(n->flags & flags)) { + if (refcount_read(&n->refcnt) == 1) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, @@ -192,7 +191,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, 0, np, tbl); + return neigh_del(n, np, tbl); np = &n->next; } return false; -- cgit v1.2.3-59-g8ed1b From 526f1b587cf826d78c3e522428ce6b24a8da0d65 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:24 -0700 Subject: neighbor: Move neigh_update_ext_learned to core file neigh_update_ext_learned has one caller in neighbour.c so does not need to be defined in the header. Move it and in the process remove the intialization of ndm_flags and just set it based on the flags check. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 18 ------------------ net/core/neighbour.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f886b58956a6..ef0a60448a96 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -549,24 +549,6 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, } while (read_seqretry(&n->ha_lock, seq)); } -static inline void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, - int *notify) -{ - u8 ndm_flags = 0; - - if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return; - - ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; - if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { - if (ndm_flags & NTF_EXT_LEARNED) - neigh->flags |= NTF_EXT_LEARNED; - else - neigh->flags &= ~NTF_EXT_LEARNED; - *notify = 1; - } -} - static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bb6f9ca7a3ce..2401040f799b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,6 +153,24 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } +static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + u8 ndm_flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + *notify = 1; + } +} + static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, struct neigh_table *tbl) { -- cgit v1.2.3-59-g8ed1b From e997f8a20a57cae16ed0c7a2bff6d3ab75f58123 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:25 -0700 Subject: neighbor: Remove externally learned entries from gc_list Externally learned entries are similar to PERMANENT entries in the sense they are managed by userspace and can not be garbage collected. As such remove them from the gc_list, remove the flags check from neigh_forced_gc and skip threshold checks in neigh_alloc. As with PERMANENT entries, this allows unlimited number of NTF_EXT_LEARNED entries. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2401040f799b..42b413774370 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -129,21 +129,22 @@ static void neigh_mark_dead(struct neighbour *n) static void neigh_update_gc_list(struct neighbour *n) { - bool on_gc_list, new_is_perm; + bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); - /* remove from the gc list if new state is permanent; - * add to the gc list if new state is not permanent + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list */ - new_is_perm = n->nud_state & NUD_PERMANENT; + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; on_gc_list = !list_empty(&n->gc_list); - if (new_is_perm && on_gc_list) { + if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); - } else if (!new_is_perm && !on_gc_list) { + } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); @@ -153,13 +154,14 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } -static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, +static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, int *notify) { + bool rc = false; u8 ndm_flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return; + return rc; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { @@ -167,8 +169,11 @@ static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; + rc = true; *notify = 1; } + + return rc; } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, @@ -219,7 +224,6 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; unsigned long tref = jiffies - 5 * HZ; - u8 flags = NTF_EXT_LEARNED; struct neighbour *n, *tmp; int shrunk = 0; @@ -233,7 +237,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || - (!(n->flags & flags) && time_after(tref, n->updated))) + time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); @@ -371,13 +375,13 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool permanent) + bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - if (permanent) + if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; @@ -419,7 +423,7 @@ out: return n; out_entries: - if (!permanent) + if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } @@ -566,9 +570,9 @@ EXPORT_SYMBOL(neigh_lookup_nodev); static struct neighbour *___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, - bool permanent, bool want_ref) + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); u32 hash_val; unsigned int key_len = tbl->key_len; int error; @@ -634,7 +638,7 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, } n->dead = 0; - if (!permanent) + if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (want_ref) @@ -1210,6 +1214,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { + bool ext_learn_change = false; u8 old; int err; int notify = 0; @@ -1230,7 +1235,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, goto out; } - neigh_update_ext_learned(neigh, flags, ¬ify); + ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1376,7 +1381,7 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); - if ((new ^ old) & NUD_PERMANENT) + if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) neigh_update_gc_list(neigh); if (notify) @@ -1881,14 +1886,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { + bool exempt_from_gc; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } - neigh = ___neigh_create(tbl, dst, dev, - ndm->ndm_state & NUD_PERMANENT, - true); + exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || + ndm->ndm_flags & NTF_EXT_LEARNED; + neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; -- cgit v1.2.3-59-g8ed1b From aaa5d90b395a72faff797b00d815165ee0e664c0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 14 Dec 2018 11:51:58 +0100 Subject: net: use indirect call wrappers at GRO network layer This avoids an indirect calls for L3 GRO receive path, both for ipv4 and ipv6, if the latter is not compiled as a module. Note that when IPv6 is compiled as builtin, it will be checked first, so we have a single additional compare for the more common path. v1 -> v2: - adapted to INDIRECT_CALL_ changes Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/inet_common.h | 2 ++ net/core/dev.c | 15 +++++++++++++-- net/ipv6/ip6_offload.c | 6 +++--- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 3ca969cbd161..56e7592811ea 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -2,6 +2,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H +#include + extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; diff --git a/net/core/dev.c b/net/core/dev.c index ed9aa4a91f1f..1b5a4410be0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -5338,6 +5339,8 @@ static void flush_all_backlogs(void) put_online_cpus(); } +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; @@ -5357,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb, 0); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); break; } rcu_read_unlock(); @@ -5504,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head) napi_gro_complete(oldest); } +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5553,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(gro_head, skb); + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + gro_head, skb); break; } rcu_read_unlock(); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 70f525c33cb6..ff8b484d2258 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -164,8 +164,8 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff *ipv6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; @@ -301,7 +301,7 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, return inet_gro_receive(head, skb); } -static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); -- cgit v1.2.3-59-g8ed1b From df9b0e30d44c901ac27c0f38cd54511b3f130c6d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 15 Dec 2018 14:09:06 -0800 Subject: neighbor: Add protocol attribute Similar to routes and rules, add protocol attribute to neighbor entries for easier tracking of how each was created. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 ++ include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 24 +++++++++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 30fd50adf234..66221f1991c0 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -149,6 +149,7 @@ struct neighbour { __u8 nud_state; __u8 type; __u8 dead; + u8 protocol; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; @@ -173,6 +174,7 @@ struct pneigh_entry { possible_net_t net; struct net_device *dev; u8 flags; + u8 protocol; u8 key[0]; }; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 998155444e0d..cd144e3099a3 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -28,6 +28,7 @@ enum { NDA_MASTER, NDA_LINK_NETNSID, NDA_SRC_VNI, + NDA_PROTOCOL, /* Originator of entry */ __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 42b413774370..fb4372cb1de1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1828,6 +1828,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; + u8 protocol = 0; int err; ASSERT_RTNL(); @@ -1867,6 +1868,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (tb[NDA_PROTOCOL]) { + if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) { + NL_SET_ERR_MSG(extack, "Invalid protocol attribute"); + goto out; + } + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + } + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -1874,6 +1883,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; + if (protocol) + pn->protocol = protocol; err = 0; } goto out; @@ -1924,6 +1935,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } else err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); + + if (protocol) + neigh->protocol = protocol; + neigh_release(neigh); out: @@ -2417,6 +2432,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2448,6 +2466,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -3103,7 +3124,8 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ } static void __neigh_notify(struct neighbour *n, int type, int flags, -- cgit v1.2.3-59-g8ed1b From 5b2f94b27622d5b92d1cebf4bb5a627db4444607 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 15 Dec 2018 22:35:08 -0800 Subject: net: rtnetlink: support for fdb get This patch adds support for fdb get similar to route get. arguments can be any of the following (similar to fdb add/del/dump): [bridge, mac, vlan] or [bridge_port, mac, vlan, flags=[NTF_MASTER]] or [dev, mac, [vni|vlan], flags=[NTF_SELF]] Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +- net/core/rtnetlink.c | 168 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 173 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 811632d4d8b1..1377d085ef99 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1387,7 +1387,12 @@ struct net_device_ops { struct net_device *dev, struct net_device *filter_dev, int *idx); - + int (*ndo_fdb_get)(struct sk_buff *skb, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 vid, u32 portid, u32 seq, + struct netlink_ext_ack *extack); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f8bdb8adab2c..baf2685b4da2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3460,6 +3460,18 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, new_nsid, new_ifindex); } +static const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, +}; + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -4021,6 +4033,160 @@ out: return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { @@ -5081,7 +5247,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); -- cgit v1.2.3-59-g8ed1b From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 16 Dec 2018 15:47:04 -0800 Subject: bpf: sockmap, metadata support for reporting size of msg This adds metadata to sk_msg_md for BPF programs to read the sk_msg size. When the SK_MSG program is running under an application that is using sendfile the data is not copied into sk_msg buffers by default. Rather the BPF program uses sk_msg_pull_data to read the bytes in. This avoids doing the costly memcopy instructions when they are not in fact needed. However, if we don't know the size of the sk_msg we have to guess if needed bytes are available by doing a pull request which may fail. By including the size of the sk_msg BPF programs can check the size before issuing sk_msg_pull_data requests. Additionally, the same applies for sendmsg calls when the application provides multiple iovs. Here the BPF program needs to pull in data to update data pointers but its not clear where the data ends without a size parameter. In many cases "guessing" is not easy to do and results in multiple calls to pull and without bounded loops everything gets fairly tricky. Clean this up by including a u32 size field. Note, all writes into sk_msg_md are rejected already from sk_msg_is_valid_access so nothing additional is needed there. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 3 +++ include/uapi/linux/bpf.h | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 10 insertions(+) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2a11e9d91dfa..eb8f6cb84c10 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,6 +36,9 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +/* UAPI in filter.c depends on struct sk_msg_sg being first element. If + * this is moved filter.c also must be updated. + */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d324c2cbca2..91c43884f295 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2665,6 +2665,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index f9348806e843..3a3b21726fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7530,6 +7530,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; -- cgit v1.2.3-59-g8ed1b From df5042f4c5b9326c593bf2e31ed859ebc3b4130a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:16 +0100 Subject: sk_buff: add skb extension infrastructure This adds an optional extension infrastructure, with ispec (xfrm) and bridge netfilter as first users. objdiff shows no changes if kernel is built without xfrm and br_netfilter support. The third (planned future) user is Multipath TCP which is still out-of-tree. MPTCP needs to map logical mptcp sequence numbers to the tcp sequence numbers used by individual subflows. This DSS mapping is read/written from tcp option space on receive and written to tcp option space on transmitted tcp packets that are part of and MPTCP connection. Extending skb_shared_info or adding a private data field to skb fclones doesn't work for incoming skb, so a different DSS propagation method would be required for the receive side. mptcp has same requirements as secpath/bridge netfilter: 1. extension memory is released when the sk_buff is free'd. 2. data is shared after cloning an skb (clone inherits extension) 3. adding extension to an skb will COW the extension buffer if needed. The "MPTCP upstreaming" effort adds SKB_EXT_MPTCP extension to store the mapping for tx and rx processing. Two new members are added to sk_buff: 1. 'active_extensions' byte (filling a hole), telling which extensions are available for this skb. This has two purposes. a) avoids the need to initialize the pointer. b) allows to "delete" an extension by clearing its bit value in ->active_extensions. While it would be possible to store the active_extensions byte in the extension struct instead of sk_buff, there is one problem with this: When an extension has to be disabled, we can always clear the bit in skb->active_extensions. But in case it would be stored in the extension buffer itself, we might have to COW it first, if we are dealing with a cloned skb. On kmalloc failure we would be unable to turn an extension off. 2. extension pointer, located at the end of the sk_buff. If the active_extensions byte is 0, the pointer is undefined, it is not initialized on skb allocation. This adds extra code to skb clone and free paths (to deal with refcount/free of extension area) but this replaces similar code that manages skb->nf_bridge and skb->sp structs in the followup patches of the series. It is possible to add support for extensions that are not preseved on clones/copies. To do this, it would be needed to define a bitmask of all extensions that need copy/cow semantics, and change __skb_ext_copy() to check ->active_extensions & SKB_EXT_PRESERVE_ON_CLONE, then just set ->active_extensions to 0 on the new clone. This isn't done here because all extensions that get added here need the copy/cow semantics. v2: Allocate entire extension space using kmem_cache. Upside is that this allows better tracking of used memory, downside is that we will allocate more space than strictly needed in most cases (its unlikely that all extensions are active/needed at same time for same skb). The allocated memory (except the small extension header) is not cleared, so no additonal overhead aside from memory usage. Avoid atomic_dec_and_test operation on skb_ext_put() by using similar trick as kfree_skbmem() does with fclone_ref: If recount is 1, there is no concurrent user and we can free right away. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 111 ++++++++++++++++++++++++++++++++++- net/Kconfig | 3 + net/core/skbuff.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 1 + net/ipv6/ip6_output.c | 1 + 5 files changed, 270 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b1831a5ca173..88f7541837e3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -245,6 +245,7 @@ struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; +struct skb_ext; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -636,6 +637,7 @@ typedef unsigned char *sk_buff_data_t; * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -665,6 +667,7 @@ typedef unsigned char *sk_buff_data_t; * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c + * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { @@ -747,7 +750,9 @@ struct sk_buff { head_frag:1, xmit_more:1, pfmemalloc:1; - +#ifdef CONFIG_SKB_EXTENSIONS + __u8 active_extensions; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -869,6 +874,11 @@ struct sk_buff { *data; unsigned int truesize; refcount_t users; + +#ifdef CONFIG_SKB_EXTENSIONS + /* only useable after checking ->active_extensions != 0 */ + struct skb_ext *extensions; +#endif }; #ifdef __KERNEL__ @@ -3896,6 +3906,105 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif + +#ifdef CONFIG_SKB_EXTENSIONS +enum skb_ext_id { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + SKB_EXT_BRIDGE_NF, +#endif + SKB_EXT_NUM, /* must be last */ +}; + +/** + * struct skb_ext - sk_buff extensions + * @refcnt: 1 on allocation, deallocated on 0 + * @offset: offset to add to @data to obtain extension address + * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units + * @data: start of extension data, variable sized + * + * Note: offsets/lengths are stored in chunks of 8 bytes, this allows + * to use 'u8' types while allowing up to 2kb worth of extension data. + */ +struct skb_ext { + refcount_t refcnt; + u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ + u8 chunks; /* same */ + char data[0] __aligned(8); +}; + +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_put(struct skb_ext *ext); + +static inline void skb_ext_put(struct sk_buff *skb) +{ + if (skb->active_extensions) + __skb_ext_put(skb->extensions); +} + +static inline void skb_ext_get(struct sk_buff *skb) +{ + if (skb->active_extensions) { + struct skb_ext *ext = skb->extensions; + + if (ext) + refcount_inc(&ext->refcnt); + } +} + +static inline void __skb_ext_copy(struct sk_buff *dst, + const struct sk_buff *src) +{ + dst->active_extensions = src->active_extensions; + + if (src->active_extensions) { + struct skb_ext *ext = src->extensions; + + refcount_inc(&ext->refcnt); + dst->extensions = ext; + } +} + +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) +{ + skb_ext_put(dst); + __skb_ext_copy(dst, src); +} + +static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) +{ + return !!ext->offset[i]; +} + +static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) +{ + return skb->active_extensions & (1 << id); +} + +static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) + __skb_ext_del(skb, id); +} + +static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) { + struct skb_ext *ext = skb->extensions; + + return (void *)ext + (ext->offset[id] << 3); + } + + return NULL; +} +#else +static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_get(struct sk_buff *skb) {} +static inline void skb_ext_del(struct sk_buff *skb, int unused) {} +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} +#endif /* CONFIG_SKB_EXTENSIONS */ + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { diff --git a/net/Kconfig b/net/Kconfig index f235edb593ba..93b291292860 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,6 +51,9 @@ config NET_INGRESS config NET_EGRESS bool +config SKB_EXTENSIONS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 40552547c69a..d2dfad33e686 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -617,6 +620,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -796,6 +800,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); + __skb_ext_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -3902,6 +3907,40 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3916,6 +3955,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -5554,3 +5594,118 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old); + if (!new) + return NULL; + + if (__skb_ext_exist(old, id)) { + if (old != new) + skb->extensions = new; + goto set_active; + } + + newoff = old->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; + skb->extensions = new; +set_active: + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ab6618036afe..c80188875f39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d55ee33b7f9..703a8e801c5c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -581,6 +581,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); skb_copy_secmark(to, from); } -- cgit v1.2.3-59-g8ed1b From de8bda1d22d38b7d5cd08b33f86efd94d4c86630 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:17 +0100 Subject: net: convert bridge_nf to use skb extension infrastructure This converts the bridge netfilter (calling iptables hooks from bridge) facility to use the extension infrastructure. The bridge_nf specific hooks in skb clone and free paths are removed, they have been replaced by the skb_ext hooks that do the same as the bridge nf allocations hooks did. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netfilter_bridge.h | 4 ++-- include/linux/skbuff.h | 28 ++-------------------------- include/net/netfilter/br_netfilter.h | 8 ++++---- net/Kconfig | 1 + net/bridge/br_netfilter_hooks.c | 20 ++------------------ net/bridge/br_netfilter_ipv6.c | 4 ++-- net/core/skbuff.c | 3 --- 7 files changed, 13 insertions(+), 55 deletions(-) (limited to 'net/core') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 0a65a422587c..5f2614d02e03 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -20,12 +20,12 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb) static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { - return skb->nf_bridge; + return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { - return skb->nf_bridge != NULL; + return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 88f7541837e3..2f42d2e99f17 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -255,7 +255,6 @@ struct nf_conntrack { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { - refcount_t use; enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, @@ -720,9 +719,6 @@ struct sk_buff { #endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; @@ -4005,18 +4001,6 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge && refcount_dec_and_test(&nf_bridge->use)) - kfree(nf_bridge); -} -static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge) - refcount_inc(&nf_bridge->use); -} -#endif /* CONFIG_BRIDGE_NETFILTER */ static inline void nf_reset(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -4024,8 +4008,7 @@ static inline void nf_reset(struct sk_buff *skb) skb->_nfct = 0; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); #endif } @@ -4043,7 +4026,7 @@ static inline void ipvs_reset(struct sk_buff *skb) #endif } -/* Note: This doesn't put any conntrack and bridge info in dst. */ +/* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { @@ -4051,10 +4034,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - dst->nf_bridge = src->nf_bridge; - nf_bridge_get(src->nf_bridge); -#endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; @@ -4065,9 +4044,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src, true); } diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 6efc0153987b..4cd56808ac4e 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -6,12 +6,12 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) { - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); + struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF); - if (likely(skb->nf_bridge)) - refcount_set(&(skb->nf_bridge->use), 1); + if (b) + memset(b, 0, sizeof(*b)); - return skb->nf_bridge; + return b; } void nf_bridge_update_protocol(struct sk_buff *skb); diff --git a/net/Kconfig b/net/Kconfig index 93b291292860..5cb9de1aaf88 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -187,6 +187,7 @@ config BRIDGE_NETFILTER depends on NETFILTER && INET depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE + select SKB_EXTENSIONS default m ---help--- Enabling this option will let arptables resp. iptables see bridged diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c58cf68b45c5..d21a23698410 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -132,10 +132,7 @@ static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); static void nf_bridge_info_free(struct sk_buff *skb) { - if (skb->nf_bridge) { - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; - } + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) @@ -148,19 +145,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (refcount_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - refcount_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; + return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) @@ -508,7 +493,6 @@ static unsigned int br_nf_pre_routing(void *priv, if (br_validate_ipv4(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) return NF_DROP; if (!setup_pre_routing(skb)) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 96c072e71ea2..94039f588f1d 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,8 +224,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, if (br_validate_ipv6(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) + nf_bridge = nf_bridge_alloc(skb); + if (!nf_bridge) return NF_DROP; if (!setup_pre_routing(skb)) return NF_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d2dfad33e686..0c65723591d7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -616,9 +616,6 @@ void skb_release_head_state(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); #endif skb_ext_put(skb); } -- cgit v1.2.3-59-g8ed1b From 4165079ba328dd47262a2183049d3591f0a750b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:27 +0100 Subject: net: switch secpath to use skb extension infrastructure Remove skb->sp and allocate secpath storage via extension infrastructure. This also reduces sk_buff by 8 bytes on x86_64. Total size of allyesconfig kernel is reduced slightly, as there is less inlined code (one conditional atomic op instead of two on skb_clone). No differences in throughput in following ipsec performance tests: - transport mode with aes on 10GB link - tunnel mode between two network namespaces with aes and null cipher Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/xfrm_device.txt | 7 ++-- include/linux/skbuff.h | 10 +++--- include/net/xfrm.h | 22 +------------ net/core/skbuff.c | 47 +++++++++++++++++++++++---- net/xfrm/xfrm_input.c | 56 +++++--------------------------- 5 files changed, 59 insertions(+), 83 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 267f55b5f54a..a1c904dc70dc 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -111,9 +111,10 @@ the stack in xfrm_input(). xfrm_state_hold(xs); store the state information into the skb - skb->sp = secpath_dup(skb->sp); - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp = secpath_set(skb); + if (!sp) return; + sp->xvec[sp->len++] = xs; + sp->olen++; indicate the success and/or error status of the offload xo = xfrm_offload(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0f254a016bf..3f741b04e55d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -714,9 +714,6 @@ struct sk_buff { struct list_head tcp_tsorted_anchor; }; -#ifdef CONFIG_XFRM - struct sec_path *sp; -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif @@ -3907,6 +3904,9 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, +#endif +#ifdef CONFIG_XFRM + SKB_EXT_SEC_PATH, #endif SKB_EXT_NUM, /* must be last */ }; @@ -4069,7 +4069,7 @@ static inline void skb_init_secmark(struct sk_buff *skb) static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp != NULL; + return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif @@ -4127,7 +4127,7 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp; + return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 31220edcce95..38c232861a64 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1096,7 +1096,6 @@ struct xfrm_offload { }; struct sec_path { - refcount_t refcnt; int len; int olen; @@ -1104,32 +1103,13 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; -static inline struct sec_path * -secpath_get(struct sec_path *sp) -{ - if (sp) - refcount_inc(&sp->refcnt); - return sp; -} - -void __secpath_destroy(struct sec_path *sp); - -static inline void -secpath_put(struct sec_path *sp) -{ - if (sp && refcount_dec_and_test(&sp->refcnt)) - __secpath_destroy(sp); -} - -struct sec_path *secpath_dup(struct sec_path *src); struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM - secpath_put(skb->sp); - skb->sp = NULL; + skb_ext_del(skb, SKB_EXT_SEC_PATH); #endif } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c65723591d7..cb0bf4215745 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -609,7 +609,6 @@ fastpath: void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -798,9 +797,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -3912,6 +3908,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -3919,6 +3918,9 @@ static __always_inline unsigned int skb_ext_total_length(void) return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + #endif 0; } @@ -5610,7 +5612,8 @@ static struct skb_ext *skb_ext_alloc(void) return new; } -static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) { struct skb_ext *new; @@ -5624,6 +5627,15 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif __skb_ext_put(old); return new; } @@ -5650,7 +5662,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions) { old = skb->extensions; - new = skb_ext_maybe_cow(old); + new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; @@ -5679,6 +5691,16 @@ set_active: } EXPORT_SYMBOL(skb_ext_add); +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -5687,6 +5709,14 @@ void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif } } EXPORT_SYMBOL(__skb_ext_del); @@ -5702,6 +5732,11 @@ void __skb_ext_put(struct skb_ext *ext) if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b4db25b244fa..6bc817359b58 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -38,8 +38,6 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __ro_after_init; - static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; @@ -111,54 +109,21 @@ static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, return ret; } -void __secpath_destroy(struct sec_path *sp) -{ - int i; - for (i = 0; i < sp->len; i++) - xfrm_state_put(sp->xvec[i]); - kmem_cache_free(secpath_cachep, sp); -} -EXPORT_SYMBOL(__secpath_destroy); - -struct sec_path *secpath_dup(struct sec_path *src) +struct sec_path *secpath_set(struct sk_buff *skb) { - struct sec_path *sp; + struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); - sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC); + sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; - sp->len = 0; - sp->olen = 0; + if (tmp) /* reused existing one (was COW'd if needed) */ + return sp; + /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); - - if (src) { - int i; - - memcpy(sp, src, sizeof(*sp)); - for (i = 0; i < sp->len; i++) - xfrm_state_hold(sp->xvec[i]); - } - refcount_set(&sp->refcnt, 1); - return sp; -} -EXPORT_SYMBOL(secpath_dup); - -struct sec_path *secpath_set(struct sk_buff *skb) -{ - struct sec_path *sp = skb->sp; - - /* Allocate new secpath or COW existing one. */ - if (!sp || refcount_read(&sp->refcnt) != 1) { - sp = secpath_dup(skb->sp); - if (!sp) - return NULL; - - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } + sp->olen = 0; + sp->len = 0; return sp; } @@ -552,11 +517,6 @@ void __init xfrm_input_init(void) if (err) gro_cells.cells = NULL; - secpath_cachep = kmem_cache_create("secpath_cache", - sizeof(struct sec_path), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); - for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; -- cgit v1.2.3-59-g8ed1b From 82cbb5c631a07b3aa6df6eab644d55da9de5a645 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 19 Dec 2018 12:51:38 -0800 Subject: neighbour: register rtnl doit handler this patch registers neigh doit handler. The doit handler returns a neigh entry given dst and dev. This is similar to route and fdb doit (get) handlers. Also moves nda_policy declaration from rtnetlink.c to neighbour.c Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + net/core/neighbour.c | 204 +++++++++++++++++++++++++++++++++++++++++++++--- net/core/rtnetlink.c | 12 --- 3 files changed, 194 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 66221f1991c0..7c1ab9edba03 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -255,6 +255,7 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 +extern const struct nla_policy nda_policy[]; static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fb4372cb1de1..43687c9abe1d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1751,6 +1751,18 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, +}; + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2711,6 +2723,186 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, + void **dst, int *dev_idx, u8 *ndm_flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *dev_idx = ndm->ndm_ifindex; + *tbl = neigh_find_table(ndm->ndm_family); + if (*tbl == NULL) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + return -EAFNOSUPPORT; + } + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_DST: + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + return -EINVAL; + } + *dst = nla_data(tb[i]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return -EINVAL; + } + } + + return 0; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get_reply(struct net *net, struct neighbour *neigh, + u32 pid, u32 seq) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN); /* NDA_DST */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, + u32 pid, u32 seq, struct neigh_table *tbl) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct net_device *dev = NULL; + struct neigh_table *tbl = NULL; + struct neighbour *neigh; + void *dst = NULL; + u8 ndm_flags = 0; + int dev_idx = 0; + int err; + + err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, + extack); + if (err < 0) + return err; + + if (dev_idx) { + dev = __dev_get_by_index(net, dev_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (!dst) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + return -EINVAL; + } + + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev, 0); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + return -ENOENT; + } + return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, tbl); + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -EINVAL; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + return -ENOENT; + } + + err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq); + + neigh_release(neigh); + + return err; +} + void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; @@ -3118,16 +3310,6 @@ static const struct seq_operations neigh_stat_seq_ops = { }; #endif /* CONFIG_PROC_FS */ -static inline size_t neigh_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ - + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4) /* NDA_PROBES */ - + nla_total_size(1); /* NDA_PROTOCOL */ -} - static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { @@ -3511,7 +3693,7 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index baf2685b4da2..48f61885fd6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3460,18 +3460,6 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, new_nsid, new_ifindex); } -static const struct nla_policy nda_policy[NDA_MAX+1] = { - [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, - [NDA_PROBES] = { .type = NLA_U32 }, - [NDA_VLAN] = { .type = NLA_U16 }, - [NDA_PORT] = { .type = NLA_U16 }, - [NDA_VNI] = { .type = NLA_U32 }, - [NDA_IFINDEX] = { .type = NLA_U32 }, - [NDA_MASTER] = { .type = NLA_U32 }, -}; - static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, -- cgit v1.2.3-59-g8ed1b From 754d5da63145852736f34cfc762164f5d8d6537b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 15:53:22 -0800 Subject: neighbor: Initialize protocol when new pneigh_entry are created pneigh_lookup uses kmalloc versus kzalloc when new entries are allocated. Given that the newly added protocol field needs to be initialized. Fixes: df9b0e30d44c ("neighbor: Add protocol attribute") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 43687c9abe1d..d9fa101b0e41 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -725,6 +725,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; + n->protocol = 0; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; -- cgit v1.2.3-59-g8ed1b From a9cd3439e3c6d777a05c63b4d06c3500d1d4074e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 20:02:36 -0800 Subject: neighbor: Use nda_policy for validating attributes in adds and dump requests Add NDA_PROTOCOL to nda_policy and use the policy for attribute parsing and validation for adding neighbors and in dump requests. Remove the now duplicate checks on nla_len. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d9fa101b0e41..8baa9ab01db6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1762,6 +1762,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1845,7 +1846,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; @@ -1881,13 +1882,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; - if (tb[NDA_PROTOCOL]) { - if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) { - NL_SET_ERR_MSG(extack, "Invalid protocol attribute"); - goto out; - } + if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); - } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -2639,10 +2635,10 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, } err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } else { err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } if (err < 0) return err; @@ -2654,17 +2650,9 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); - return -EINVAL; - } filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); - return -EINVAL; - } filter->master_idx = nla_get_u32(tb[i]); break; default: -- cgit v1.2.3-59-g8ed1b From bc1b4f013b5029b4c8b63fb9ba8d084119486d7b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:30 -0800 Subject: bpf: sk_msg, improve offset chk in _is_valid_access The check for max offset in sk_msg_is_valid_access uses sizeof() which is incorrect because it would allow accessing possibly past the end of the struct in the padded case. Further, it doesn't preclude accessing any padding that may be added in the middle of a struct. All told this makes it fragile to rely on. To fix this explicitly check offsets with fields using the bpf_ctx_range() and bpf_ctx_range_till() macros. For reference the current structure layout looks as follows (reported by pahole) struct sk_msg_md { union { void * data; /* 8 */ }; /* 0 8 */ union { void * data_end; /* 8 */ }; /* 8 8 */ __u32 family; /* 16 4 */ __u32 remote_ip4; /* 20 4 */ __u32 local_ip4; /* 24 4 */ __u32 remote_ip6[4]; /* 28 16 */ __u32 local_ip6[4]; /* 44 16 */ __u32 remote_port; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 local_port; /* 64 4 */ __u32 size; /* 68 4 */ /* size: 72, cachelines: 2, members: 10 */ /* last cacheline: 8 bytes */ }; So there should be no padding at the moment but fixing this now prevents future errors. Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 3a3b21726fb5..6bd9f08f6162 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) + break; + default: return false; - + } return true; } -- cgit v1.2.3-59-g8ed1b From 7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 4 +--- net/core/filter.c | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index eb8f6cb84c10..dd57e6f408b1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,9 +36,7 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; -/* UAPI in filter.c depends on struct sk_msg_sg being first element. If - * this is moved filter.c also must be updated. - */ +/* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/net/core/filter.c b/net/core/filter.c index 6bd9f08f6162..447dd1bad31f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7425,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), -- cgit v1.2.3-59-g8ed1b From 51199405f967207de372d9b60989eb87d7ae8809 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:32 -0800 Subject: bpf: skb_verdict, support SK_PASS on RX BPF path Add SK_PASS verdict support to SK_SKB_VERDICT programs. Now that support for redirects exists we can implement SK_PASS as a redirect to the same socket. This simplifies the BPF programs and avoids an extra map lookup on RX path for simple visibility cases. Further, reduces user (BPF programmer in this context) confusion when their program drops skb due to lack of support. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa0..8a91a460de8f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -669,6 +669,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, bool ingress; switch (verdict) { + case __SK_PASS: + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + goto out_free; + } + if (atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf) { + struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); + + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; + } + goto out_free; case __SK_REDIRECT: sk_other = tcp_skb_bpf_redirect_fetch(skb); if (unlikely(!sk_other)) -- cgit v1.2.3-59-g8ed1b From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:33 -0800 Subject: bpf: sk_msg, fix socket data_ready events When a skb verdict program is in-use and either another BPF program redirects to that socket or the new SK_PASS support is used the data_ready callback does not wake up application. Instead because the stream parser/verdict is using the sk data_ready callback we wake up the stream parser/verdict block. Fix this by adding a helper to check if the stream parser block is enabled on the sk and if so call the saved pointer which is the upper layers wake up function. This fixes application stalls observed when an application is waiting for data in a blocking read(). Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 ++++++++ net/core/skmsg.c | 6 +++--- net/ipv4/tcp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dd57e6f408b1..178a3933a71b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -417,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) sk_psock_drop(sk, psock); } +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) +{ + if (psock->parser.enabled) + psock->parser.saved_data_ready(sk); + else + sk->sk_data_ready(sk); +} + static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8a91a460de8f..3df7627db4bb 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -751,7 +751,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. */ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -799,7 +799,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a47c1cdf90fc..87503343743d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -198,7 +198,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, msg->sg.start = i; msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); -- cgit v1.2.3-59-g8ed1b From a136678c0bdbb650daff5df5eec1dab960e074a7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:34 -0800 Subject: bpf: sk_msg, zap ingress queue on psock down In addition to releasing any cork'ed data on a psock when the psock is removed we should also release any skb's in the ingress work queue. Otherwise the skb's eventually get free'd but late in the tear down process so we see the WARNING due to non-zero sk_forward_alloc. void sk_stream_kill_queues(struct sock *sk) { ... WARN_ON(sk->sk_forward_alloc); ... } Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3df7627db4bb..86c9726fced8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); -- cgit v1.2.3-59-g8ed1b From 463561e6b9facf93ef90b65d2c75a80c7262d778 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 20 Dec 2018 16:50:50 +0000 Subject: neighbour: remove stray semicolon Currently the stray semicolon means that the final term in the addition is being missed. Fix this by removing it. Cleans up clang warning: net/core/neighbour.c:2821:9: warning: expression result unused [-Wunused-value] Fixes: 82cbb5c631a0 ("neighbour: register rtnl doit handler") Signed-off-by: Colin Ian King Acked-By: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fa384f775f1a..763a7b08df67 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2811,7 +2811,7 @@ errout: static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN); /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(1); /* NDA_PROTOCOL */ } -- cgit v1.2.3-59-g8ed1b From e94e50bd88f7ed2f2d40c32c06efd61c36c33ec8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Dec 2018 19:03:13 +0100 Subject: net: fix possible user-after-free in skb_ext_add() On cow we can free the old extension: we must avoid dereferencing such extension after skb_ext_maybe_cow(). Since 'new' contents are always equal to 'old' after the copy, we can fix the above accessing the relevant data using 'new'. Fixes: df5042f4c5b9 ("sk_buff: add skb extension infrastructure") Signed-off-by: Paolo Abeni Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cb0bf4215745..e1d88762f659 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5666,13 +5666,13 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (!new) return NULL; - if (__skb_ext_exist(old, id)) { + if (__skb_ext_exist(new, id)) { if (old != new) skb->extensions = new; goto set_active; } - newoff = old->chunks; + newoff = new->chunks; } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); -- cgit v1.2.3-59-g8ed1b From 682ec859518d73435cc924d816da2953343241c1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Dec 2018 19:03:15 +0100 Subject: net: minor cleanup in skb_ext_add() When the extension to be added is already present, the only skb field we may need to update is 'extensions': we can reorder the code and avoid a branch. v1 -> v2: - be sure to flag the newly added extension as active Signed-off-by: Paolo Abeni Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e1d88762f659..37317ffec146 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5666,11 +5666,8 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (!new) return NULL; - if (__skb_ext_exist(new, id)) { - if (old != new) - skb->extensions = new; + if (__skb_ext_exist(new, id)) goto set_active; - } newoff = new->chunks; } else { @@ -5684,8 +5681,8 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) newlen = newoff + skb_ext_type_len[id]; new->chunks = newlen; new->offset[id] = newoff; - skb->extensions = new; set_active: + skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); } -- cgit v1.2.3-59-g8ed1b From 50d5258634aee2e62832aa086d2fb0de00e72b91 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 21 Dec 2018 14:49:01 -0600 Subject: net: core: Fix Spectre v1 vulnerability flen is indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: net/core/filter.c:1101 bpf_check_classic() warn: potential spectre issue 'filter' [w] Fix this by sanitizing flen before using it to index filter at line 1101: switch (filter[flen - 1].code) { and through pc at line 1040: const struct sock_filter *ftest = &filter[pc]; Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 8d2c629501e2..0c74c2f9776a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1038,6 +1039,7 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; + flen = array_index_nospec(flen, BPF_MAXINSNS + 1); /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; -- cgit v1.2.3-59-g8ed1b From f2ab95814103314af3239d322e382c61c69a788d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 23 Dec 2018 16:01:35 -0800 Subject: net: Revert recent Spectre-v1 patches. This reverts: 50d5258634ae ("net: core: Fix Spectre v1 vulnerability") d686026b1e6e ("phonet: af_phonet: Fix Spectre v1 vulnerability") a95386f0390a ("nfc: af_nfc: Fix Spectre v1 vulnerability") a3ac5817ffe8 ("can: af_can: Fix Spectre v1 vulnerability") After some discussion with Alexei Starovoitov these all seem to be completely unnecessary. Signed-off-by: David S. Miller --- net/can/af_can.c | 2 -- net/core/filter.c | 2 -- net/nfc/af_nfc.c | 2 -- net/phonet/af_phonet.c | 3 --- 4 files changed, 9 deletions(-) (limited to 'net/core') diff --git a/net/can/af_can.c b/net/can/af_can.c index cade7250c6d4..1684ba5b51eb 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include @@ -137,7 +136,6 @@ static int can_create(struct net *net, struct socket *sock, int protocol, if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; - protocol = array_index_nospec(protocol, CAN_NPROTO); cp = can_get_proto(protocol); diff --git a/net/core/filter.c b/net/core/filter.c index 0c74c2f9776a..8d2c629501e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,7 +73,6 @@ #include #include #include -#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1039,7 +1038,6 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; - flen = array_index_nospec(flen, BPF_MAXINSNS + 1); /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 256f3c57059e..d3e594eb36d0 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -21,7 +21,6 @@ #include #include -#include #include "nfc.h" @@ -38,7 +37,6 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, if (proto < 0 || proto >= NFC_SOCKPROTO_MAX) return -EINVAL; - proto = array_index_nospec(proto, NFC_SOCKPROTO_MAX); read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index d4b2abd78858..3b0ef691f5b1 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -34,8 +34,6 @@ #include #include -#include - /* Transport protocol registration */ static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; @@ -45,7 +43,6 @@ static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) if (protocol >= PHONET_NPROTO) return NULL; - protocol = array_index_nospec(protocol, PHONET_NPROTO); rcu_read_lock(); pp = rcu_dereference(proto_tab[protocol]); -- cgit v1.2.3-59-g8ed1b From 0eb987c874dc93f9c9d85a6465dbde20fdd3884c Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Sun, 23 Dec 2018 19:42:38 -0600 Subject: net/net_namespace: Check the return value of register_pernet_subsys() In net_ns_init(), register_pernet_subsys() could fail while registering network namespace subsystems. The fix checks the return value and sends a panic() on failure. Signed-off-by: Aditya Pakki Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..af8849a7a9c3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -917,7 +917,8 @@ static int __init net_ns_init(void) init_net_initialized = true; up_write(&pernet_ops_rwsem); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); -- cgit v1.2.3-59-g8ed1b