diff options
Diffstat (limited to 'net')
337 files changed, 11307 insertions, 3694 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 54728d2eda18..d4bcfd8f95bf 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -172,7 +172,6 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) if (err < 0) goto out_uninit_mvrp; - vlan->nest_level = dev_get_nest_level(real_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 93eadf179123..e5bff5cc6f97 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,36 +489,6 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } -/* - * vlan network devices have devices nesting below it, and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key vlan_netdev_xmit_lock_key; -static struct lock_class_key vlan_netdev_addr_lock_key; - -static void vlan_dev_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_subclass) -{ - lockdep_set_class_and_subclass(&txq->_xmit_lock, - &vlan_netdev_xmit_lock_key, - *(int *)_subclass); -} - -static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) -{ - lockdep_set_class_and_subclass(&dev->addr_list_lock, - &vlan_netdev_addr_lock_key, - subclass); - netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); -} - -static int vlan_dev_get_lock_subclass(struct net_device *dev) -{ - return vlan_dev_priv(dev)->nest_level; -} - static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -609,8 +579,6 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); - vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; @@ -812,7 +780,6 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, - .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, .ndo_get_iflink = vlan_dev_get_iflink, }; diff --git a/net/atm/common.c b/net/atm/common.c index b7528e77997c..0ce530af534d 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -668,7 +668,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* writable? */ diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 6c11cdf4dd4c..fbd0c5e7b299 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) dev_kfree_skb(skb); goto as_indicate_complete; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk)); sk->sk_state_change(sk); diff --git a/net/atm/svc.c b/net/atm/svc.c index 908cbb8654f5..ba144d035e3d 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -381,7 +381,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index bb222b882b67..324306d6fde0 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1384,7 +1384,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; out: diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index dcdbaeeb2358..cd6afe895db9 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -356,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, make->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); bh_unlock_sock(sk); } else { if (!mine) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index d78938e3e008..5b0b20e6da95 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -22,6 +22,8 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> @@ -193,14 +195,18 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) unsigned char *ogm_buff; u32 random_seqno; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); - if (!ogm_buff) + if (!ogm_buff) { + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; + } hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -212,35 +218,59 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; + + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ @@ -742,7 +772,11 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) } } -static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +/** + * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer + * @hard_iface: interface whose ogm buffer should be transmitted + */ +static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; @@ -753,9 +787,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; - if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || - hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) - return; + lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in @@ -823,6 +855,17 @@ out: batadv_hardif_put(primary_if); } +static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +{ + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) + return; + + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + batadv_iv_ogm_schedule_buff(hard_iface); + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); +} + /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface * @orig_node: originator which reproadcasted the OGMs directly diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 64054edc2e3c..4ff6cf1ecae7 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1085,7 +1085,6 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); - spin_lock_init(&hard_iface->bat_v.aggr_list_lock); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index dc4f7430cb5a..714ce56cfcc8 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -18,6 +18,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rculist.h> @@ -151,7 +152,7 @@ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ @@ -162,7 +163,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } @@ -173,17 +174,13 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * * Empties the OGMv2 aggregation queue and frees all the skbs it contained. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { - struct sk_buff *skb; - - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); - - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) - kfree_skb(skb); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } @@ -196,7 +193,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) * * The aggregation queue is empty after this call. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { @@ -205,7 +202,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) unsigned int ogm_len; struct sk_buff *skb; - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; @@ -219,7 +216,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) { + while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); @@ -246,24 +243,22 @@ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, return; } - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); - skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** - * batadv_v_ogm_send() - periodic worker broadcasting the own OGM - * @work: work queue item + * batadv_v_ogm_send_softif() - periodic worker broadcasting the own OGM + * @bat_priv: the bat priv with all the soft interface information */ -static void batadv_v_ogm_send(struct work_struct *work) +static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; - struct batadv_priv_bat_v *bat_v; - struct batadv_priv *bat_priv; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; @@ -271,8 +266,7 @@ static void batadv_v_ogm_send(struct work_struct *work) u16 tvlv_len = 0; int ret; - bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); - bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; @@ -364,6 +358,23 @@ out: } /** + * batadv_v_ogm_send() - periodic worker broadcasting the own OGM + * @work: work queue item + */ +static void batadv_v_ogm_send(struct work_struct *work) +{ + struct batadv_priv_bat_v *bat_v; + struct batadv_priv *bat_priv; + + bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); + bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); + batadv_v_ogm_send_softif(bat_priv); + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); +} + +/** * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * @@ -377,9 +388,9 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } @@ -410,9 +421,9 @@ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** @@ -424,11 +435,15 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); struct batadv_ogm2_packet *ogm_packet; + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) - return; + goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); + +unlock: + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** @@ -1050,6 +1065,8 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv) atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); + mutex_init(&bat_priv->bat_v.ogm_buff_mutex); + return 0; } @@ -1061,7 +1078,11 @@ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); + mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); + kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; + + mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c90e47342bb0..afb52282d5bd 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -18,6 +18,7 @@ #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -929,6 +930,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); + mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); spin_lock_init(&hard_iface->neigh_list_lock); kref_init(&hard_iface->refcount); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 6967f2e4c3f4..c7b340ddd0e7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.4" +#define BATADV_SOURCE_VERSION "2019.5" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d5bdf3a4b65..f9ec8e7507b6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1421,7 +1421,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (*orig) return BATADV_FORW_SINGLE; - /* fall through */ + fallthrough; case 0: return BATADV_FORW_NONE; default: diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 9cbed6f5a85a..832e156c519e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> @@ -230,7 +229,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, break; } - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -455,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface, if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -740,36 +739,6 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } -/* batman-adv network devices have devices nesting below it and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key batadv_netdev_xmit_lock_key; -static struct lock_class_key batadv_netdev_addr_lock_key; - -/** - * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue - * @dev: device which owns the tx queue - * @txq: tx queue to modify - * @_unused: always NULL - */ -static void batadv_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); -} - -/** - * batadv_set_lockdep_class() - Set txq and addr_list lockdep class - * @dev: network device to modify - */ -static void batadv_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); -} - /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -783,8 +752,6 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; - batadv_set_lockdep_class(dev); - bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index be7c02aa91e2..47718a82eaf2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -17,6 +17,7 @@ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ @@ -81,6 +82,9 @@ struct batadv_hard_iface_bat_iv { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ + struct mutex ogm_buff_mutex; }; /** @@ -126,9 +130,6 @@ struct batadv_hard_iface_bat_v { /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */ unsigned int aggr_len; - /** @aggr_list_lock: protects aggr_list */ - spinlock_t aggr_list_lock; - /** * @throughput_override: throughput override to disable link * auto-detection @@ -1539,6 +1540,9 @@ struct batadv_priv_bat_v { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ + struct mutex ogm_buff_mutex; + /** @ogm_wq: workqueue used to schedule OGM transmissions */ struct delayed_work ogm_wq; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index bb55d92691b0..4febc82a7c76 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,15 +571,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } -static int bt_dev_init(struct net_device *dev) -{ - netdev_lockdep_set_classes(dev); - - return 0; -} - static const struct net_device_ops netdev_ops = { - .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 94ddf19998c7..3fd124927d4d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -173,7 +173,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else release_sock(sk); - parent->sk_ack_backlog++; + sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -185,7 +185,7 @@ void bt_accept_unlink(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); - bt_sk(sk)->parent->sk_ack_backlog--; + sk_acceptq_removed(bt_sk(sk)->parent); bt_sk(sk)->parent = NULL; sock_put(sk); } @@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == BT_CLOSED) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ad5b0ac1f9ce..7ff92dd4c53c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -934,6 +934,14 @@ static void hci_req_directed_advertising(struct hci_request *req, return; memset(&cp, 0, sizeof(cp)); + + /* Some controllers might reject command if intervals are not + * within range for undirected advertising. + * BCM20702A0 is known to be affected by this. + */ + cp.min_interval = cpu_to_le16(0x0020); + cp.max_interval = cpu_to_le16(0x0020); + cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 04bc79359a17..0cc9ce917222 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -842,8 +842,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { struct hci_cp_le_write_def_data_len cp; - cp.tx_len = hdev->le_max_tx_len; - cp.tx_time = hdev->le_max_tx_time; + cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); + cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); } @@ -4440,7 +4440,14 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + /* If the device has been opened in HCI_USER_CHANNEL, + * the userspace has exclusive access to device. + * When device is HCI_INIT, we still need to process + * the data packets to the driver in order + * to complete its setup(). + */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 7f6a581b5b7e..2a1b64dbf76e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -904,9 +904,9 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -923,9 +923,9 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) u8 instance = hdev->cur_adv_instance; struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -1273,6 +1273,14 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) instance_flags = get_adv_instance_flags(hdev, instance); + /* If instance already has the flags set skip adding it once + * again. + */ + if (adv_instance && eir_get_data(adv_instance->adv_data, + adv_instance->adv_data_len, EIR_FLAGS, + NULL)) + goto skip_flags; + /* The Add Advertising command allows userspace to set both the general * and limited discoverable flags. */ @@ -1305,6 +1313,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) } } +skip_flags: if (adv_instance) { memcpy(ptr, adv_instance->adv_data, adv_instance->adv_data_len); @@ -1690,7 +1699,7 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) * scheduling it. */ if (adv_instance && adv_instance->duration) { - u16 duration = adv_instance->duration * MSEC_PER_SEC; + u16 duration = adv_instance->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ adv_set->duration = cpu_to_le16(duration / 10); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 26e8cfad22b8..6b42be4b5861 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -502,15 +502,12 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; u8 hash[3]; int err; if (!chan || !chan->data) return false; - smp = chan->data; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); @@ -523,14 +520,11 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; - smp = chan->data; - get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1153bbcdff72..0be4497cb832 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -218,10 +218,18 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + FIELD_SIZEOF(struct __sk_buff, cb), + offsetof(struct __sk_buff, tstamp))) + return -EINVAL; + + /* tstamp is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + + FIELD_SIZEOF(struct __sk_buff, tstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->priority = __skb->priority; + skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); return 0; @@ -235,6 +243,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) return; __skb->priority = skb->priority; + __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 681b72862c16..434effde02c3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -24,8 +24,6 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); -static struct lock_class_key bridge_netdev_addr_lock_key; - /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -108,11 +106,6 @@ out: return NETDEV_TX_OK; } -static void br_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); -} - static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -150,7 +143,6 @@ static int br_dev_init(struct net_device *dev) br_mdb_hash_fini(br); br_fdb_hash_fini(br); } - br_set_lockdep_class(dev); return err; } @@ -271,6 +263,37 @@ static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) strlcpy(info->bus_info, "N/A", sizeof(info->bus_info)); } +static int br_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p; + + cmd->base.duplex = DUPLEX_UNKNOWN; + cmd->base.port = PORT_OTHER; + cmd->base.speed = SPEED_UNKNOWN; + + list_for_each_entry(p, &br->port_list, list) { + struct ethtool_link_ksettings ecmd; + struct net_device *pdev = p->dev; + + if (!netif_running(pdev) || !netif_oper_up(pdev)) + continue; + + if (__ethtool_get_link_ksettings(pdev, &ecmd)) + continue; + + if (ecmd.base.speed == (__u32)SPEED_UNKNOWN) + continue; + + if (cmd->base.speed == (__u32)SPEED_UNKNOWN || + cmd->base.speed < ecmd.base.speed) + cmd->base.speed = ecmd.base.speed; + } + + return 0; +} + static netdev_features_t br_fix_features(struct net_device *dev, netdev_features_t features) { @@ -373,8 +396,9 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) } static const struct ethtool_ops br_ethtool_ops = { - .get_drvinfo = br_getinfo, - .get_link = ethtool_op_get_link, + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_link_ksettings = br_get_link_ksettings, }; static const struct net_device_ops br_netdev_ops = { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b1d3248c0252..4877a0db16c6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -75,8 +75,9 @@ static inline unsigned long hold_time(const struct net_bridge *br) static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - return !fdb->is_static && !fdb->added_by_external_learn && - time_before_eq(fdb->updated + hold_time(br), jiffies); + return !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && + time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) @@ -197,7 +198,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, { trace_fdb_delete(br, f); - if (f->is_static) + if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); @@ -224,7 +225,7 @@ static void fdb_delete_local(struct net_bridge *br, if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } @@ -235,7 +236,7 @@ static void fdb_delete_local(struct net_bridge *br, if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -250,7 +251,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); - if (f && f->is_local && !f->added_by_user && f->dst == p) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } @@ -265,7 +267,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { - if (f->dst == p && f->is_local && !f->added_by_user) { + if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); @@ -306,7 +309,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -321,7 +325,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -346,7 +351,8 @@ void br_fdb_cleanup(struct work_struct *work) hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer; - if (f->is_static || f->added_by_external_learn) + if (test_bit(BR_FDB_STATIC, &f->flags) || + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) continue; this_timer = f->updated + delay; if (time_after(this_timer, now)) { @@ -373,7 +379,7 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); @@ -397,10 +403,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, continue; if (!do_all) - if (f->is_static || (vid && f->key.vlan_id != vid)) + if (test_bit(BR_FDB_STATIC, &f->flags) || + (vid && f->key.vlan_id != vid)) continue; - if (f->is_local) + if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); @@ -469,8 +476,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) + fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); + if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; @@ -484,8 +491,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, - unsigned char is_local, - unsigned char is_static) + unsigned long flags) { struct net_bridge_fdb_entry *fdb; @@ -494,12 +500,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; fdb->key.vlan_id = vid; - fdb->is_local = is_local; - fdb->is_static = is_static; - fdb->added_by_user = 0; - fdb->added_by_external_learn = 0; - fdb->offloaded = 0; - fdb->is_sticky = 0; + fdb->flags = flags; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, @@ -526,14 +527,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, /* it is okay to have multiple ports with same * address, just use the first one. */ - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } - fdb = fdb_create(br, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, + BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; @@ -555,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user) + const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -564,15 +566,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, if (hold_time(br) == 0) return; - /* ignore packets unless we are using this port */ - if (!(source->state == BR_STATE_LEARNING || - source->state == BR_STATE_FORWARDING)) - return; - fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ - if (unlikely(fdb->is_local)) { + if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); @@ -580,30 +577,30 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && !fdb->is_sticky)) { + if (unlikely(source != fdb->dst && + !test_bit(BR_FDB_STICKY, &fdb->flags))) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ - if (unlikely(fdb->added_by_external_learn)) - fdb->added_by_external_learn = 0; + if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags))) + clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags); } if (now != fdb->updated) fdb->updated = now; - if (unlikely(added_by_user)) - fdb->added_by_user = 1; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { - trace_br_fdb_update(br, source, addr, vid, added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, - added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts @@ -616,9 +613,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; - else if (fdb->is_static) + else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; @@ -648,11 +645,11 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); - if (fdb->offloaded) + if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; - if (fdb->added_by_external_learn) + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (fdb->is_sticky) + if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) @@ -799,7 +796,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, u8 ndm_flags) { - u8 is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -823,7 +820,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; @@ -840,34 +837,28 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { - fdb->is_local = 1; - if (!fdb->is_static) { - fdb->is_static = 1; + set_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else if (state & NUD_NOARP) { - fdb->is_local = 0; - if (!fdb->is_static) { - fdb->is_static = 1; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else { - fdb->is_local = 0; - if (fdb->is_static) { - fdb->is_static = 0; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); - } } modified = true; } - if (is_sticky != fdb->is_sticky) { - fdb->is_sticky = is_sticky; + if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { + change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { @@ -890,9 +881,12 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, br->dev->name); return -EINVAL; } + if (!nbp_state_should_learn(p)) + return 0; + local_bh_disable(); rcu_read_lock(); - br_fdb_update(br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { @@ -1064,7 +1058,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) @@ -1078,7 +1072,7 @@ done: rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!tmp->is_static) + if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; @@ -1097,7 +1091,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); @@ -1119,14 +1113,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(br, p, addr, vid, 0, 0); + unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); + + if (swdev_notify) + flags |= BIT(BR_FDB_ADDED_BY_USER); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } - if (swdev_notify) - fdb->added_by_user = 1; - fdb->added_by_external_learn = 1; fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; @@ -1136,17 +1131,17 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (fdb->added_by_external_learn) { + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!fdb->added_by_user) { + } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ - fdb->added_by_external_learn = 1; + set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (swdev_notify) - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); @@ -1168,7 +1163,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb && fdb->added_by_external_learn) + if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; @@ -1186,8 +1181,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb) - fdb->offloaded = offloaded; + if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) + change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } @@ -1206,7 +1201,7 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid) spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) - f->offloaded = 0; + clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 09b1dd8cd853..8944ceb47fe9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -88,7 +88,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -151,7 +151,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { unsigned long now = jiffies; - if (dst->is_local) + if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) @@ -182,9 +182,10 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && + nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ce2ab14ee605..36b0367ca1e0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -172,6 +172,16 @@ struct net_bridge_vlan_group { u16 pvid; }; +/* bridge fdb flags */ +enum { + BR_FDB_LOCAL, + BR_FDB_STATIC, + BR_FDB_STICKY, + BR_FDB_ADDED_BY_USER, + BR_FDB_ADDED_BY_EXT_LEARN, + BR_FDB_OFFLOADED, +}; + struct net_bridge_fdb_key { mac_addr addr; u16 vlan_id; @@ -183,12 +193,7 @@ struct net_bridge_fdb_entry { struct net_bridge_fdb_key key; struct hlist_node fdb_node; - unsigned char is_local:1, - is_static:1, - is_sticky:1, - added_by_user:1, - added_by_external_learn:1, - offloaded:1; + unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; @@ -495,6 +500,11 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) return true; } +static inline bool nbp_state_should_learn(const struct net_bridge_port *p) +{ + return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { @@ -566,7 +576,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user); + const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 921310d3cbae..015209bf44aa 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -129,15 +129,19 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; } } diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index ed91ea31978a..12a4f4d93681 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -20,7 +20,6 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - struct net_device *dev; if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; @@ -33,10 +32,22 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) else skb->pkt_type = PACKET_MULTICAST; } else { - if (xt_hooknum(par) != NF_BR_BROUTING) - dev = br_port_get_rcu(xt_in(par))->br->dev; - else + const struct net_device *dev; + + switch (xt_hooknum(par)) { + case NF_BR_BROUTING: dev = xt_in(par); + break; + case NF_BR_PRE_ROUTING: + dev = br_port_get_rcu(xt_in(par))->br->dev; + break; + default: + dev = NULL; + break; + } + + if (!dev) /* NF_BR_LOCAL_OUT */ + return info->target; if (ether_addr_equal(info->mac, dev->dev_addr)) skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 8842798c29e6..809673222382 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -33,6 +33,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; unsigned int hlen, ll_rs, mtu; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; int err; @@ -80,6 +81,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -93,7 +95,7 @@ slow_path: * This may also be a clone skbuff, we could preserve the geometry for * the copies but probably not worth the effort. */ - ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state); + ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state); while (state.left > 0) { struct sk_buff *skb2; @@ -104,6 +106,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/caif/Kconfig b/net/caif/Kconfig index eb83051c8330..b7532a79ca7a 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -13,11 +13,11 @@ menuconfig CAIF with its modems. It is accessed from user space as sockets (PF_CAIF). Say Y (or M) here if you build for a phone product (e.g. Android or - MeeGo ) that uses CAIF as transport, if unsure say N. + MeeGo) that uses CAIF as transport. If unsure say N. If you select to build it as module then CAIF_NETDEV also needs to be - built as modules. You will also need to say yes to any CAIF physical - devices that your platform requires. + built as a module. You will also need to say Y (or M) to any CAIF + physical devices that your platform requires. See Documentation/networking/caif for a further explanation on how to use and configure CAIF. @@ -37,7 +37,7 @@ config CAIF_NETDEV default CAIF ---help--- Say Y if you will be using a CAIF based GPRS network device. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say Y. @@ -48,7 +48,7 @@ config CAIF_USB default n ---help--- Say Y if you are using CAIF over USB CDC NCM. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say N. diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 13ea920600ae..ef14da50a981 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file, mask |= EPOLLRDHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 37c1040bcb9c..4d8ba701e15d 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -580,6 +580,7 @@ static int j1939_sk_release(struct socket *sock) j1939_netdev_stop(priv); } + kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; @@ -909,8 +910,10 @@ void j1939_sk_errqueue(struct j1939_session *session, memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) { + kfree_skb(skb); return; + } serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; @@ -918,8 +921,10 @@ void j1939_sk_errqueue(struct j1939_session *session, state = "ACK"; break; case J1939_ERRQUEUE_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) { + kfree_skb(skb); return; + } serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index fe000ea757ea..e5f1a56994c6 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1273,9 +1273,27 @@ j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb, static void j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb) { + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + const u8 *dat; + int len; + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) return; + dat = skb->data; + + if (skcb->addr.type == J1939_ETP) + len = j1939_etp_ctl_to_size(dat); + else + len = j1939_tp_ctl_to_size(dat); + + if (session->total_message_size != len) { + netdev_warn_once(session->priv->ndev, + "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n", + __func__, session, session->total_message_size, + len); + } + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); session->pkt.tx_acked = session->pkt.total; @@ -1432,7 +1450,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); - session = j1939_session_new(priv, skb, skb->len); + session = j1939_session_new(priv, skb, size); if (!session) { kfree_skb(skb); return NULL; diff --git a/net/core/datagram.c b/net/core/datagram.c index 4cc8dc5db2b7..da3c24ed129c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -97,7 +97,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, if (error) goto out_err; - if (sk->sk_receive_queue.prev != skb) + if (READ_ONCE(sk->sk_receive_queue.prev) != skb) goto out; /* Socket shut down? */ @@ -278,7 +278,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (sk->sk_receive_queue.prev != *last); + } while (READ_ONCE(sk->sk_receive_queue.prev) != *last); error = -EAGAIN; @@ -640,7 +640,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { - sk->sk_wmem_queued += truesize; + sk_wmem_queued_add(sk, truesize); sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); @@ -767,7 +767,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -777,7 +777,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/core/dev.c b/net/core/dev.c index bf3ed413abaf..1c799d486623 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,6 +146,7 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 +#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -228,6 +229,122 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -237,7 +354,7 @@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -255,7 +372,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -276,88 +393,6 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); -#ifdef CONFIG_LOCKDEP -/* - * register_netdevice() inits txq->_xmit_lock and sets lockdep class - * according to dev->type - */ -static const unsigned short netdev_lock_type[] = { - ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, - ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, - ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, - ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, - ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, - ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, - ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, - ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, - ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, - ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, - ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, - ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, - ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, - ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, - ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; - -static const char *const netdev_lock_name[] = { - "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; - -static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; -static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; - -static inline unsigned short netdev_lock_pos(unsigned short dev_type) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) - if (netdev_lock_type[i] == dev_type) - return i; - /* the last key is used by default */ - return ARRAY_SIZE(netdev_lock_type) - 1; -} - -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ - int i; - - i = netdev_lock_pos(dev_type); - lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], - netdev_lock_name[i]); -} - -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ - int i; - - i = netdev_lock_pos(dev->type); - lockdep_set_class_and_name(&dev->addr_list_lock, - &netdev_addr_lock_key[i], - netdev_lock_name[i]); -} -#else -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ -} -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ -} -#endif - /******************************************************************************* * * Protocol management and registration routines @@ -733,14 +768,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -758,14 +789,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -1141,8 +1168,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1158,7 +1185,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device @@ -1232,13 +1258,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1617,6 +1643,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1635,8 +1717,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1649,17 +1729,9 @@ int register_netdevice_notifier(struct notifier_block *nb) if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1668,22 +1740,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1734,6 +1793,80 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto chain_unregister; + +unlock: + rtnl_unlock(); + return err; + +chain_unregister: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + goto unlock; + + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +/** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data @@ -1745,7 +1878,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -2771,7 +2915,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); @@ -2939,12 +3083,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -2979,12 +3120,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); @@ -3467,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && qdisc_run_begin(q)) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { @@ -5663,6 +5803,26 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -5670,12 +5830,13 @@ static void napi_skb_free_stolen_head(struct sk_buff *skb) kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5707,7 +5868,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5753,26 +5914,6 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) @@ -6489,6 +6630,9 @@ struct netdev_adjacent { /* upper master flag, there can only be one master device per list */ bool master; + /* lookup ignore flag */ + bool ignore; + /* counter for the number of times this device was added to us */ u16 ref_nr; @@ -6511,7 +6655,7 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } -static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) { struct net_device *dev = data; @@ -6532,7 +6676,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -6550,7 +6694,7 @@ EXPORT_SYMBOL(netdev_has_upper_dev); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { - return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); @@ -6594,6 +6738,22 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master) && !upper->ignore) + return upper->dev; + return NULL; +} + /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device @@ -6644,6 +6804,23 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); +static struct net_device *__netdev_next_upper_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *upper; + + upper = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + *ignore = upper->ignore; + + return upper->dev; +} + static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -6661,34 +6838,111 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, return upper->dev; } +static int __netdev_walk_all_upper_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = __netdev_next_upper_dev(now, &iter, &ignore); + if (!udev) + break; + if (ignore) + continue; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { - struct net_device *udev; - struct list_head *iter; - int ret; + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; - for (iter = &dev->adj_list.upper, - udev = netdev_next_upper_dev_rcu(dev, &iter); - udev; - udev = netdev_next_upper_dev_rcu(dev, &iter)) { - /* first is the upper device itself */ - ret = fn(udev, data); - if (ret) - return ret; + now = dev; + iter = &dev->adj_list.upper; - /* then look at all of its upper devices */ - ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); - if (ret) - return ret; + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = netdev_next_upper_dev_rcu(now, &iter); + if (!udev) + break; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); +static bool __netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + ASSERT_RTNL(); + + return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, + upper_dev); +} + /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list @@ -6785,34 +7039,119 @@ static struct net_device *netdev_next_lower_dev(struct net_device *dev, return lower->dev; } +static struct net_device *__netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + *ignore = lower->ignore; + + return lower->dev; +} + int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { - struct net_device *ldev; - struct list_head *iter; - int ret; + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; - for (iter = &dev->adj_list.lower, - ldev = netdev_next_lower_dev(dev, &iter); - ldev; - ldev = netdev_next_lower_dev(dev, &iter)) { - /* first is the lower device itself */ - ret = fn(ldev, data); - if (ret) - return ret; + now = dev; + iter = &dev->adj_list.lower; - /* then look at all of its lower devices */ - ret = netdev_walk_all_lower_dev(ldev, fn, data); - if (ret) - return ret; + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); +static int __netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = __netdev_next_lower_dev(now, &iter, &ignore); + if (!ldev) + break; + if (ignore) + continue; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { @@ -6827,28 +7166,99 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } -int netdev_walk_all_lower_dev_rcu(struct net_device *dev, - int (*fn)(struct net_device *dev, - void *data), - void *data) +static u8 __netdev_upper_depth(struct net_device *dev) +{ + struct net_device *udev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.upper, + udev = __netdev_next_upper_dev(dev, &iter, &ignore); + udev; + udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < udev->upper_level) + max_depth = udev->upper_level; + } + + return max_depth; +} + +static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; - int ret; + u8 max_depth = 0; + bool ignore; for (iter = &dev->adj_list.lower, - ldev = netdev_next_lower_dev_rcu(dev, &iter); + ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; - ldev = netdev_next_lower_dev_rcu(dev, &iter)) { - /* first is the lower device itself */ - ret = fn(ldev, data); - if (ret) - return ret; + ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < ldev->lower_level) + max_depth = ldev->lower_level; + } - /* then look at all of its lower devices */ - ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); - if (ret) - return ret; + return max_depth; +} + +static int __netdev_update_upper_level(struct net_device *dev, void *data) +{ + dev->upper_level = __netdev_upper_depth(dev) + 1; + return 0; +} + +static int __netdev_update_lower_level(struct net_device *dev, void *data) +{ + dev->lower_level = __netdev_lower_depth(dev) + 1; + return 0; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, data); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; } return 0; @@ -6952,6 +7362,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->master = master; adj->ref_nr = 1; adj->private = private; + adj->ignore = false; dev_hold(adj_dev); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", @@ -7102,14 +7513,17 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (netdev_has_upper_dev(upper_dev, dev)) + if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; + if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) + return -EMLINK; + if (!master) { - if (netdev_has_upper_dev(dev, upper_dev)) + if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { - master_dev = netdev_master_upper_dev_get(dev); + master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } @@ -7131,6 +7545,13 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) goto rollback; + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, NULL); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + NULL); + return 0; rollback: @@ -7213,9 +7634,96 @@ void netdev_upper_dev_unlink(struct net_device *dev, call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, NULL); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + NULL); } EXPORT_SYMBOL(netdev_upper_dev_unlink); +static void __netdev_adjacent_dev_set(struct net_device *upper_dev, + struct net_device *lower_dev, + bool val) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); + if (adj) + adj->ignore = val; + + adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); + if (adj) + adj->ignore = val; +} + +static void netdev_adjacent_dev_disable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, true); +} + +static void netdev_adjacent_dev_enable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, false); +} + +int netdev_adjacent_change_prepare(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + int err; + + if (!new_dev) + return 0; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_disable(dev, old_dev); + + err = netdev_upper_dev_link(new_dev, dev, extack); + if (err) { + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL(netdev_adjacent_change_prepare); + +void netdev_adjacent_change_commit(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + if (!new_dev || !old_dev) + return; + + if (new_dev == old_dev) + return; + + netdev_adjacent_dev_enable(dev, old_dev); + netdev_upper_dev_unlink(old_dev, dev); +} +EXPORT_SYMBOL(netdev_adjacent_change_commit); + +void netdev_adjacent_change_abort(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + if (!new_dev) + return; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + + netdev_upper_dev_unlink(new_dev, dev); +} +EXPORT_SYMBOL(netdev_adjacent_change_abort); + /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device @@ -7329,25 +7837,6 @@ void *netdev_lower_dev_get_private(struct net_device *dev, EXPORT_SYMBOL(netdev_lower_dev_get_private); -int dev_get_nest_level(struct net_device *dev) -{ - struct net_device *lower = NULL; - struct list_head *iter; - int max_nest = -1; - int nest; - - ASSERT_RTNL(); - - netdev_for_each_lower_dev(dev, lower, iter) { - nest = dev_get_nest_level(lower); - if (max_nest < nest) - max_nest = nest; - } - - return max_nest + 1; -} -EXPORT_SYMBOL(dev_get_nest_level); - /** * netdev_lower_change - Dispatch event about lower device state change * @lower_dev: device @@ -8154,7 +8643,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } - if (prog->aux->id == prog_id) { + /* prog->aux->id may be 0 for orphaned device-bound progs */ + if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); return 0; } @@ -8264,6 +8754,9 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -8619,7 +9112,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -8666,6 +9159,43 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static void netdev_register_lockdep_key(struct net_device *dev) +{ + lockdep_register_key(&dev->qdisc_tx_busylock_key); + lockdep_register_key(&dev->qdisc_running_key); + lockdep_register_key(&dev->qdisc_xmit_lock_key); + lockdep_register_key(&dev->addr_list_lock_key); +} + +static void netdev_unregister_lockdep_key(struct net_device *dev) +{ + lockdep_unregister_key(&dev->qdisc_tx_busylock_key); + lockdep_unregister_key(&dev->qdisc_running_key); + lockdep_unregister_key(&dev->qdisc_xmit_lock_key); + lockdep_unregister_key(&dev->addr_list_lock_key); +} + +void netdev_update_lockdep_key(struct net_device *dev) +{ + struct netdev_queue *queue; + int i; + + lockdep_unregister_key(&dev->qdisc_xmit_lock_key); + lockdep_unregister_key(&dev->addr_list_lock_key); + + lockdep_register_key(&dev->qdisc_xmit_lock_key); + lockdep_register_key(&dev->addr_list_lock_key); + + lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); + for (i = 0; i < dev->num_tx_queues; i++) { + queue = netdev_get_tx_queue(dev, i); + + lockdep_set_class(&queue->_xmit_lock, + &dev->qdisc_xmit_lock_key); + } +} +EXPORT_SYMBOL(netdev_update_lockdep_key); + /** * register_netdevice - register a network device * @dev: device to register @@ -8700,12 +9230,17 @@ int register_netdevice(struct net_device *dev) BUG_ON(!net); spin_lock_init(&dev->addr_list_lock); - netdev_set_addr_lockdep_class(dev); + lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -8827,6 +9362,8 @@ out: return ret; err_uninit: + if (dev->name_node) + netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) @@ -9210,8 +9747,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); + netdev_register_lockdep_key(dev); + dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->upper_level = 1; + dev->lower_level = 1; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); @@ -9292,6 +9833,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + netdev_unregister_lockdep_key(dev); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); @@ -9460,7 +10003,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - new_nsid = peernet2id_alloc(dev_net(dev), net); + new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) new_ifindex = dev_new_index(net); @@ -9635,6 +10178,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6393ba930097..2f949b5a1eb9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/devlink.c b/net/core/devlink.c index 0fbcd44aa64f..1338f5fbc7d2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -95,16 +95,25 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -static struct net *devlink_net(const struct devlink *devlink) +struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } +EXPORT_SYMBOL_GPL(devlink_net); -static void devlink_net_set(struct devlink *devlink, struct net *net) +static void __devlink_net_set(struct devlink *devlink, struct net *net) { write_pnet(&devlink->_net, net); } +void devlink_net_set(struct devlink *devlink, struct net *net) +{ + if (WARN_ON(devlink->registered)) + return; + __devlink_net_set(devlink, net); +} +EXPORT_SYMBOL_GPL(devlink_net_set); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -434,8 +443,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - devlink = devlink_get_from_info(info); - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + /* When devlink changes netns, it would not be found + * by devlink_get_from_info(). So try if it is stored first. + */ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + devlink = info->user_ptr[0]; + } else { + devlink = devlink_get_from_info(info); + WARN_ON(IS_ERR(devlink)); + } + if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -1035,7 +1052,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1058,6 +1075,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1233,7 +1253,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1256,6 +1276,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1460,7 +1483,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1485,6 +1508,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -2674,6 +2700,72 @@ devlink_resources_validate(struct devlink *devlink, return err; } +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd); + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *dest_net) +{ + struct devlink_param_item *param_item; + + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); + + __devlink_net_set(devlink, dest_net); + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); +} + static bool devlink_reload_supported(struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; @@ -2694,9 +2786,30 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static int devlink_reload(struct devlink *devlink, struct net *dest_net, + struct netlink_ext_ack *extack) +{ + int err; + + if (!devlink->reload_enabled) + return -EOPNOTSUPP; + + err = devlink->ops->reload_down(devlink, !!dest_net, extack); + if (err) + return err; + + if (dest_net && !net_eq(dest_net, devlink_net(devlink))) + devlink_reload_netns_change(devlink, dest_net); + + err = devlink->ops->reload_up(devlink, extack); + devlink_reload_failed_set(devlink, !!err); + return err; +} + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct net *dest_net = NULL; int err; if (!devlink_reload_supported(devlink)) @@ -2707,11 +2820,20 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - err = devlink->ops->reload_down(devlink, info->extack); - if (err) - return err; - err = devlink->ops->reload_up(devlink, info->extack); - devlink_reload_failed_set(devlink, !!err); + + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, info->extack); + + if (dest_net) + put_net(dest_net); + return err; } @@ -3160,7 +3282,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3188,6 +3310,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3416,7 +3541,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3449,6 +3574,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3823,29 +3951,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = 0; + struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto out_free; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) { @@ -3922,7 +4040,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); - kfree(attrs); return skb->len; @@ -3932,8 +4049,6 @@ out_unlock: mutex_unlock(&devlink->lock); out_dev: mutex_unlock(&devlink_mutex); -out_free: - kfree(attrs); return err; } @@ -4071,7 +4186,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -4099,6 +4214,9 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -4301,12 +4419,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) +static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) @@ -4414,19 +4531,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) + const void *value, u32 value_len) { + u32 data_size; + u32 offset; int err; - err = devlink_fmsg_pair_nest_start(fmsg, name); + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); if (err) return err; - err = devlink_fmsg_binary_put(fmsg, value, value_len); - if (err) - return err; + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > DEVLINK_FMSG_MAX_SIZE) + data_size = DEVLINK_FMSG_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + return err; + } - err = devlink_fmsg_pair_nest_end(fmsg); + err = devlink_fmsg_arr_pair_nest_end(fmsg); if (err) return err; @@ -4737,14 +4861,17 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, struct netlink_ext_ack *extack) { int err; + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + if (!reporter->ops->recover) return -EOPNOTSUPP; - err = reporter->ops->recover(reporter, priv_ctx); + err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; @@ -4765,7 +4892,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter) } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { int err; @@ -4786,7 +4914,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx); + priv_ctx, extack); if (err) goto dump_err; @@ -4833,11 +4961,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); + devlink_health_do_dump(reporter, priv_ctx, NULL); mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, + priv_ctx, NULL); return 0; } @@ -4872,21 +5001,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; struct devlink *devlink; - struct nlattr **attrs; - int err; - - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return NULL; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto free; mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); @@ -4895,12 +5013,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); mutex_unlock(&devlink_mutex); - kfree(attrs); return reporter; unlock: mutex_unlock(&devlink_mutex); -free: - kfree(attrs); return NULL; } @@ -5089,7 +5204,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL, info->extack); devlink_health_reporter_put(reporter); return err; @@ -5122,7 +5237,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (err) goto out; - err = reporter->ops->diagnose(reporter, fmsg); + err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; @@ -5157,7 +5272,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, } mutex_lock(&reporter->dump_lock); if (!start) { - err = devlink_health_do_dump(reporter, NULL); + err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; cb->args[1] = reporter->dump_ts; @@ -5798,6 +5913,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6028,7 +6146,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, @@ -6076,7 +6195,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | @@ -6160,7 +6280,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; - devlink_net_set(devlink, &init_net); + __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); @@ -6186,6 +6306,7 @@ int devlink_register(struct devlink *devlink, struct device *dev) { mutex_lock(&devlink_mutex); devlink->dev = dev; + devlink->registered = true; list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -6201,6 +6322,8 @@ EXPORT_SYMBOL_GPL(devlink_register); void devlink_unregister(struct devlink *devlink) { mutex_lock(&devlink_mutex); + WARN_ON(devlink_reload_supported(devlink) && + devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); list_del(&devlink->list); mutex_unlock(&devlink_mutex); @@ -6208,6 +6331,41 @@ void devlink_unregister(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_unregister); /** + * devlink_reload_enable - Enable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at end of device initialization + * process when reload operation is supported. + */ +void devlink_reload_enable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->reload_enabled = true; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_enable); + +/** + * devlink_reload_disable - Disable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at the beginning of device cleanup + * process when reload operation is supported. + */ +void devlink_reload_disable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + /* Mutex is taken which ensures that no reload operation is in + * progress while setting up forbidded flag. + */ + devlink->reload_enabled = false; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_disable); + +/** * devlink_free - Free devlink instance resources * * @devlink: devlink @@ -7495,6 +7653,21 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8065,9 +8238,43 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (net_eq(devlink_net(devlink), net)) { + if (WARN_ON(!devlink_reload_supported(devlink))) + continue; + err = devlink_reload(devlink, &init_net, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } + } + mutex_unlock(&devlink_mutex); +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + static int __init devlink_init(void) { - return genl_register_family(&devlink_nl_family); + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; } subsys_initcall(devlink_init); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c763106c73fc..cd9bc67381b2 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1396,11 +1396,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr) static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { - struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; + memset(&wol, 0, sizeof(struct ethtool_wolinfo)); + wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 470a606d5e8d..fc96259807b6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -12,17 +12,15 @@ static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; }; -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; - info->net = net; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } @@ -31,106 +29,100 @@ EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; - info->net = net; - err = atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -static unsigned int fib_seq_sum(void) +static unsigned int fib_seq_sum(struct net *net) { - struct fib_notifier_net *fn_net; + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - struct net *net; rtnl_lock(); - down_read(&net_rwsem); - for_each_net(net) { - fn_net = net_generic(net, fib_notifier_net_id); - rcu_read_lock(); - list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - if (!try_module_get(ops->owner)) - continue; - fib_seq += ops->fib_seq_read(net); - module_put(ops->owner); - } - rcu_read_unlock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); } - up_read(&net_rwsem); + rcu_read_unlock(); rtnl_unlock(); return fib_seq; } -static int fib_net_dump(struct net *net, struct notifier_block *nb) +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; + int err = 0; + rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - int err; - if (!try_module_get(ops->owner)) continue; - err = ops->fib_dump(net, nb); + err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) - return err; + goto unlock; } - return 0; +unlock: + rcu_read_unlock(); + + return err; } -static bool fib_dump_is_consistent(struct notifier_block *nb, +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) return true; - atomic_notifier_chain_unregister(&fib_chain, nb); + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) { int retries = 0; int err; do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - rcu_read_lock(); - for_each_net_rcu(net) { - err = fib_net_dump(net, nb); - if (err) - goto err_fib_net_dump; - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; - -err_fib_net_dump: - rcu_read_unlock(); - return err; } EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); @@ -181,6 +173,7 @@ static int __net_init fib_notifier_net_init(struct net *net) struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index dd220ce7ca7a..3e7e15278c46 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -321,16 +321,18 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib_rule *rule, int family) + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, + .info.extack = extack, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -350,20 +352,25 @@ static int call_fib_rule_notifiers(struct net *net, } /* Called with rcu_read_lock() */ -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; + int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, - family); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } rules_ops_put(ops); - return 0; + return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..fc303abec8fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2245,7 +2245,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg.copy[i] && bytes_sg_total <= len) + if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2450,7 +2450,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - msg->sg.copy[new] = false; + __clear_bit(new, &msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static u32 bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -4252,12 +4265,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); break; case SO_MAX_PACING_RATE: /* 32bit version */ if (val != ~0U) @@ -4274,7 +4289,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7c09d87d3269..ca871657a4c4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,19 +114,50 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, { struct bpf_prog *attached; struct net *net; + int ret = 0; net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); + if (attached) + bpf_prog_put(attached); +out: mutex_unlock(&flow_dissector_mutex); - return 0; + return ret; } int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) @@ -147,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -203,6 +213,72 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @toff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -853,7 +929,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -910,7 +985,10 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(init_net.flow_dissector_prog); + + if (!attached) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { struct bpf_flow_keys flow_keys; @@ -1295,6 +1373,12 @@ ip_proto_again: data, nhoff, hlen); break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + default: break; } @@ -1308,14 +1392,6 @@ ip_proto_again: data, hlen); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } - /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: @@ -1350,32 +1426,23 @@ out_bad: } EXPORT_SYMBOL(__skb_flow_dissect); -static u32 hashrnd __read_mostly; +static siphash_key_t hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, - u32 keyval) -{ - return jhash2(words, length, keyval); -} - -static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) +static const void *flow_keys_hash_start(const struct flow_keys *flow) { - const void *p = flow; - - BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); + return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); - BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != - sizeof(*flow) - sizeof(flow->addrs)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: @@ -1388,7 +1455,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) diff -= sizeof(flow->addrs.tipckey); break; } - return (sizeof(*flow) - diff) / sizeof(u32); + return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) @@ -1421,6 +1488,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; @@ -1454,14 +1524,15 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) } } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, + const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); - hash = __flow_hash_words(flow_keys_hash_start(keys), - flow_keys_hash_length(keys), keyval); + hash = siphash(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -1471,12 +1542,13 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); - return __flow_hash_from_keys(keys, hashrnd); + return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, - struct flow_keys *keys, u32 keyval) + struct flow_keys *keys, + const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); @@ -1524,7 +1596,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - return __flow_hash_from_keys(&keys, hashrnd); + return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); @@ -1544,13 +1616,14 @@ void __skb_get_hash(struct sk_buff *skb) __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, hashrnd); + hash = ___skb_get_hash(skb, &keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +__u32 skb_get_hash_perturb(const struct sk_buff *skb, + const siphash_key_t *perturb) { struct flow_keys keys; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index bfe7bdd4c340..80dbf2f4016e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -48,7 +48,7 @@ struct net_rate_estimator { u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; - u32 last_packets; + u64 last_packets; u64 last_bytes; u64 avpps; @@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t) brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); + rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 36888f5e09eb..1d653fbfcf52 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - u64 bytes; - u32 packets; + u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); @@ -176,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running, if (d->tail) { struct gnet_stats_basic sb; + int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, type, &sb, sizeof(sb), - TCA_STATS_PAD); + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats.packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, + sizeof(bstats.packets), TCA_STATS_PAD); } return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index f93785e5833c..74cfb8b5ab33 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -88,11 +88,16 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { + struct net_device *dev = skb_dst(skb)->dev; struct iphdr *iph = ip_hdr(skb); + dev_hold(dev); + skb_dst_drop(skb); err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb_dst(skb)->dev); + iph->tos, dev); + dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { + skb_dst_drop(skb); err = ipv6_stub->ipv6_route_input(skb); } else { err = -EAFNOSUPPORT; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5480edff0c86..652da6369037 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1197,7 +1197,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1476,7 +1476,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { @@ -2052,8 +2052,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 36347933ec3a..6bbd06f7dc7d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff struct hlist_head *h; unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { if (++count == offset) return dev; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a0e0d298c991..39402840025e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -245,11 +245,12 @@ static int __peernet2id(struct net *net, struct net *peer) return __peernet2id_alloc(net, peer, &no); } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ -int peernet2id_alloc(struct net *net, struct net *peer) +int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { bool alloc = false, alive = false; int id; @@ -268,7 +269,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); if (alive) put_net(peer); return id; @@ -478,6 +479,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + key_remove_domain(net->key_domain); put_user_ns(user_ns); net_drop_ns(net); dec_ucounts: @@ -532,7 +534,8 @@ static void unhash_nsid(struct net *net, struct net *last) idr_remove(&tmp->netns_ids, id); spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + GFP_KERNEL); if (tmp == last) break; } @@ -764,7 +767,8 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, err = alloc_netid(net, peer, nsid); spin_unlock_bh(&net->nsid_lock); if (err >= 0) { - rtnl_net_notifyid(net, RTM_NEWNSID, err); + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; @@ -1051,16 +1055,19 @@ end: return err < 0 ? err : skb->len; } -static void rtnl_net_notifyid(struct net *net, int cmd, int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; - msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; @@ -1068,7 +1075,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (err < 0) goto err_out; - rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 48b1e429857c..294bfcf0ce0e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3404,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) HARD_TX_LOCK(odev, txq, smp_processor_id()); if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { - ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index c9bb00008528..f35c2e998406 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - tcp_sk(sk)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee6460f8275..000eddb1207d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void) return xdp_size; } +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1027,6 +1040,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + 0; } @@ -1523,7 +1537,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, - struct net *src_net) + struct net *src_net, gfp_t gfp) { bool put_iflink = false; @@ -1531,7 +1545,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(src_net, link_net); + int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; @@ -1584,12 +1598,48 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, - int tgt_netnsid) + int tgt_netnsid, gfp_t gfp) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1681,7 +1731,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net)) + if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) goto nla_put_failure; if (new_nsid && @@ -1697,6 +1747,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure_rcu; rcu_read_unlock(); + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1750,6 +1803,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2001,7 +2057,7 @@ walk_entries: NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, - netnsid); + netnsid, GFP_KERNEL); if (err < 0) { if (likely(skb->len)) @@ -2355,6 +2411,7 @@ static int do_set_master(struct net_device *dev, int ifindex, err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; + netdev_update_lockdep_key(dev); } else { return -EOPNOTSUPP; } @@ -2722,6 +2779,26 @@ errout: return err; } +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *ifname_attr, + struct nlattr *altifname_attr, + char *ifname) +{ + char buffer[ALTIFNAMSIZ]; + + if (!ifname) { + ifname = buffer; + if (ifname_attr) + nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else if (altifname_attr) + nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + else + return NULL; + } + + return __dev_get_by_name(net, ifname); +} + static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2750,8 +2827,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else goto errout; @@ -2824,7 +2901,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; @@ -2838,9 +2914,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); @@ -2852,8 +2925,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -3024,12 +3098,10 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - } + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); + else + dev = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); @@ -3291,6 +3363,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, switch (i) { case IFLA_IFNAME: + case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; @@ -3309,7 +3382,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -3332,9 +3404,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -3342,8 +3411,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else goto out; @@ -3359,7 +3429,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, 0, netnsid); + 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -3373,6 +3443,100 @@ out: return err; } +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + alt_ifname = nla_data(attr); + if (cmd == RTM_NEWLINKPROP) { + alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + err = netdev_name_node_alt_create(dev, alt_ifname); + if (err) { + kfree(alt_ifname); + return err; + } + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + if (err) + return err; + } else { + WARN_ON(1); + return 0; + } + + *changed = true; + return 0; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -3471,7 +3635,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, new_ifindex, -1); + new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3916,7 +4080,7 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { - NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request"); + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } @@ -5331,6 +5495,9 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 529133611ea2..867e61df00db 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4415,7 +4415,7 @@ static void skb_set_err_queue(struct sk_buff *skb) int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) + (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); @@ -5477,12 +5477,14 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * @mac_len: length of the MAC header * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len) { struct mpls_shim_hdr *lse; int err; @@ -5499,15 +5501,15 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) return err; if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_network_header(skb, mac_len); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; @@ -5526,29 +5528,30 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto) +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) - return -EINVAL; + return 0; - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); if (skb->dev && skb->dev->type == ARPHRD_ETHER) { struct ethhdr *hdr; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index cf390e0aa73d..ad31e4e53d0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -270,18 +270,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; out: - /* If we trim data before curr pointer update copybreak and current - * so that any future copy operations start at new copy location. + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. * However trimed data that has not yet been used in a copy op * does not require an update. */ - if (msg->sg.curr >= i) { + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } - sk_msg_iter_var_next(i); - msg->sg.end = i; } EXPORT_SYMBOL_GPL(sk_msg_trim); diff --git a/net/core/sock.c b/net/core/sock.c index fac2b4d80de5..71787f7c4f8c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; - int size; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; @@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; - size = sizeof(old_tv); - } else { - *(struct __kernel_sock_timeval *)optval = tv; - size = sizeof(tv); + return sizeof(old_tv); } - return size; + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); } static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) @@ -522,7 +519,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; @@ -687,7 +684,8 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -785,7 +783,8 @@ set_sndbuf: */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -831,7 +830,8 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_RCVBUFFORCE: @@ -974,7 +974,7 @@ set_rcvbuf: if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: @@ -1125,7 +1125,7 @@ set_rcvbuf: break; } case SO_INCOMING_CPU: - sk->sk_incoming_cpu = val; + WRITE_ONCE(sk->sk_incoming_cpu, val); break; case SO_CNX_ADVICE: @@ -1474,7 +1474,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_INCOMING_CPU: - v.val = sk->sk_incoming_cpu; + v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: @@ -2088,8 +2088,10 @@ EXPORT_SYMBOL(sock_i_ino); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -2190,7 +2192,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; @@ -2225,7 +2227,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -2334,8 +2336,8 @@ static void sk_leave_memory_pressure(struct sock *sk) } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; - if (memory_pressure && *memory_pressure) - *memory_pressure = 0; + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); } } @@ -2806,7 +2808,7 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | @@ -3011,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt);; + sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } @@ -3038,7 +3040,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, } EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; @@ -3204,13 +3206,13 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } @@ -3596,7 +3598,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; - return !skb_queue_empty(&sk->sk_receive_queue) || + return !skb_queue_empty_lockless(&sk->sk_receive_queue) || sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); diff --git a/net/core/xdp.c b/net/core/xdp.c index d7bf62ffbb5e..20781ad5f9c3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame - * while still leveraging this protection. The @napi_direct boolian + * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d9b4200ed12d..d19557c6d04b 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -117,7 +117,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, inet->inet_dport); - inet->inet_id = dp->dccps_iss ^ jiffies; + inet->inet_id = prandom_u32(); err = dccp_connect(sk); rt = NULL; @@ -416,7 +416,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; - newinet->inet_id = jiffies; + newinet->inet_id = prandom_u32(); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5bad08dc4316..a52e8ba1ced0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0ea75286abf4..e19a92a62e14 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1091,7 +1091,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } cb = DN_SKB_CB(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); if (newsk == NULL) { release_sock(sk); @@ -1205,7 +1205,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wai struct dn_scp *scp = DN_SK(sk); __poll_t mask = datagram_poll(file, sock, wait); - if (!skb_queue_empty(&scp->other_receive_queue)) + if (!skb_queue_empty_lockless(&scp->other_receive_queue)) mask |= EPOLLRDBAND; return mask; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index e4161e0c86aa..c68503a18025 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -328,7 +328,7 @@ static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) return; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 29e2bd5cc5af..136612792c08 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -20,7 +20,7 @@ if NET_DSA # tagging formats config NET_DSA_TAG_8021Q - tristate "Tag driver for switches using custom 802.1Q VLAN headers" + tristate select VLAN_8021Q help Unlike the other tagging protocols, the 802.1Q config option simply diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 43120a3fb06f..17281fec710c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return dsa_is_user_port(ds, p) && ds->ports[p].slave; + const struct dsa_port *dp = dsa_to_port(ds, p); + + return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].slave); + ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].slave); + ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -329,6 +331,91 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_dsa_notifiers); +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_get(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_get); + +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_set(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_set); + +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + return devlink_params_register(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_register); + +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + devlink_params_unregister(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); + +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + return devlink_resource_register(ds->devlink, resource_name, + resource_size, resource_id, + parent_resource_id, + size_params); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds) +{ + devlink_resources_unregister(ds->devlink, NULL); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + return devlink_resource_occ_get_register(ds->devlink, resource_id, + occ_get, occ_get_priv); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); + +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id) +{ + devlink_resource_occ_get_unregister(ds->devlink, resource_id); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 73002022c9d8..9ef2caa13f27 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -45,8 +45,12 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) dst->index = index; + INIT_LIST_HEAD(&dst->rtable); + + INIT_LIST_HEAD(&dst->ports); + INIT_LIST_HEAD(&dst->list); - list_add_tail(&dsa_tree_list, &dst->list); + list_add_tail(&dst->list, &dsa_tree_list); kref_init(&dst->refcount); @@ -111,24 +115,38 @@ static bool dsa_port_is_user(struct dsa_port *dp) static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + list_for_each_entry(dp, &dst->ports, list) + if (dp->dn == dn) + return dp; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + return NULL; +} - if (dp->dn == dn) - return dp; - } - } +struct dsa_link *dsa_link_touch(struct dsa_port *dp, struct dsa_port *link_dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst; + struct dsa_link *dl; - return NULL; + dst = ds->dst; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp == dp && dl->link_dp == link_dp) + return dl; + + dl = kzalloc(sizeof(*dl), GFP_KERNEL); + if (!dl) + return NULL; + + dl->dp = dp; + dl->link_dp = link_dp; + + INIT_LIST_HEAD(&dl->list); + list_add_tail(&dl->list, &dst->rtable); + + return dl; } static bool dsa_port_setup_routing_table(struct dsa_port *dp) @@ -138,6 +156,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) struct device_node *dn = dp->dn; struct of_phandle_iterator it; struct dsa_port *link_dp; + struct dsa_link *dl; int err; of_for_each_phandle(&it, err, dn, "link", NULL, 0) { @@ -147,24 +166,22 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) return false; } - ds->rtable[link_dp->ds->index] = dp->index; + dl = dsa_link_touch(dp, link_dp); + if (!dl) { + of_node_put(it.node); + return false; + } } return true; } -static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { bool complete = true; struct dsa_port *dp; - int i; - - for (i = 0; i < DSA_MAX_SWITCHES; i++) - ds->rtable[i] = DSA_RTABLE_NONE; - - for (i = 0; i < ds->num_ports; i++) { - dp = &ds->ports[i]; + list_for_each_entry(dp, &dst->ports, list) { if (dsa_port_is_dsa(dp)) { complete = dsa_port_setup_routing_table(dp); if (!complete) @@ -175,81 +192,42 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) return complete; } -static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - bool complete = true; - int device; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - complete = dsa_switch_setup_routing_table(ds); - if (!complete) - break; - } - - return complete; -} - static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_cpu(dp)) - return dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + return dp; return NULL; } static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - struct dsa_port *dp; - int device, port; + struct dsa_port *cpu_dp, *dp; - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = dsa_tree_find_first_cpu(dst); - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); + cpu_dp = dsa_tree_find_first_cpu(dst); + if (!cpu_dp) { + pr_err("DSA: tree %d has no CPU port\n", dst->index); return -EINVAL; } /* Assign the default CPU port to all ports of the fabric */ - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = dst->cpu_dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; return 0; } static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) { - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = NULL; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = NULL; } static int dsa_port_setup(struct dsa_port *dp) @@ -265,6 +243,9 @@ static int dsa_port_setup(struct dsa_port *dp) bool dsa_port_enabled = false; int err = 0; + if (dp->setup) + return 0; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -333,14 +314,21 @@ static int dsa_port_setup(struct dsa_port *dp) dsa_port_link_unregister_of(dp); if (err && devlink_port_registered) devlink_port_unregister(dlp); + if (err) + return err; - return err; + dp->setup = true; + + return 0; } static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) + return; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; @@ -363,11 +351,17 @@ static void dsa_port_teardown(struct dsa_port *dp) } break; } + + dp->setup = false; } static int dsa_switch_setup(struct dsa_switch *ds) { - int err = 0; + struct dsa_devlink_priv *dl_priv; + int err; + + if (ds->setup) + return 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -379,9 +373,11 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, 0); + ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); if (!ds->devlink) return -ENOMEM; + dl_priv = devlink_priv(ds->devlink); + dl_priv->ds = ds; err = devlink_register(ds->devlink, ds->dev); if (err) @@ -395,6 +391,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + devlink_params_publish(ds->devlink); + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { @@ -409,6 +407,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto unregister_notifier; } + ds->setup = true; + return 0; unregister_notifier: @@ -424,6 +424,9 @@ free_devlink: static void dsa_switch_teardown(struct dsa_switch *ds) { + if (!ds->setup) + return; + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); @@ -438,95 +441,72 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->devlink = NULL; } + ds->setup = false; } static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port, i; - int err = 0; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + int err; - err = dsa_switch_setup(ds); + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_switch_setup(dp->ds); if (err) - goto switch_teardown; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + goto teardown; + } - err = dsa_port_setup(dp); - if (err) - goto ports_teardown; - } + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_port_setup(dp); + if (err) + goto teardown; } return 0; -ports_teardown: - for (i = 0; i < port; i++) - dsa_port_teardown(&ds->ports[i]); +teardown: + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_switch_teardown(ds); - -switch_teardown: - for (i = 0; i < device; i++) { - ds = dst->ds[i]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); } static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; + int err; - /* DSA currently supports a single pair of CPU port and master device */ - return dsa_master_setup(master, cpu_dp); + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_cpu(dp)) { + err = dsa_master_setup(dp->master, dp); + if (err) + return err; + } + } + + return 0; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; - return dsa_master_teardown(master); + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + dsa_master_teardown(dp->master); } static int dsa_tree_setup(struct dsa_switch_tree *dst) @@ -572,6 +552,8 @@ teardown_default_cpu: static void dsa_tree_teardown(struct dsa_switch_tree *dst) { + struct dsa_link *dl, *next; + if (!dst->setup) return; @@ -581,39 +563,36 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } + pr_info("DSA: tree %d torn down\n", dst->index); dst->setup = false; } -static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, - unsigned int index) +static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { - dsa_tree_teardown(dst); + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp; - dst->ds[index] = NULL; - dsa_tree_put(dst); -} + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == index) + return dp; -static int dsa_tree_add_switch(struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - unsigned int index = ds->index; - int err; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return NULL; - if (dst->ds[index]) - return -EBUSY; + dp->ds = ds; + dp->index = index; - dsa_tree_get(dst); - dst->ds[index] = ds; + INIT_LIST_HEAD(&dp->list); + list_add_tail(&dp->list, &dst->ports); - err = dsa_tree_setup(dst); - if (err) { - dst->ds[index] = NULL; - dsa_tree_put(dst); - } - - return err; + return dp; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -708,7 +687,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, goto out_put_node; } - dp = &ds->ports[reg]; + dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); if (err) @@ -732,8 +711,6 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return sz; ds->index = m[1]; - if (ds->index >= DSA_MAX_SWITCHES) - return -EINVAL; ds->dst = dsa_tree_touch(m[0]); if (!ds->dst) @@ -742,6 +719,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return 0; } +static int dsa_switch_touch_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int port; + + for (port = 0; port < ds->num_ports; port++) { + dp = dsa_port_touch(ds, port); + if (!dp) + return -ENOMEM; + } + + return 0; +} + static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) { int err; @@ -750,6 +741,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) if (err) return err; + err = dsa_switch_touch_ports(ds); + if (err) + return err; + return dsa_switch_parse_ports_of(ds, dn); } @@ -787,7 +782,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, for (i = 0; i < DSA_MAX_PORTS; i++) { name = cd->port_names[i]; dev = cd->netdev[i]; - dp = &ds->ports[i]; + dp = dsa_to_port(ds, i); if (!name) continue; @@ -807,6 +802,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) { + int err; + ds->cd = cd; /* We don't support interconnected switches nor multiple trees via @@ -817,22 +814,29 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) if (!ds->dst) return -ENOMEM; - return dsa_switch_parse_ports(ds, cd); -} - -static int dsa_switch_add(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst = ds->dst; + err = dsa_switch_touch_ports(ds); + if (err) + return err; - return dsa_tree_add_switch(dst, ds); + return dsa_switch_parse_ports(ds, cd); } static int dsa_switch_probe(struct dsa_switch *ds) { - struct dsa_chip_data *pdata = ds->dev->platform_data; - struct device_node *np = ds->dev->of_node; + struct dsa_switch_tree *dst; + struct dsa_chip_data *pdata; + struct device_node *np; int err; + if (!ds->dev) + return -ENODEV; + + pdata = ds->dev->platform_data; + np = ds->dev->of_node; + + if (!ds->num_ports) + return -EINVAL; + if (np) err = dsa_switch_parse_of(ds, np); else if (pdata) @@ -843,29 +847,14 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (err) return err; - return dsa_switch_add(ds); -} - -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) -{ - struct dsa_switch *ds; - int i; - - ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL); - if (!ds) - return NULL; - - ds->dev = dev; - ds->num_ports = n; - - for (i = 0; i < ds->num_ports; ++i) { - ds->ports[i].index = i; - ds->ports[i].ds = ds; - } + dst = ds->dst; + dsa_tree_get(dst); + err = dsa_tree_setup(dst); + if (err) + dsa_tree_put(dst); - return ds; + return err; } -EXPORT_SYMBOL_GPL(dsa_switch_alloc); int dsa_register_switch(struct dsa_switch *ds) { @@ -883,9 +872,16 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - unsigned int index = ds->index; + struct dsa_port *dp, *next; - dsa_tree_remove_switch(dst, index); + dsa_tree_teardown(dst); + + list_for_each_entry_safe(dp, next, &dst->ports, list) { + list_del(&dp->list); + kfree(dp); + } + + dsa_tree_put(dst); } void dsa_unregister_switch(struct dsa_switch *ds) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12f8c7ee4dd8..53e7577896b6 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -104,25 +104,14 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; - struct dsa_switch *ds; - struct dsa_port *slave_port; + struct dsa_port *dp; - if (device < 0 || device >= DSA_MAX_SWITCHES) - return NULL; + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->index == device && dp->index == port && + dp->type == DSA_PORT_TYPE_USER) + return dp->slave; - ds = dst->ds[device]; - if (!ds) - return NULL; - - if (port < 0 || port >= ds->num_ports) - return NULL; - - slave_port = &ds->ports[port]; - - if (unlikely(slave_port->type != DSA_PORT_TYPE_USER)) - return NULL; - - return slave_port->slave; + return NULL; } /* port.c */ diff --git a/net/dsa/master.c b/net/dsa/master.c index a8e52c9967f4..3255dfc97f86 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -310,8 +310,6 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } -static struct lock_class_key dsa_master_addr_list_lock_key; - int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -325,9 +323,6 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; - lockdep_set_class(&dev->addr_list_lock, - &dsa_master_addr_list_lock_key); - ret = dsa_master_ethtool_setup(dev); if (ret) return ret; diff --git a/net/dsa/port.c b/net/dsa/port.c index 9b54e5a76297..6e93c36bf0c0 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -561,7 +561,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) struct dsa_switch *ds = dp->ds; struct phy_device *phydev; int port = dp->index; - int mode; + phy_interface_t mode; int err; err = of_phy_register_fixed_link(dn); @@ -574,8 +574,8 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) phydev = of_phy_find_device(dn); - mode = of_get_phy_mode(dn); - if (mode < 0) + err = of_get_phy_mode(dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; phydev->interface = mode; @@ -593,10 +593,11 @@ static int dsa_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; struct device_node *port_dn = dp->dn; - int mode, err; + phy_interface_t mode; + int err; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + err = of_get_phy_mode(port_dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = ds->dev; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 75d58229a4bd..78ffc87dc25e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -789,6 +789,22 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, return phylink_ethtool_ksettings_set(dp->pl, cmd); } +static void dsa_slave_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + phylink_ethtool_get_pauseparam(dp->pl, pause); +} + +static int dsa_slave_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_set_pauseparam(dp->pl, pause); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) @@ -1192,6 +1208,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_pauseparam = dsa_slave_get_pauseparam, + .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, @@ -1295,11 +1313,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; + phy_interface_t mode; u32 phy_flags = 0; - int mode, ret; + int ret; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + ret = of_get_phy_mode(port_dn, &mode); + if (ret) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = &slave_dev->dev; @@ -1341,15 +1360,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } -static struct lock_class_key dsa_slave_netdev_xmit_lock_key; -static void dsa_slave_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, - &dsa_slave_netdev_xmit_lock_key); -} - int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1433,9 +1443,6 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); - netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, - NULL); - SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 6a9607518823..df4abe897ed6 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, int i; for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; + struct dsa_port *dp = dsa_to_port(ds, i); if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; @@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (unset_vlan_filtering) { struct switchdev_trans trans = {0}; - err = dsa_port_vlan_filtering(&ds->ports[info->port], + err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), false, &trans); if (err && err != EOPNOTSUPP) return err; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 9c1cc2482b68..73632d21f1a6 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -31,15 +31,14 @@ * Must be transmitted as zero and ignored on receive. * * SWITCH_ID - VID[8:6]: - * Index of switch within DSA tree. Must be between 0 and - * DSA_MAX_SWITCHES - 1. + * Index of switch within DSA tree. Must be between 0 and 7. * * RSV - VID[5:4]: * To be used for further expansion of PORT or for other purposes. * Must be transmitted as zero and ignored on receive. * * PORT - VID[3:0]: - * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. + * Index of switch port. Must be between 0 and 15. */ #define DSA_8021Q_DIR_SHIFT 10 @@ -103,7 +102,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) if (!dsa_is_user_port(ds, port)) return 0; - slave = ds->ports[port].slave; + slave = dsa_to_port(ds, port)->slave; err = br_vlan_get_pvid(slave, &pvid); if (err < 0) @@ -118,7 +117,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) return err; } - return dsa_port_vid_add(&ds->ports[port], pvid, vinfo.flags); + return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); } /* If @enabled is true, installs @vid with @flags into the switch port's HW @@ -130,7 +129,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { - struct dsa_port *dp = &ds->ports[port]; + struct dsa_port *dp = dsa_to_port(ds, port); struct bridge_vlan_info vinfo; int err; @@ -342,13 +341,4 @@ struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); -static const struct dsa_device_ops dsa_8021q_netdev_ops = { - .name = "8021q", - .proto = DSA_TAG_PROTO_8021Q, - .overhead = VLAN_HLEN, -}; - MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_8021Q); - -module_dsa_tag_driver(dsa_8021q_netdev_ops); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 17374afee28f..9040fe55e0f5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 3297e7fa9945..c0b107cdd715 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,13 +58,6 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; -static int lowpan_dev_init(struct net_device *ldev) -{ - netdev_lockdep_set_classes(ldev); - - return 0; -} - static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -96,7 +89,6 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { - .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index ffcfcef76291..7c5a1aa5adb4 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -236,21 +236,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, - nl802154_policy, NULL); - if (err) - goto out_unlock; - *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - genl_family_attrbuf(&nl802154_fam)); + info->attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -557,17 +550,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { - struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - tb, nl802154_fam.maxattr, - nl802154_policy, NULL); - - /* TODO check if we can handle error here, - * we have no backward compatibility - */ - if (ret) - return 0; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr **tb = info->attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); @@ -2203,7 +2187,8 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, @@ -2343,7 +2328,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_KEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, @@ -2369,7 +2355,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, @@ -2395,7 +2382,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, @@ -2420,7 +2408,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70f92aaca411..53de8e00990e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 9a0fe0c2fa02..4a8550c49202 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = jiffies; + inet->inet_id = prandom_u32(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index dde77f72e03e..71c78d223dfd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1148,7 +1148,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || - prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32) + (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include <net/netns/ipv4.h> #include <net/ip_fib.h> -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, @@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0913a090b2bf..f1888c683426 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1814,8 +1814,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); - int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ab2fb6bb37d..b9df9c09b84e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,11 +74,13 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -2015,10 +2017,12 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,39 +2036,53 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, + KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4298aae74e0e..18068ed42f25 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -249,10 +249,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -262,14 +263,14 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { credit--; rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -682,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 480d0b22db1a..3b9c7a2725a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) } } -static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a9183543ca30..e4c6e8b40490 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -716,7 +716,7 @@ static void reqsk_timer_handler(struct timer_list *t) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { @@ -906,7 +906,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, percpu_counter_inc(sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if @@ -915,7 +915,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ - tcp_sk(child)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } @@ -934,7 +934,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bbb005eb5218..af154977904c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), - .idiag_wmem = sk->sk_wmem_queued, + .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk->sk_forward_alloc, .idiag_tmem = sk_wmem_alloc_get(sk), }; @@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; @@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_timer.expires - jiffies; - if (tmo < 0) - tmo = 0; - inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 97824864e40d..83fb00153018 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score = sk->sk_family == PF_INET ? 2 : 1; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index be778599bfed..ff327a62c9ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base, base->total / inet_peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - delta = (__u32)jiffies - p->dtime; + + /* The READ_ONCE() pairs with the WRITE_ONCE() + * in inet_putpeer() + */ + delta = (__u32)jiffies - READ_ONCE(p->dtime); + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } @@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - p->dtime = (__u32)jiffies; + /* The WRITE_ONCE() pairs with itself (we run lockless) + * and the READ_ONCE() in inet_peer_gc() + */ + WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 52690bb3e40f..10636fb6093e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -509,9 +509,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) goto err_free_skb; - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (tun_info->options_len < sizeof(*md)) goto err_free_skb; + md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c59a78a267c3..24a95126e698 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -611,5 +611,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 28fca408812c..3d8baaaf7086 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -645,11 +645,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, - unsigned int ll_rs, unsigned int mtu, + unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); + state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; @@ -668,9 +669,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; - if (IPCB(from)->flags & IPSKB_FRAG_PMTU) - state->iph->frag_off |= htons(IP_DF); - /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -738,6 +736,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); + if (state->DF) + iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the @@ -771,6 +771,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; @@ -846,6 +847,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ip_fraglist_prepare(skb, &iter); } + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) @@ -881,7 +883,8 @@ slow_path: * Fragment the datagram. */ - ip_frag_init(skb, hlen, ll_rs, mtu, &state); + ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, + &state); /* * Keep copying data until we run out. @@ -900,6 +903,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + skb2->tstamp = tstamp; err = output(net, sk, skb2); if (err) goto fail; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1452a97914a0..c724fb30d048 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -34,6 +34,9 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; @@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) - return NULL; - res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); + src = &md->u.tun_info; + res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; - src = &md->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, @@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, dst->key.u.ipv4.dst = src->key.u.ipv4.src; dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; + ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), + src->options_len, 0); return res; } @@ -217,24 +221,199 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { + [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { + [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static const struct nla_policy +vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static int ip_tun_parse_opts_geneve(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + int data_len, err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, + geneve_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || + !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || + !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) + return -EINVAL; + + attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + if (info) { + struct geneve_opt *opt = ip_tunnel_info_opts(info); + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(attr); + attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(attr); + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int ip_tun_parse_opts_vxlan(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, + vxlan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) + return -EINVAL; + + if (info) { + struct vxlan_metadata *md = ip_tunnel_info_opts(info); + + attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; + md->gbp = nla_get_u32(attr); + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + } + + return sizeof(struct vxlan_metadata); +} + +static int ip_tun_parse_opts_erspan(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, + erspan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) + return -EINVAL; + + if (info) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_VER]; + md->version = nla_get_u8(attr); + + if (md->version == 1 && tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(attr); + } else if (md->version == 2 && tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] && + tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(attr); + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(attr)); + } else { + return -EINVAL; + } + + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + } + + return sizeof(struct erspan_metadata); +} + +static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPTS_MAX + 1]; + int err; + + if (!attr) + return 0; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPTS_MAX, attr, + ip_opts_policy, extack); + if (err) + return err; + + if (tb[LWTUNNEL_IP_OPTS_GENEVE]) + err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE], + info, extack); + else if (tb[LWTUNNEL_IP_OPTS_VXLAN]) + err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN], + info, extack); + else if (tb[LWTUNNEL_IP_OPTS_ERSPAN]) + err = ip_tun_parse_opts_erspan(tb[LWTUNNEL_IP_OPTS_ERSPAN], + info, extack); + else + err = -EINVAL; + + return err; +} + +static int ip_tun_get_optlen(struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, NULL, extack); +} + +static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, info, extack); +} + static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -242,6 +421,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { @@ -266,10 +451,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -285,6 +472,109 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) #endif } +static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct geneve_opt *opt; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); + if (!nest) + return -ENOMEM; + + opt = ip_tunnel_info_opts(tun_info); + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, nest); + return 0; +err: + nla_nest_cancel(skb, nest); + return -ENOMEM; +} + +static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nest; + int err = 0; + + if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + nest = nla_nest_start_noflag(skb, type); + if (!nest) + return -ENOMEM; + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + err = ip_tun_fill_encap_opts_geneve(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + err = ip_tun_fill_encap_opts_erspan(skb, tun_info); + + if (err) { + nla_nest_cancel(skb, nest); + return err; + } + + nla_nest_end(skb, nest); + return 0; +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -296,12 +586,47 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } +static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) +{ + int opt_len; + + if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_GENEVE */ + + nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + + nla_total_size(4); /* OPT_VXLAN_GBP */ + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + + nla_total_size(1) /* OPT_ERSPAN_VER */ + + (md->version == 1 ? nla_total_size(4) + /* OPT_ERSPAN_INDEX (v1) */ + : nla_total_size(1) + + nla_total_size(1)); + /* OPT_ERSPAN_DIR + HWID (v2) */ + } + + return opt_len; +} + static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ @@ -309,13 +634,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { - return memcmp(lwt_tun_info(a), lwt_tun_info(b), - sizeof(struct ip_tunnel_info)); + struct ip_tunnel_info *info_a = lwt_tun_info(a); + struct ip_tunnel_info *info_b = lwt_tun_info(b); + + return memcmp(info_a, info_b, sizeof(info_a->key)) || + info_a->mode != info_b->mode || + info_a->options_len != info_b->options_len || + memcmp(ip_tunnel_info_opts(info_a), + ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { @@ -341,17 +674,21 @@ static int ip6_tun_build_state(struct nlattr *attr, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -359,6 +696,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); @@ -375,10 +718,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -396,7 +741,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; @@ -409,7 +755,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9bcca08efec9..32e20b758b68 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1483,10 +1483,10 @@ static int __init ip_auto_config(void) * missing values. */ if (ic_myaddr == NONE || -#ifdef CONFIG_ROOT_NFS +#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT) (root_server_addr == NONE && ic_servaddr == NONE && - ROOT_DEV == Root_NFS) || + (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1513,6 +1513,12 @@ static int __init ip_auto_config(void) goto try_try_again; } #endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + pr_err("IP-Config: Retrying forever (CIFS root)...\n"); + goto try_try_again; + } +#endif if (--retries) { pr_err("IP-Config: Reopening network devices...\n"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 716d5472c022..440294bdb752 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -3040,10 +3042,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id, extack); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id, extack); + if (err) + return err; + } } return 0; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 012c4047c788..168b72e18be0 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -9,6 +9,8 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..c94445b44d8c 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, if (icmph == NULL) return 1; - switch (icmph->type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - break; - default: + if (!icmp_is_err(icmph->type)) return 1; - } inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 14654876127e..dcc4fa10138d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1482,7 +1482,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { - dst_dev_put(&orig->dst); + rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { @@ -1894,10 +1894,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, @@ -2470,14 +2467,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; - int err = -ENETUNREACH; + int err; if (fl4->saddr) { - rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) + ipv4_is_zeronet(fl4->saddr)) { + rth = ERR_PTR(-EINVAL); goto out; + } + + rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 535b69326f66..345b2b0ff618 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + u32 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; ireq = inet_rsk(req); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f98a1882e537..9b48aec29aca 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,7 +326,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +341,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) @@ -450,8 +450,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; @@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (tp->rcv_nxt - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -543,10 +543,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && - (state != TCP_SYN_RECV || tp->fastopen_rsk)) { + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == tp->copied_seq && + if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -584,7 +584,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; @@ -607,7 +607,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = tp->urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -616,7 +617,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -625,7 +626,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; @@ -657,7 +659,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; @@ -1032,10 +1034,10 @@ new_segment: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1362,7 +1364,7 @@ new_segment: if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1668,9 +1670,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -1699,7 +1701,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); @@ -1709,7 +1711,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) val <<= 1; if (val > sk->sk_rcvbuf) { - sk->sk_rcvbuf = val; + WRITE_ONCE(sk->sk_rcvbuf, val); tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); } return 0; @@ -1739,8 +1741,8 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; + u32 length = 0, seq, offset, zap_len; const skb_frag_t *frags = NULL; - u32 length = 0, seq, offset; struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; @@ -1767,12 +1769,12 @@ static int tcp_zerocopy_receive(struct sock *sk, seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); - zc->length &= ~(PAGE_SIZE - 1); - if (zc->length) { - zap_page_range(vma, address, zc->length); + zap_len = zc->length & ~(PAGE_SIZE - 1); + if (zap_len) { + zap_page_range(vma, address, zap_len); zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = inq; + zc->recv_skip_hint = zc->length; } ret = 0; while (length + PAGE_SIZE <= zc->length) { @@ -1819,7 +1821,7 @@ static int tcp_zerocopy_receive(struct sock *sk, out: up_read(¤t->mm->mmap_sem); if (length) { - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ @@ -1956,13 +1958,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping_internal tss; - bool has_tss = false; - bool has_cmsg; + int cmsg_flags; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -1972,7 +1973,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; - has_cmsg = tp->recvmsg_inq; + cmsg_flags = tp->recvmsg_inq ? 1 : 0; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2045,7 +2046,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -2117,7 +2118,7 @@ found_ok_skb: if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -2139,7 +2140,7 @@ found_ok_skb: } } - *seq += used; + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2155,8 +2156,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); - has_tss = true; - has_cmsg = true; + cmsg_flags |= 2; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -2166,7 +2166,7 @@ skip_copy: found_fin_ok: /* Process the FIN. */ - ++*seq; + WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; @@ -2181,10 +2181,10 @@ found_fin_ok: release_sock(sk); - if (has_cmsg) { - if (has_tss) + if (cmsg_flags) { + if (cmsg_flags & 2) tcp_recv_timestamp(msg, sk, &tss); - if (tp->recvmsg_inq) { + if (cmsg_flags & 1) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -2487,7 +2487,10 @@ adjudge_to_death: } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. @@ -2559,6 +2562,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2585,7 +2589,7 @@ int tcp_disconnect(struct sock *sk, int flags) __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); @@ -2601,9 +2605,12 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; - tp->write_seq += tp->max_window + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; @@ -2657,6 +2664,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -2930,9 +2938,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sk->sk_state != TCP_CLOSE) err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; + WRITE_ONCE(tp->write_seq, val); else if (tp->repair_queue == TCP_RECV_QUEUE) - tp->rcv_nxt = val; + WRITE_ONCE(tp->rcv_nxt, val); else err = -EINVAL; break; @@ -3215,8 +3223,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } @@ -3296,6 +3304,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3831,7 +3840,13 @@ EXPORT_SYMBOL(tcp_md5_hash_key); void tcp_done(struct sock *sk) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + /* We might be called with a new socket, after + * inet_csk_prepare_forced_close() has been called + * so we can not use lockdep_sock_is_held(sk) + */ + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 81a8221d650a..0d08f9e2d8d0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,13 +21,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3fd451271a70..19ad9586c720 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ tp = tcp_sk(child); - tp->fastopen_rsk = req; + rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never @@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return true; } - return cookie->len > 0; + if (cookie->len > 0) + return true; + tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; + return false; } /* This function checks if we want to defer sending SYN until the first diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3578357abe30..88b987ca9ebb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,8 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); + WRITE_ONCE(sk->sk_sndbuf, + min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -483,8 +484,9 @@ static void tcp_clamp_window(struct sock *sk) !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2]); + WRITE_ONCE(sk->sk_rcvbuf, + min(atomic_read(&sk->sk_rmem_alloc), + net->ipv4.sysctl_tcp_rmem[2])); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -648,7 +650,7 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, rcvwin * rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { - sk->sk_rcvbuf = rcvbuf; + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ tp->window_clamp = tcp_win_from_space(sk, rcvbuf); @@ -2666,7 +2668,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); - if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) && + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && tcp_try_undo_loss(sk, false)) return; @@ -2990,7 +2992,7 @@ void tcp_rearm_rto(struct sock *sk) /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { @@ -3362,7 +3364,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; - tp->rcv_nxt = seq; + WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. @@ -5356,7 +5358,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; @@ -5812,6 +5814,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ + if (tp->total_retrans) + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; + else + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; @@ -5932,7 +5938,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Ok.. it's good. Set up sequence numbers and * move to established. */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5961,7 +5967,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); @@ -6035,8 +6041,8 @@ discard: tp->tcp_header_len = sizeof(struct tcphdr); } - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6087,6 +6093,8 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct request_sock *req; + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ @@ -6096,7 +6104,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); + reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. * This is similar to the regular data transmission case @@ -6171,7 +6181,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; - req = tp->fastopen_rsk; + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); if (req) { bool req_stolen; @@ -6211,7 +6222,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bf124b1742df..92282f98dc82 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && - (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && - (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif @@ -164,9 +162,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) * without appearing to create any others. */ if (likely(!tp->repair)) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + u32 seq = tcptw->tw_snd_nxt + 65535 + 2; + + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } @@ -253,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; @@ -291,16 +291,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port); + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); tp->tsoffset = secure_tcp_ts_off(sock_net(sk), inet->inet_saddr, inet->inet_daddr); } - inet->inet_id = tp->write_seq ^ jiffies; + inet->inet_id = prandom_u32(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -478,7 +479,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -1447,7 +1448,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = newtp->write_seq ^ jiffies; + newinet->inet_id = prandom_u32(); if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); @@ -1644,7 +1645,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2121,7 +2122,7 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk); + BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); @@ -2450,17 +2451,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) state = inet_sk_state_load(sk); if (state == TCP_LISTEN) - rx_queue = sk->sk_ack_backlog; + rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), @@ -2677,7 +2679,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; - net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb140a5db8c0..c802bc80c400 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -462,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; + u32 seq; if (!newsk) return NULL; @@ -475,12 +476,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, /* Now setup tcp_sock */ newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; + seq = treq->rcv_isn + 1; + newtp->rcv_wup = seq; + WRITE_ONCE(newtp->copied_seq, seq); + WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); @@ -495,7 +500,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, @@ -541,7 +546,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fec6d67bfd14..be6d22b8190f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -67,7 +67,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); @@ -1196,10 +1196,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -1333,7 +1333,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; @@ -1443,7 +1443,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; - sk->sk_wmem_queued -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } @@ -1888,7 +1888,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return -ENOMEM; skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2152,7 +2152,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -2482,7 +2482,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; @@ -3142,7 +3142,7 @@ void tcp_send_fin(struct sock *sk) * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ - tp->snd_nxt++; + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { @@ -3222,7 +3222,7 @@ int tcp_send_synack(struct sock *sk) tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -3290,7 +3290,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { @@ -3426,14 +3426,14 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; @@ -3447,9 +3447,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); - tp->write_seq = tcb->end_seq; + WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } @@ -3586,11 +3586,11 @@ int tcp_connect(struct sock *sk) /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { - tp->snd_nxt = TCP_SKB_CB(buff)->seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 05be564414e9..dd5a6317a801 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -386,15 +386,13 @@ abort: tcp_write_err(sk); * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ -static void tcp_fastopen_synack_timer(struct sock *sk) +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); - struct request_sock *req; - req = tcp_sk(sk)->fastopen_rsk; req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { @@ -435,11 +433,14 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; - if (tp->fastopen_rsk) { + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - tcp_fastopen_synack_timer(sk); + tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 14bc654b6842..da805ee7558b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net, return -1; score += 4; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } @@ -1316,6 +1316,20 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } +static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) +{ + /* We come here after udp_lib_checksum_complete() returned 0. + * This means that __skb_checksum_complete() might have + * set skb->csum_valid to 1. + * On 64bit platforms, we can set csum_unnecessary + * to true, but only if the skb is not shared. + */ +#if BITS_PER_LONG == 64 + if (!skb_shared(skb)) + udp_skb_scratch(skb)->csum_unnecessary = true; +#endif +} + static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; @@ -1550,10 +1564,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, *total += skb->truesize; kfree_skb(skb); } else { - /* the csum related bits could be changed, refresh - * the scratch area - */ - udp_set_dev_scratch(skb); + udp_skb_csum_unnecessary_set(skb); break; } } @@ -1577,7 +1588,7 @@ static int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); - if (!skb && !skb_queue_empty(sk_queue)) { + if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); @@ -1650,7 +1661,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, return skb; } - if (skb_queue_empty(sk_queue)) { + if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } @@ -1676,7 +1687,7 @@ busy_check: break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (!skb_queue_empty(sk_queue)); + } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && @@ -2523,9 +2534,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_ENCAP: switch (val) { case 0: +#ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; +#endif /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; @@ -2712,7 +2725,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 34ccef18b40e..98d82305d6de 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5552,14 +5552,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) goto nla_put_failure; - - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) - goto nla_put_failure; - read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 783f3c1466da..2fc079284ca4 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <net/ipv6.h> #include <net/ipv6_stubs.h> +#include <net/addrconf.h> #include <net/ip.h> /* if ipv6 module registers this function is used by xfrm to force all diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 05f82baaa99e..f87ae33e1d01 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -7,12 +7,12 @@ #include <net/netns/ipv6.h> #include <net/ip6_fib.h> -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, @@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net) return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } -static int fib6_dump(struct net *net, struct notifier_block *nb) +static int fib6_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib6_rules_dump(net, nb); + err = fib6_rules_dump(net, nb, extack); if (err) return err; - return fib6_tables_dump(net, nb); + return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f9e8fe3ff0c5..fafe556d21e0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); -int fib6_rules_dump(struct net *net, struct notifier_block *nb) +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET6); + return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 62c997201970..ef408a5090a2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -516,13 +516,29 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, mip6_addr_swap(skb); + sk = icmpv6_xmit_lock(net); + if (!sk) + goto out_bh_enable; + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; - if (saddr) + if (saddr) { fl6.saddr = *saddr; + } else { + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + + in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, + &fl6.saddr); + dev_put(in_netdev); + } + } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; @@ -531,10 +547,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - sk = icmpv6_xmit_lock(net); - if (!sk) - goto out_bh_enable; - sk->sk_mark = mark; np = inet6_sk(sk); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index cf60fae9533b..fbe9d4295eac 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -105,7 +105,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score = 1; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e2af411cd9c..f66bc2af4e9d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -357,15 +357,17 @@ unsigned int fib6_tables_seq_read(struct net *net) return fib_seq; } -static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib6_info *rt) + struct fib6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; - return call_fib6_notifier(nb, net, event_type, &info.info); + return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, @@ -401,40 +403,51 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_dump_arg { struct net *net; struct notifier_block *nb; + struct netlink_ext_ack *extack; }; -static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) - return; - call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); + return 0; + return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, + rt, arg->extack); } static int fib6_node_dump(struct fib6_walker *w) { struct fib6_info *rt; + int err = 0; - for_each_fib6_walker_rt(w) - fib6_rt_dump(rt, w->args); + for_each_fib6_walker_rt(w) { + err = fib6_rt_dump(rt, w->args); + if (err) + break; + } w->leaf = NULL; - return 0; + return err; } -static void fib6_table_dump(struct net *net, struct fib6_table *tb, - struct fib6_walker *w) +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) { + int err; + w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); - fib6_walk(net, w); + err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); + return err; } /* Called with rcu_read_lock() */ -int fib6_tables_dump(struct net *net, struct notifier_block *nb) +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; + int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) @@ -443,19 +456,24 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) w->func = fib6_node_dump; arg.net = net; arg.nb = nb; + arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib6_table_dump(net, tb, w); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err < 0) + goto out; + } } +out: kfree(w); - return 0; + return err; } static int fib6_dump_node(struct fib6_walker *w) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d5779d6a6065..923034c52ce4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -980,9 +980,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, dsfield = key->tos; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) goto tx_err; - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (tun_info->options_len < sizeof(*md)) goto tx_err; + md = ip_tunnel_info_opts(tun_info); tun_id = tunnel_id_to_key32(key->tun_id); if (md->version == 1) { @@ -2192,6 +2192,7 @@ static void ip6erspan_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6erspan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 3d71c7d6102c..ef7f707d9ae3 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -325,7 +325,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip6_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index edadee4a7e76..71827b56c006 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -768,6 +768,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, inet6_sk(skb->sk) : NULL; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; + ktime_t tstamp = skb->tstamp; int hroom, err = 0; __be32 frag_id; u8 *prevhdr, nexthdr = 0; @@ -855,6 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -913,6 +915,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + frag->tstamp = tstamp; err = output(net, sk, frag); if (err) goto fail; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 857a89ad4d6c..bfa49ff70531 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) @@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net) return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } -static int ip6mr_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock); + ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index a9bff556d3b2..409e79b84a83 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -119,6 +119,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; unsigned int mtu, hlen; @@ -183,6 +184,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -215,6 +217,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f6d9a48c7a2a..f069bc0dc056 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -10,6 +10,8 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 34d51cd426b0..6bac68fb27a3 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -150,4 +150,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); -MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); +MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support"); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a63ff85fe141..edcb52543518 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -621,6 +621,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; + unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; @@ -639,6 +640,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh) nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); + last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { @@ -654,13 +656,15 @@ static void rt6_probe(struct fib6_nh *fib6_nh) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, fib6_nh->last_probe + + } else if (time_after(jiffies, last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } - if (work) { - fib6_nh->last_probe = jiffies; + if (!work || cmpxchg(&fib6_nh->last_probe, + last_probe, jiffies) != last_probe) { + kfree(work); + } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -1475,11 +1479,11 @@ static u32 rt6_exception_hash(const struct in6_addr *dst, u32 val; net_get_random_once(&seed, sizeof(seed)); - val = jhash(dst, sizeof(*dst), seed); + val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash(src, sizeof(*src), val); + val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val); #endif return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } @@ -2291,10 +2295,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && - icmph->icmp6_type != ICMPV6_PKT_TOOBIG && - icmph->icmp6_type != ICMPV6_TIME_EXCEED && - icmph->icmp6_type != ICMPV6_PARAMPROB) + if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, @@ -3383,6 +3384,9 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, int err; fib6_nh->fib_nh_family = AF_INET6; +#ifdef CONFIG_IPV6_ROUTER_PREF + fib6_nh->last_probe = jiffies; +#endif err = -ENODEV; if (cfg->fc_ifindex) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e3d9f4559c99..81f51335e326 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -215,7 +215,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; @@ -311,10 +311,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); + WRITE_ONCE(tp->write_seq, + secure_tcpv6_seq(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk), np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); @@ -406,7 +407,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -1890,12 +1891,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) state = inet_sk_state_load(sp); if (state == TCP_LISTEN) - rx_queue = sp->sk_ack_backlog; + rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " @@ -1906,7 +1908,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6324d3a8cb53..9fec580c968e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -135,7 +135,7 @@ static int compute_score(struct sock *sk, struct net *net, return -1; score++; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index fd5ac2788e45..d3b520b9b2c9 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,7 +56,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); - netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 2017b7d780f5..2922d4150d88 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -113,22 +113,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. + * + * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); - int rc = 0; if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); + int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); + if (rc) { + kfree_skb(skb); + return rc; + } } - if (unlikely(!rc)) - rc = llc_build_and_send_pkt(sk, skb); - return rc; + return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) @@ -701,7 +705,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: @@ -776,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -899,7 +903,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb = NULL; size_t size = 0; int rc = -EINVAL, copied = 0, hdrlen; @@ -908,10 +912,10 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) - goto release; + goto out; } else { if (llc_ui_addr_null(&llc->addr)) - goto release; + goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ @@ -919,7 +923,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) - goto release; + goto out; } hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); size = hdrlen + len; @@ -928,12 +932,12 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) copied = size - hdrlen; rc = -EINVAL; if (copied < 0) - goto release; + goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) - goto release; + goto out; skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); @@ -943,29 +947,31 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); + skb = NULL; out: - if (rc) { - kfree_skb(skb); -release: + kfree_skb(skb); + if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); - } release_sock(sk); return rc ? : copied; } diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 4d78375f9872..647c0554d04c 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 4ff89cb7c86f..7b620acaca9e 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); +static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -55,6 +55,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. + * + * This function always consumes a reference to the skb. */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { @@ -62,12 +64,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); - /* - * We have to hold the skb, because llc_conn_service will kfree it in - * the sending path and we need to look at the skb->cb, where we encode - * llc_conn_state_ev. - */ - skb_get(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine @@ -75,21 +71,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); - goto out_kfree_skb; - } - - if (unlikely(!ev->ind_prim && !ev->cfm_prim)) { - /* indicate or confirm not required */ - if (!skb->next) - goto out_kfree_skb; goto out_skb_put; } - if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */ - skb_get(skb); - switch (ev->ind_prim) { case LLC_DATA_PRIM: + skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* @@ -106,6 +93,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ + skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; @@ -121,7 +109,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) sk->sk_state_change(sk); } } - kfree_skb(skb); sock_put(sk); break; case LLC_RESET_PRIM: @@ -130,14 +117,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); - kfree_skb(skb); break; default: - if (ev->ind_prim) { + if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); - kfree_skb(skb); - } /* No indication */ break; } @@ -179,25 +163,22 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: - if (ev->cfm_prim) { + if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); - break; - } - goto out_skb_put; /* No confirmation */ + /* No confirmation */ + break; } -out_kfree_skb: - kfree_skb(skb); out_skb_put: kfree_skb(skb); return rc; } -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - return llc_conn_send_pdus(sk, skb); + llc_conn_send_pdus(sk); } /** @@ -255,7 +236,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -296,7 +277,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -340,16 +321,12 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection - * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. When @hold_skb is - * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent - * successfully, or 1 for failure. + * Sends queued pdus to MAC layer for transmission. */ -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) +static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; - int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -361,20 +338,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - dev_queue_xmit(skb2); - } else { - bool is_target = skb == hold_skb; - int rc; - - if (is_target) - skb_get(skb); - rc = dev_queue_xmit(skb); - if (is_target) - ret = rc; + skb = skb2; } + dev_queue_xmit(skb); } - - return ret; } /** @@ -846,7 +813,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 8db03c2d5440..ad6547736c21 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -38,6 +38,8 @@ * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. + * + * This function always consumes a reference to the skb. */ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) { @@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); if (unlikely(llc->state == LLC_CONN_STATE_ADM)) - goto out; + goto out_free; rc = -EBUSY; if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ llc->p_flag)) { llc->failed_data_req = 1; - goto out; + goto out_free; } ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DATA_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb->dev = llc->dev; - rc = llc_conn_state_process(sk, skb); -out: + return llc_conn_state_process(sk, skb); + +out_free: + kfree_skb(skb); return rc; } diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index a94bd56bcac6..7ae4cc684d3a 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -58,8 +58,10 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -81,8 +83,10 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -135,8 +139,10 @@ int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_test_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index a7f7b8ff4729..be419062e19a 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -197,29 +197,22 @@ out: * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. + * + * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - /* - * We have to hold the skb, because llc_sap_next_state - * will kfree it in the sending path and we need to - * look at the skb->cb, where we encode llc_sap_state_ev. - */ - skb_get(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); - if (ev->ind_cfm_flag == LLC_IND) { - if (skb->sk->sk_state == TCP_LISTEN) - kfree_skb(skb); - else { - llc_save_primitive(skb->sk, skb, ev->prim); - /* queue skb to the user. */ - if (sock_queue_rcv_skb(skb->sk, skb)) - kfree_skb(skb); - } + if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { + llc_save_primitive(skb->sk, skb, ev->prim); + + /* queue skb to the user. */ + if (sock_queue_rcv_skb(skb->sk, skb) == 0) + return; } kfree_skb(skb); } diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b11883d26875..33da6f738c99 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret) { + if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + /* + * We didn't send the request yet, so don't need to check + * here if we already got a response, just mark as driver + * ready immediately. + */ + set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state); + } else if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 70739e746c13..4fb7f1f12109 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3428,7 +3428,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); + 1, 0x40, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 0a6ff01c68a9..d40744903fa9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -538,7 +538,6 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; - int err, changed = 0; sdata_assert_lock(sdata); @@ -560,13 +559,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) ifibss->chandef = sdata->csa_chandef; /* generate the beacon */ - err = ieee80211_ibss_csa_beacon(sdata, NULL); - if (err < 0) - return err; - - changed |= err; - - return changed; + return ieee80211_ibss_csa_beacon(sdata, NULL); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index aba094b4ccfc..2d05c4cfaf6d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1292,8 +1292,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_remove_interfaces(local); fail_rate: rtnl_unlock(); - ieee80211_led_exit(local); fail_flows: + ieee80211_led_exit(local); destroy_workqueue(local->workqueue); fail_workqueue: wiphy_unregister(local->hw.wiphy); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 26a2f49208b6..5fa13176036f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2633,7 +2633,8 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, rcu_read_lock(); ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID); - if (WARN_ON_ONCE(ssid == NULL)) + if (WARN_ONCE(!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN, + "invalid SSID element (len=%d)", ssid ? ssid[1] : -1)) ssid_len = 0; else ssid_len = ssid[1]; @@ -3185,15 +3186,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, - struct ieee80211_mgmt *mgmt, size_t len) + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee802_11_elems *elems) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u8 *pos; u16 capab_info, aid; - struct ieee802_11_elems elems; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; @@ -3221,19 +3221,15 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ifmgd->broken_ap = true; } - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); - - if (!elems.supp_rates) { + if (!elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); return false; } ifmgd->aid = aid; ifmgd->tdls_chan_switch_prohibited = - elems.ext_capab && elems.ext_capab_len >= 5 && - (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); + elems->ext_capab && elems->ext_capab_len >= 5 && + (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their @@ -3242,11 +3238,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems.wmm_param) || + if ((assoc_data->wmm && !elems->wmm_param) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.ht_cap_elem || !elems.ht_operation)) || + (!elems->ht_cap_elem || !elems->ht_operation)) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation))) { + (!elems->vht_cap_elem || !elems->vht_operation))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3264,8 +3260,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, mgmt->bssid, assoc_data->bss->bssid); if (assoc_data->wmm && - !elems.wmm_param && bss_elems.wmm_param) { - elems.wmm_param = bss_elems.wmm_param; + !elems->wmm_param && bss_elems.wmm_param) { + elems->wmm_param = bss_elems.wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } @@ -3274,27 +3270,27 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ - if (!elems.ht_cap_elem && bss_elems.ht_cap_elem && + if (!elems->ht_cap_elem && bss_elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_cap_elem = bss_elems.ht_cap_elem; + elems->ht_cap_elem = bss_elems.ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } - if (!elems.ht_operation && bss_elems.ht_operation && + if (!elems->ht_operation && bss_elems.ht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_operation = bss_elems.ht_operation; + elems->ht_operation = bss_elems.ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems.vht_cap_elem && bss_elems.vht_cap_elem && + if (!elems->vht_cap_elem && bss_elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_cap_elem = bss_elems.vht_cap_elem; + elems->vht_cap_elem = bss_elems.vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } - if (!elems.vht_operation && bss_elems.vht_operation && + if (!elems->vht_operation && bss_elems.vht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_operation = bss_elems.vht_operation; + elems->vht_operation = bss_elems.vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } @@ -3305,7 +3301,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * they should be present here. This is just a safety net. */ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) { + (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; @@ -3313,7 +3309,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation)) { + (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; @@ -3340,7 +3336,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - (!elems.he_cap || !elems.he_operation)) { + (!elems->he_cap || !elems->he_operation)) { mutex_unlock(&sdata->local->sta_mtx); sdata_info(sdata, "HE AP is missing HE capability/operation\n"); @@ -3349,23 +3345,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } /* Set up internal HT/VHT capabilities */ - if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems.ht_cap_elem, sta); + elems->ht_cap_elem, sta); - if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems.vht_cap_elem, sta); + elems->vht_cap_elem, sta); - if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - elems.he_cap) { + if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, - elems.he_cap, - elems.he_cap_len, + elems->he_cap, + elems->he_cap_len, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3373,14 +3369,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (bss_conf->he_support) { bss_conf->bss_color = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->htc_trig_based_pkt_ext = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->multi_sta_back_32bit = @@ -3391,12 +3387,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_ACK_EN; - bss_conf->uora_exists = !!elems.uora_element; - if (elems.uora_element) - bss_conf->uora_ocw_range = elems.uora_element[0]; + bss_conf->uora_exists = !!elems->uora_element; + if (elems->uora_element) + bss_conf->uora_ocw_range = elems->uora_element[0]; - ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems.he_operation); - ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems.he_spr); + ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); + ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } @@ -3420,11 +3416,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ - if (elems.opmode_notif && - !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { + if (elems->opmode_notif && + !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; - nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; sta->sta.rx_nss = nss; @@ -3439,7 +3435,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.mfp = false; } - sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; + sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -3467,9 +3463,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); - } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len, - elems.mu_edca_param_set)) { + } else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3483,11 +3479,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; - if (elems.max_idle_period_ie) { + if (elems->max_idle_period_ie) { bss_conf->max_idle_period = - le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = - !!(elems.max_idle_period_ie->idle_options & + !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); changed |= BSS_CHANGED_KEEP_ALIVE; } else { @@ -3597,7 +3593,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { + if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, bss); @@ -5233,7 +5229,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); - if (!ssidie) { + if (!ssidie || ssidie[1] > sizeof(assoc_data->ssid)) { rcu_read_unlock(); kfree(assoc_data); return -EINVAL; diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ee86c3333999..86bc469a28bc 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -70,7 +70,7 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) } /* return current EMWA throughput */ -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg) { int usecs; @@ -79,13 +79,13 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) usecs = 1000000; /* reset thr. below 10% success */ - if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100)) + if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100)) return 0; - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); else - return MINSTREL_TRUNC(100000 * (prob_ewma / usecs)); + return MINSTREL_TRUNC(100000 * (prob_avg / usecs)); } /* find & sort topmost throughput rates */ @@ -98,8 +98,8 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) for (j = MAX_THR_RATES; j > 0; --j) { tmp_mrs = &mi->r[tp_list[j - 1]].stats; - if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) <= - minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma)) + if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <= + minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg)) break; } @@ -157,20 +157,24 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * Recalculate statistics and counters of a given rate */ void -minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) +minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); - if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = cur_prob; + if (mp->new_avg) { + minstrel_filter_avg_add(&mrs->prob_avg, + &mrs->prob_avg_1, cur_prob); + } else if (unlikely(!mrs->att_hist)) { + mrs->prob_avg = cur_prob; } else { /*update exponential weighted moving avarage */ - mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - cur_prob, - EWMA_LEVEL); + mrs->prob_avg = minstrel_ewma(mrs->prob_avg, + cur_prob, + EWMA_LEVEL); } mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; @@ -200,12 +204,12 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; /* Update statistics of success probability per rate */ - minstrel_calc_rate_stats(mrs); + minstrel_calc_rate_stats(mp, mrs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || - mrs->prob_ewma < MINSTREL_FRAC(10, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || + mrs->prob_avg < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -225,14 +229,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is chosen as max_prob_rate */ - if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) { - tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma); + if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) { + tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg); tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], - tmp_mrs->prob_ewma); + tmp_mrs->prob_avg); if (tmp_cur_tp >= tmp_prob_tp) tmp_prob_rate = i; } else { - if (mrs->prob_ewma >= tmp_mrs->prob_ewma) + if (mrs->prob_avg >= tmp_mrs->prob_avg) tmp_prob_rate = i; } } @@ -290,7 +294,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->sample_deferred--; if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval * HZ) / 1000)) + mp->update_interval / (mp->new_avg ? 2 : 1))) minstrel_update_stats(mp, mi); } @@ -422,7 +426,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -573,7 +577,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 51d8b2c846e7..dbb43bcd3c45 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -19,6 +19,21 @@ #define MAX_THR_RATES 4 /* + * Coefficients for moving average with noise filter (period=16), + * scaled by 10 bits + * + * a1 = exp(-pi * sqrt(2) / period) + * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) + * coeff3 = -sqr(a1) + * coeff1 = 1 - coeff2 - coeff3 + */ +#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ + MINSTREL_AVG_COEFF2 - \ + MINSTREL_AVG_COEFF3) +#define MINSTREL_AVG_COEFF2 0x00001499 +#define MINSTREL_AVG_COEFF3 -0x0000092e + +/* * Perform EWMA (Exponentially Weighted Moving Average) calculation */ static inline int @@ -32,6 +47,37 @@ minstrel_ewma(int old, int new, int weight) return old + incr; } +static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) +{ + s32 out_1 = *prev_1; + s32 out_2 = *prev_2; + s32 val; + + if (!in) + in += 1; + + if (!out_1) { + val = out_1 = in; + goto out; + } + + val = MINSTREL_AVG_COEFF1 * in; + val += MINSTREL_AVG_COEFF2 * out_1; + val += MINSTREL_AVG_COEFF3 * out_2; + val >>= MINSTREL_SCALE; + + if (val > 1 << MINSTREL_SCALE) + val = 1 << MINSTREL_SCALE; + if (val < 0) + val = 1; + +out: + *prev_2 = out_1; + *prev_1 = val; + + return val; +} + struct minstrel_rate_stats { /* current / last sampling period attempts/success counters */ u16 attempts, last_attempts; @@ -40,8 +86,9 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u32 att_hist, succ_hist; - /* prob_ewma - exponential weighted moving average of prob */ - u16 prob_ewma; + /* prob_avg - moving average of prob */ + u16 prob_avg; + u16 prob_avg_1; /* maximum retry counts */ u8 retry_count; @@ -95,6 +142,7 @@ struct minstrel_sta_info { struct minstrel_priv { struct ieee80211_hw *hw; bool has_mrr; + bool new_avg; u32 sample_switch; unsigned int cw_min; unsigned int cw_max; @@ -126,8 +174,9 @@ extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); /* Recalculate success probabilities and counters for a given rate using EWMA */ -void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); +void minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs); +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg); /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index c8afd85b51a0..9b8e0daeb7bb 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -90,8 +90,8 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%6u ", mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -147,8 +147,8 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) p += sprintf(p, "%u,",mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 0ef2633349b5..694a31978a04 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -346,12 +346,12 @@ minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma) + int prob_avg) { unsigned int nsecs = 0; /* do not account throughput if sucess prob is below 10% */ - if (prob_ewma < MINSTREL_FRAC(10, 100)) + if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (group != MINSTREL_CCK_GROUP) @@ -365,11 +365,11 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) / nsecs)); else - return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs)); + return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs)); } /* @@ -389,13 +389,13 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, cur_group = index / MCS_GROUP_RATES; cur_idx = index % MCS_GROUP_RATES; - cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma; + cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || @@ -432,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from @@ -444,11 +444,11 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; - if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, - mrs->prob_ewma); + mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; @@ -458,9 +458,9 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { - if (mrs->prob_ewma > tmp_prob) + if (mrs->prob_avg > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > max_gpr_prob) + if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -482,12 +482,12 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { @@ -518,7 +518,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { @@ -623,7 +623,7 @@ minstrel_ht_rate_sample_switch(struct minstrel_priv *mp, * If that fails, look again for a rate that is at least as fast */ mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); - faster_rate = mrs->prob_ewma > MINSTREL_FRAC(75, 100); + faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100); minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate); if (!n_rates && faster_rate) minstrel_ht_find_probe_rates(mi, rates, &n_rates, false); @@ -737,8 +737,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mrs = &mg->rates[i]; mrs->retry_updated = false; - minstrel_calc_rate_stats(mrs); - cur_prob = mrs->prob_ewma; + minstrel_calc_rate_stats(mp, mrs); + cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; @@ -773,6 +773,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, /* try to sample all available rates during each interval */ mi->sample_count *= 8; + if (mp->new_avg) + mi->sample_count /= 2; if (sample) minstrel_ht_rate_sample_switch(mp, mi); @@ -889,6 +891,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL; struct minstrel_priv *mp = priv; + u32 update_interval = mp->update_interval / 2; bool last, update = false; bool sample_status = false; int i; @@ -943,6 +946,10 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, switch (mi->sample_mode) { case MINSTREL_SAMPLE_IDLE: + if (mp->new_avg && + (mp->hw->max_rates > 1 || + mi->total_packets_cur < SAMPLE_SWITCH_THR)) + update_interval /= 2; break; case MINSTREL_SAMPLE_ACTIVE: @@ -970,23 +977,20 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, */ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && - MINSTREL_FRAC(rate->success, rate->attempts) < - MINSTREL_FRAC(20, 100)) { + rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && - MINSTREL_FRAC(rate2->success, rate2->attempts) < - MINSTREL_FRAC(20, 100)) { + rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } - if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval / 2 * HZ) / 1000)) { + if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi, true); } @@ -1008,7 +1012,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) { + if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; @@ -1065,7 +1069,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { + if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; @@ -1099,11 +1103,11 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, } static inline int -minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate) +minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = rate / MCS_GROUP_RATES; rate %= MCS_GROUP_RATES; - return mi->groups[group].rates[rate].prob_ewma; + return mi->groups[group].rates[rate].prob_avg; } static int @@ -1115,7 +1119,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ - if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) + if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; @@ -1138,7 +1142,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || - (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < + (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; @@ -1243,7 +1247,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * rate, to avoid wasting airtime. */ sample_dur = minstrel_get_duration(sample_idx); - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) return -1; @@ -1666,7 +1670,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) mp->has_mrr = true; mp->hw = hw; - mp->update_interval = 100; + mp->update_interval = HZ / 10; + mp->new_avg = true; #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; @@ -1674,6 +1679,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) &mp->fixed_rate_idx); debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir, &mp->sample_switch); + debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir, + &mp->new_avg); #endif minstrel_ht_init_cck_rates(mp); @@ -1698,7 +1705,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) i = mi->max_tp_rate[0] / MCS_GROUP_RATES; j = mi->max_tp_rate[0] % MCS_GROUP_RATES; - prob = mi->groups[i].rates[j].prob_ewma; + prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index f938701e7ab7..53ea3c29debf 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -119,6 +119,6 @@ struct minstrel_ht_sta_priv { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma); + int prob_avg); #endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5a6e9f3edc04..bebb71917742 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -98,8 +98,8 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%6u ", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -243,8 +243,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%u,", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 768d14c9a716..0e05ff037672 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3467,9 +3467,18 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP): /* process for all: mesh, mlme, ibss */ break; + case cpu_to_le16(IEEE80211_STYPE_DEAUTH): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + + /* process only for station/IBSS */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC) + return RX_DROP_MONITOR; + break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): - case cpu_to_le16(IEEE80211_STYPE_DEAUTH): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index adf94ba1ed77..4d31d9688dc2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -520,10 +520,33 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, return 0; } +static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *sdata_iter; + + if (!ieee80211_is_radar_required(local)) + return true; + + if (!regulatory_pre_cac_allowed(local->hw.wiphy)) + return false; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata_iter, &local->interfaces, list) { + if (sdata_iter->wdev.cac_started) { + mutex_unlock(&local->iflist_mtx); + return false; + } + } + mutex_unlock(&local->iflist_mtx); + + return true; +} + static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - if (ieee80211_is_radar_required(local)) + if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) @@ -630,7 +653,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, lockdep_assert_held(&local->mtx); - if (local->scan_req || ieee80211_is_radar_required(local)) + if (local->scan_req) + return -EBUSY; + + if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index bd11fef2139f..8d3a2389b055 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2457,7 +2457,8 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); - if (time_after(stats->last_rx, sta->status_stats.last_ack)) + if (!sta->status_stats.last_ack || + time_after(stats->last_rx, sta->status_stats.last_ack)) return stats->last_rx; return sta->status_stats.last_ack; } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1fa422782905..db38be1b75fa 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1617,7 +1617,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { @@ -1679,7 +1679,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; - control.sta = sta; + control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); @@ -1698,7 +1698,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; - struct ieee80211_sta *pubsta; struct sk_buff *skb; bool result = true; __le16 fc; @@ -1713,11 +1712,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, if (sta && !sta->uploaded) sta = NULL; - if (sta) - pubsta = &sta->sta; - else - pubsta = NULL; - switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { @@ -1744,8 +1738,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } - result = ieee80211_tx_frags(local, vif, pubsta, skbs, - txpending); + result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); ieee80211_tpt_led_trig_tx(local, fc, led_len); @@ -2424,6 +2417,33 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_store_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, + u32 *info_flags) +{ + struct sk_buff *ack_skb = skb_clone_sk(skb); + u16 info_id = 0; + + if (ack_skb) { + unsigned long flags; + int id; + + spin_lock_irqsave(&local->ack_status_lock, flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x40, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (id >= 0) { + info_id = id; + *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + } else { + kfree_skb(ack_skb); + } + } + + return info_id; +} + /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for @@ -2717,26 +2737,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *ack_skb = skb_clone_sk(skb); - - if (ack_skb) { - unsigned long flags; - int id; - - spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (id >= 0) { - info_id = id; - info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else { - kfree_skb(ack_skb); - } - } - } + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags); /* * If the skb is shared we need to obtain our own copy. @@ -3529,7 +3531,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); - ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return true; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4fc075b612fe..5e9b2eb24349 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -120,7 +120,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o -nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_offload.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5d5bdf450091..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + INIT_LIST_HEAD(&sublist); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + ret = nf_hook_slow(skb, state, e, 0); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + /* Put passed packets back on main list */ + list_splice(&sublist, head); +} +EXPORT_SYMBOL(nf_hook_slow_list); + /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 063df74b4647..1abd6f0dc227 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } #ifndef IP_SET_BITMAP_STORED_TIMEOUT -static inline bool +static bool mtype_is_filled(const struct mtype_elem *x) { return true; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 11ff9d4a7006..abe8f77d7d23 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem { u16 id; }; -static inline u32 +static u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; @@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, size_t dsize) { @@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, htonl(map->first_ip + id * map->hosts)); } -static inline int +static int bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, return true; } +static u32 +range_to_mask(u32 from, u32 to, u8 *bits) +{ + u32 mask = 0xFFFFFFFE; + + *bits = 32; + while (--(*bits) > 0 && mask && (to & mask) != from) + mask <<= 1; + + return mask; +} + static int bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1d4e63326e68..b618713297da 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -65,7 +65,7 @@ struct bitmap_ipmac_elem { unsigned char filled; } __aligned(__alignof__(u64)); -static inline u32 +static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; @@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { @@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return -EAGAIN; } -static inline int +static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; @@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, @@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, return 0; } -static inline int +static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { @@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, return IPSET_ADD_STORE_PLAIN_TIMEOUT; } -static inline int +static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { @@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } -static inline int +static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 704a0dda1609..23d6095cb196 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -46,7 +46,7 @@ struct bitmap_port_adt_elem { u16 id; }; -static inline u16 +static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; @@ -54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port) /* Common functions */ -static inline int +static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { @@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, htons(map->first_port + id)); } -static inline int +static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } +static bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} + static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e64d5f9a89dd..169e0a04f814 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,7 +35,7 @@ struct ip_set_net { static unsigned int ip_set_net_id __read_mostly; -static inline struct ip_set_net *ip_set_pernet(struct net *net) +static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } @@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); * serialized by ip_set_type_mutex. */ -static inline void +static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } -static inline void +static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); @@ -277,7 +277,7 @@ ip_set_free(void *members) } EXPORT_SYMBOL_GPL(ip_set_free); -static inline bool +static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; @@ -296,7 +296,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -314,7 +315,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -325,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +static u32 +ip_set_timeout_get(const unsigned long *timeout) +{ + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; +} + +static char * +ip_set_comment_uget(struct nlattr *tb) +{ + return nla_data(tb); +} + +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. + */ +void +ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext) +{ + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); + size_t len = ext->comment ? strlen(ext->comment) : 0; + + if (unlikely(c)) { + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); + } + if (!len) + return; + if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) + len = IPSET_MAX_COMMENT_SIZE; + c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) + return; + strlcpy(c->str, ext->comment, len + 1); + set->ext_size += sizeof(*c) + strlen(c->str) + 1; + rcu_assign_pointer(comment->c, c); +} +EXPORT_SYMBOL_GPL(ip_set_init_comment); + +/* Used only when dumping a set, protected by rcu_read_lock() */ +static int +ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) +{ + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); + + if (!c) + return 0; + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); +} + +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. + */ +static void +ip_set_comment_free(struct ip_set *set, void *ptr) +{ + struct ip_set_comment *comment = ptr; + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) + return; + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); +} + typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ @@ -351,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), - .destroy = (destroyer) ip_set_comment_free, + .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); -static inline bool +static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? @@ -446,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +static u64 +ip_set_get_bytes(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->bytes); +} + +static u64 +ip_set_get_packets(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->packets); +} + +static bool +ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) +{ + return nla_put_net64(skb, IPSET_ATTR_BYTES, + cpu_to_be64(ip_set_get_bytes(counter)), + IPSET_ATTR_PAD) || + nla_put_net64(skb, IPSET_ATTR_PACKETS, + cpu_to_be64(ip_set_get_packets(counter)), + IPSET_ATTR_PAD); +} + +static bool +ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) +{ + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask), + IPSET_ATTR_PAD)) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); +} + int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) @@ -471,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +static bool +ip_set_match_counter(u64 counter, u64 match, u8 op) +{ + switch (op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == match; + case IPSET_COUNTER_NE: + return counter != match; + case IPSET_COUNTER_LT: + return counter < match; + case IPSET_COUNTER_GT: + return counter > match; + } + return false; +} + +static void +ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) +{ + atomic64_add((long long)bytes, &(counter)->bytes); +} + +static void +ip_set_add_packets(u64 packets, struct ip_set_counter *counter) +{ + atomic64_add((long long)packets, &(counter)->packets); +} + +static void +ip_set_update_counter(struct ip_set_counter *counter, + const struct ip_set_ext *ext, u32 flags) +{ + if (ext->packets != ULLONG_MAX && + !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { + ip_set_add_bytes(ext->bytes, counter); + ip_set_add_packets(ext->packets, counter); + } +} + +static void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbinfo = *skbinfo; +} + bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) @@ -506,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions); * The set behind an index may change by swapping only, from userspace. */ -static inline void +static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -514,7 +682,7 @@ __ip_set_get(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -static inline void +static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -526,7 +694,7 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -541,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set) * so it can't be destroyed (or changed) under our foot. */ -static inline struct ip_set * +static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; @@ -670,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * */ -static inline void +static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; @@ -934,7 +1102,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && - nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1252,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) +int +ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) +{ + u32 cadt_flags = 0; + + if (SET_WITH_TIMEOUT(set)) + if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(set->timeout)))) + return -EMSGSIZE; + if (SET_WITH_COUNTER(set)) + cadt_flags |= IPSET_FLAG_WITH_COUNTERS; + if (SET_WITH_COMMENT(set)) + cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; + + if (!cadt_flags) + return 0; + return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); +} +EXPORT_SYMBOL_GPL(ip_set_put_flags); + static int ip_set_dump_done(struct netlink_callback *cb) { @@ -1281,6 +1474,14 @@ dump_attrs(struct nlmsghdr *nlh) } } +static const struct nla_policy +ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, +}; + static int dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { @@ -1292,9 +1493,9 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) ip_set_id_t index; int ret; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, - ip_set_setname_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_dump_policy, NULL); if (ret) return ret; @@ -1543,9 +1744,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, ip_set_adt_policy, + NULL); if (ret) { nlmsg_free(skb2); @@ -1596,7 +1797,9 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1606,7 +1809,8 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1655,7 +1859,8 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!set) return -ENOENT; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); @@ -1961,7 +2166,7 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_LIST] = { .call = ip_set_dump, .attr_count = IPSET_ATTR_CMD_MAX, - .policy = ip_set_setname_policy, + .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, @@ -2069,8 +2274,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_version->version = IPSET_PROTOCOL; - ret = copy_to_user(user, req_version, - sizeof(struct ip_set_req_version)); + if (copy_to_user(user, req_version, + sizeof(struct ip_set_req_version))) + ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { @@ -2129,7 +2335,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } /* end of switch(op) */ copy: - ret = copy_to_user(user, data, copylen); + if (copy_to_user(user, data, copylen)) + ret = -EFAULT; done: vfree(data); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2b8f959574b4..36615eb3eae1 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); #endif - -bool -ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) -{ - bool ret; - u8 proto; - - switch (pf) { - case NFPROTO_IPV4: - ret = ip_set_get_ip4_port(skb, src, port, &proto); - break; - case NFPROTO_IPV6: - ret = ip_set_get_ip6_port(skb, src, port, &proto); - break; - default: - return false; - } - if (!ret) - return ret; - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - return true; - default: - return false; - } -} -EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index d098d87bc331..7480ce55b5c8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -39,7 +39,7 @@ #ifdef IP_SET_HASH_WITH_MULTI #define AHASH_MAX(h) ((h)->ahash_max) -static inline u8 +static u8 tune_ahash_max(u8 curr, u32 multi) { u32 n; @@ -909,7 +909,7 @@ out: return ret; } -static inline int +static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f4432d9fcad0..5d6d68eaf6a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -44,7 +44,7 @@ struct hash_ip4_elem { /* Common functions */ -static inline bool +static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) @@ -63,7 +63,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; @@ -171,7 +171,7 @@ struct hash_ip6_elem { /* Common functions */ -static inline bool +static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) @@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline void +static void hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip6_netmask(ip, prefix); @@ -196,7 +196,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 24d8f4df4230..eceb7bc4a93a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -47,7 +47,7 @@ struct hash_ipmac4_elem { /* Common functions */ -static inline bool +static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) @@ -67,7 +67,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { @@ -154,7 +154,7 @@ struct hash_ipmac6_elem { /* Common functions */ -static inline bool +static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) @@ -175,7 +175,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { @@ -209,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - if (opt->flags & IPSET_DIM_ONE_SRC) + if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7a1734aad0c5..aba1df617d6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -42,7 +42,7 @@ struct hash_ipmark4_elem { /* Common functions */ -static inline bool +static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) @@ -64,7 +64,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { @@ -165,7 +165,7 @@ struct hash_ipmark6_elem { /* Common functions */ -static inline bool +static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) @@ -187,7 +187,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 32e240658334..1ff228717e29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -47,7 +47,7 @@ struct hash_ipport4_elem { /* Common functions */ -static inline bool +static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) @@ -71,7 +71,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { @@ -202,7 +202,7 @@ struct hash_ipport6_elem { /* Common functions */ -static inline bool +static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) @@ -226,7 +226,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 15d419353179..fa88afd812fa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -46,7 +46,7 @@ struct hash_ipportip4_elem { u8 padding; }; -static inline bool +static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) @@ -72,7 +72,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { @@ -210,7 +210,7 @@ struct hash_ipportip6_elem { /* Common functions */ -static inline bool +static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) @@ -236,7 +236,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7a4d7afd4121..eef6ecfcb409 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -59,7 +59,7 @@ struct hash_ipportnet4_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) @@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); @@ -116,7 +116,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { @@ -308,7 +308,7 @@ struct hash_ipportnet6_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) @@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); @@ -365,7 +365,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index d94c585d33c5..0b61593165ef 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -37,7 +37,7 @@ struct hash_mac4_elem { /* Common functions */ -static inline bool +static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) @@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, return ether_addr_equal(e1->ether, e2->ether); } -static inline bool +static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) @@ -56,7 +56,7 @@ nla_put_failure: return true; } -static inline void +static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c259cbc3ef45..136cf0781d3a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -47,7 +47,7 @@ struct hash_net4_elem { /* Common functions */ -static inline bool +static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) @@ -56,25 +56,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -97,7 +97,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { @@ -212,7 +212,7 @@ struct hash_net6_elem { /* Common functions */ -static inline bool +static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) @@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -262,7 +262,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { @@ -368,6 +368,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 87b29f971226..1a04e0929738 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -62,7 +62,7 @@ struct hash_netiface4_elem { /* Common functions */ -static inline bool +static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) @@ -74,25 +74,25 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, strcmp(ip1->iface, ip2->iface) == 0; } -static inline int +static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -119,7 +119,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { @@ -285,7 +285,7 @@ struct hash_netiface6_elem { /* Common functions */ -static inline bool +static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) @@ -297,25 +297,25 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, strcmp(ip1->iface, ip2->iface) == 0; } -static inline int +static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -342,7 +342,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index a3ae69bfee66..da4ef910b12d 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -52,7 +52,7 @@ struct hash_netnet4_elem { /* Common functions */ -static inline bool +static bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) @@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -117,7 +117,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { @@ -282,7 +282,7 @@ struct hash_netnet6_elem { /* Common functions */ -static inline bool +static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) @@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -348,7 +348,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { @@ -476,6 +476,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 799f2272cc65..34448df80fb9 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -57,7 +57,7 @@ struct hash_netport4_elem { /* Common functions */ -static inline bool +static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) @@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -112,7 +112,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { @@ -270,7 +270,7 @@ struct hash_netport6_elem { /* Common functions */ -static inline bool +static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) @@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -325,7 +325,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..934c1712cba8 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -56,7 +56,7 @@ struct hash_netportnet4_elem { /* Common functions */ -static inline bool +static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) @@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { @@ -126,7 +126,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { @@ -331,7 +331,7 @@ struct hash_netportnet6_elem { /* Common functions */ -static inline bool +static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) @@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { @@ -402,7 +402,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 67ac50104e6f..cd747c0962fd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu) kfree(e); } -static inline void +static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; @@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e) call_rcu(&e->rcu, __list_set_del_rcu); } -static inline void +static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 4515056ef1c2..f9b16f2b2219 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -193,21 +193,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * mutex_lock(&__ip_vs_app_mutex); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); - /* increase the module use count */ - ip_vs_use_count_inc(); out_unlock: mutex_unlock(&__ip_vs_app_mutex); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 8b80ab794a92..512259f579d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2402,18 +2402,22 @@ estimator_fail: return -ENOMEM; } -static void __net_exit __ip_vs_cleanup(struct net *net) +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(ipvs); - ip_vs_app_net_cleanup(ipvs); - ip_vs_protocol_net_cleanup(ipvs); - ip_vs_control_net_cleanup(ipvs); - ip_vs_estimator_net_cleanup(ipvs); - IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); - net->ipvs = NULL; + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } } static int __net_init __ip_vs_dev_init(struct net *net) @@ -2429,27 +2433,32 @@ hook_fail: return ret; } -static void __net_exit __ip_vs_dev_cleanup(struct net *net) +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ - smp_wmb(); - ip_vs_sync_net_cleanup(ipvs); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, - .exit = __ip_vs_cleanup, + .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .init = __ip_vs_dev_init, - .exit = __ip_vs_dev_cleanup, + .exit_batch = __ip_vs_dev_cleanup_batch, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8b48e7ce1c2c..3be7398901e0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -93,7 +93,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; - static int old_secure_tcp = 0; int availmem; int nomem; int to_change = -1; @@ -174,35 +173,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; break; } - old_secure_tcp = ipvs->sysctl_secure_tcp; + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); @@ -1275,7 +1274,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service *svc = NULL; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { @@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) /* * Delete service by {netns} in the service table. - * Called by __ip_vs_cleanup() + * Called by __ip_vs_batch_cleanup() */ -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) +void ip_vs_service_nets_cleanup(struct list_head *net_list) { + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(ipvs, true); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -2435,9 +2441,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (copy_from_user(arg, user, len) != 0) return -EFAULT; - /* increase the module use count */ - ip_vs_use_count_inc(); - /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { @@ -2450,13 +2453,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) - goto out_dec; + return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } - goto out_dec; + return ret; } mutex_lock(&__ip_vs_mutex); @@ -2551,10 +2554,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) out_unlock: mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - return ret; } diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index 78b074cd5464..c03066fdd5ca 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -5,7 +5,7 @@ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active - * connections , will keep all conections to the node with the highest weight + * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. * Note that this scheduler might not be suitable for UDP because it only uses diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 8e104dff7abc..166c669f0763 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -68,7 +68,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) struct ip_vs_pe *tmp; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 2f9d5cd5daee..d4903723be7e 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -179,7 +179,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) } /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_sched_mutex); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a4a78c4b06de..8dc892a9dc91 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1762,6 +1762,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1892,9 +1896,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); - /* increase the module use count */ - ip_vs_use_count_inc(); - return 0; out: @@ -1924,11 +1925,17 @@ out: } kfree(ti); } + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 888d3068a492..b1e300f8881b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0c63120b2db2..0af1898af2b8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -573,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -1417,7 +1416,6 @@ void nf_conntrack_free(struct nf_conn *ct) WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); - nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); @@ -1792,8 +1790,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6fba74b5aaf7..7956c9f19899 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -30,6 +30,7 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) +#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) enum retry_state { STATE_CONGESTED, @@ -39,11 +40,11 @@ enum retry_state { static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) { - struct nf_conn *refs[16]; + struct nf_conn *refs[ECACHE_STACK_ALLOC]; + enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int evicted = 0; - enum retry_state ret = STATE_DONE; spin_lock(&pcpu->lock); @@ -54,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) if (!nf_ct_is_confirmed(ct)) continue; + /* This ecache access is safe because the ct is on the + * pcpu dying list and we hold the spinlock -- the entry + * cannot be free'd until after the lock is released. + * + * This is true even if ct has a refcount of 0: the + * cpu that is about to free the entry must remove it + * from the dying list and needs the lock to do so. + */ e = nf_ct_ecache_find(ct); if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; + /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means + * the worker owns this entry: the ct will remain valid + * until the worker puts its ct reference. + */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; @@ -189,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) if (notify == NULL) goto out_unlock; + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) + goto out_unlock; + e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; - /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index d4ed1e197921..c24e5b64b00c 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -34,21 +34,24 @@ void nf_ct_ext_destroy(struct nf_conn *ct) t->destroy(ct); rcu_read_unlock(); } + + kfree(ct->ext); } EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext *old, *new; struct nf_ct_ext_type *t; + struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. */ WARN_ON(nf_ct_is_confirmed(ct)); - old = ct->ext; - if (old) { + if (ct->ext) { + const struct nf_ct_ext *old = ct->ext; + if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; @@ -68,22 +71,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); - kmemleak_not_leak(old); - new = __krealloc(old, alloc, gfp); + new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!old) { + if (!ct->ext) memset(new->offset, 0, sizeof(new->offset)); - ct->ext = new; - } else if (new != old) { - kfree_rcu(old, rcu); - rcu_assign_pointer(ct->ext, new); - } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); + + ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e2d13cd18875..d8d33ef52ce0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -506,9 +506,45 @@ nla_put_failure: return -1; } +/* all these functions access ct->ext. Caller must either hold a reference + * on ct or prevent its deletion by holding either the bucket spinlock or + * pcpu dying list lock. + */ +static int ctnetlink_dump_extinfo(struct sk_buff *skb, + struct nf_conn *ct, u32 type) +{ + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + return -1; + + return 0; +} + +static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) +{ + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0) + return -1; + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && + (ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0)) + return -1; + + return 0; +} + static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct) + struct nf_conn *ct, bool extinfo) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; @@ -552,23 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secctx(skb, ct) < 0 || - ctnetlink_dump_labels(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0 || - ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || - ctnetlink_dump_ct_synproxy(skb, ct) < 0) + if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; - - if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -953,13 +975,11 @@ restart: if (!ctnetlink_filter_match(ct, cb->data)) continue; - rcu_read_lock(); res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, true); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1364,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - rcu_read_lock(); err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct); - rcu_read_unlock(); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); nf_ct_put(ct); if (err <= 0) goto free; @@ -1429,12 +1447,18 @@ restart: continue; cb->args[1] = 0; } - rcu_read_lock(); + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, dying ? true : false); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 097deba7441a..c2e3dff773bc 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 132f5228b431..8468d2d02284 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -14,24 +14,15 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> -struct flow_offload_entry { - struct flow_offload flow; - struct nf_conn *ct; - struct rcu_head rcu_head; -}; - static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, - struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; - struct dst_entry *dst = route->tuple[dir].dst; ft->dir = dir; @@ -39,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; - ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; - ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -52,37 +41,24 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->l4proto = ctt->dst.protonum; ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; - - ft->iifidx = other_dst->dev->ifindex; - ft->dst_cache = dst; } -struct flow_offload * -flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { - struct flow_offload_entry *entry; struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || !atomic_inc_not_zero(&ct->ct_general.use))) return NULL; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) goto err_ct_refcnt; - flow = &entry->flow; + flow->ct = ct; - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) - goto err_dst_cache_original; - - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) - goto err_dst_cache_reply; - - entry->ct = ct; - - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); + flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) flow->flags |= FLOW_OFFLOAD_SNAT; @@ -91,10 +67,6 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) return flow; -err_dst_cache_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); -err_dst_cache_original: - kfree(entry); err_ct_refcnt: nf_ct_put(ct); @@ -102,6 +74,56 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static int flow_offload_fill_route(struct flow_offload *flow, + const struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; + struct dst_entry *dst = route->tuple[dir].dst; + + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + flow_tuple->mtu = ip6_dst_mtu_forward(dst); + break; + } + + flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->dst_cache = dst; + + return 0; +} + +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route) +{ + int err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + if (err < 0) + return err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); + if (err < 0) + goto err_route_reply; + + flow->type = NF_FLOW_OFFLOAD_ROUTE; + + return 0; + +err_route_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + + return err; +} +EXPORT_SYMBOL_GPL(flow_offload_route_init); + static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->state = TCP_CONNTRACK_ESTABLISHED; @@ -150,17 +172,25 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) flow_offload_fixup_ct_timeout(ct); } -void flow_offload_free(struct flow_offload *flow) +static void flow_offload_route_release(struct flow_offload *flow) { - struct flow_offload_entry *e; - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); - e = container_of(flow, struct flow_offload_entry, flow); +} + +void flow_offload_free(struct flow_offload *flow) +{ + switch (flow->type) { + case NF_FLOW_OFFLOAD_ROUTE: + flow_offload_route_release(flow); + break; + default: + break; + } if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(e->ct, 0, 0); - nf_ct_put(e->ct); - kfree_rcu(e, rcu_head); + nf_ct_delete(flow->ct, 0, 0); + nf_ct_put(flow->ct); + kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); @@ -202,6 +232,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); @@ -218,7 +250,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD) + nf_flow_offload_add(flow_table, flow); + return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -231,8 +265,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { - struct flow_offload_entry *e; - rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); @@ -240,25 +272,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - e = container_of(flow, struct flow_offload_entry, flow); - clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(e->ct); + flow_offload_fixup_ct(flow->ct); else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) - flow_offload_fixup_ct_timeout(e->ct); + flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { - struct flow_offload_entry *e; - flow->flags |= FLOW_OFFLOAD_TEARDOWN; - e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct_state(e->ct); + flow_offload_fixup_ct_state(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -268,7 +296,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table, { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; - struct flow_offload_entry *e; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, @@ -281,8 +308,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) return NULL; - e = container_of(flow, struct flow_offload_entry, flow); - if (unlikely(nf_ct_is_dying(e->ct))) + if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; @@ -326,12 +352,21 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - struct flow_offload_entry *e; - e = container_of(flow, struct flow_offload_entry, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) - flow_offload_del(flow_table, flow); + if (flow->flags & FLOW_OFFLOAD_HW) + nf_flow_offload_stats(flow_table, flow); + + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || + (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) { + if (flow->flags & FLOW_OFFLOAD_HW) { + if (!(flow->flags & FLOW_OFFLOAD_HW_DYING)) + nf_flow_offload_del(flow_table, flow); + else if (flow->flags & FLOW_OFFLOAD_HW_DEAD) + flow_offload_del(flow_table, flow); + } else { + flow_offload_del(flow_table, flow); + } + } } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -464,6 +499,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) int err; INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + flow_block_init(&flowtable->flow_block); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -484,15 +520,13 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - struct flow_offload_entry *e; - - e = container_of(flow, struct flow_offload_entry, flow); if (!dev) { flow_offload_teardown(flow); return; } - if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && + + if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_dead(flow); @@ -501,6 +535,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { + nf_flow_table_offload_flush(flowtable); nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } @@ -528,5 +563,18 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int __init nf_flow_table_module_init(void) +{ + return nf_flow_table_offload_init(); +} + +static void __exit nf_flow_table_module_exit(void) +{ + nf_flow_table_offload_exit(); +} + +module_init(nf_flow_table_module_init); +module_exit(nf_flow_table_module_exit); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 593357aedb36..bfb910b874ce 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -24,6 +24,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c new file mode 100644 index 000000000000..9be61f47303a --- /dev/null +++ b/net/netfilter/nf_flow_table_offload.c @@ -0,0 +1,758 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <linux/tc_act/tc_csum.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +static struct work_struct nf_flow_offload_work; +static DEFINE_SPINLOCK(flow_offload_pending_list_lock); +static LIST_HEAD(flow_offload_pending_list); + +struct flow_offload_work { + struct list_head list; + enum flow_cls_command cmd; + int priority; + struct nf_flowtable *flowtable; + struct flow_offload *flow; +}; + +struct nf_flow_key { + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + }; + struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct nf_flow_match { + struct flow_dissector dissector; + struct nf_flow_key key; + struct nf_flow_key mask; +}; + +struct nf_flow_rule { + struct nf_flow_match match; + struct flow_rule *rule; +}; + +#define NF_FLOW_DISSECTOR(__match, __type, __field) \ + (__match)->dissector.offset[__type] = \ + offsetof(struct nf_flow_key, __field) + +static int nf_flow_rule_match(struct nf_flow_match *match, + const struct flow_offload_tuple *tuple) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + + switch (tuple->l3proto) { + case AF_INET: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key->basic.n_proto = htons(ETH_P_IP); + key->ipv4.src = tuple->src_v4.s_addr; + mask->ipv4.src = 0xffffffff; + key->ipv4.dst = tuple->dst_v4.s_addr; + mask->ipv4.dst = 0xffffffff; + break; + default: + return -EOPNOTSUPP; + } + mask->basic.n_proto = 0xffff; + + switch (tuple->l4proto) { + case IPPROTO_TCP: + key->tcp.flags = 0; + mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + break; + case IPPROTO_UDP: + break; + default: + return -EOPNOTSUPP; + } + + key->basic.ip_proto = tuple->l4proto; + mask->basic.ip_proto = 0xff; + + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS); + return 0; +} + +static void flow_offload_mangle(struct flow_action_entry *entry, + enum flow_action_mangle_base htype, + u32 offset, u8 *value, u8 *mask) +{ + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.offset = offset; + memcpy(&entry->mangle.mask, mask, sizeof(u32)); + memcpy(&entry->mangle.val, value, sizeof(u32)); +} + +static int flow_offload_eth_src(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry0, + struct flow_action_entry *entry1) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct net_device *dev; + u32 mask, val; + u16 val16; + + dev = dev_get_by_index(net, tuple->iifidx); + if (!dev) + return -ENOENT; + + mask = ~0xffff0000; + memcpy(&val16, dev->dev_addr, 2); + val = val16 << 16; + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + (u8 *)&val, (u8 *)&mask); + + mask = ~0xffffffff; + memcpy(&val, dev->dev_addr + 2, 4); + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, + (u8 *)&val, (u8 *)&mask); + dev_put(dev); + + return 0; +} + +static int flow_offload_eth_dst(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry0, + struct flow_action_entry *entry1) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple; + struct neighbour *n; + u32 mask, val; + u16 val16; + + n = dst_neigh_lookup(tuple->dst_cache, &tuple->dst_v4); + if (!n) + return -ENOENT; + + mask = ~0xffffffff; + memcpy(&val, n->ha, 4); + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, + (u8 *)&val, (u8 *)&mask); + + mask = ~0x0000ffff; + memcpy(&val16, n->ha + 4, 2); + val = val16; + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + (u8 *)&val, (u8 *)&mask); + neigh_release(n); + + return 0; +} + +static void flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry) +{ + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + (u8 *)&addr, (u8 *)&mask); +} + +static void flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry) +{ + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + (u8 *)&addr, (u8 *)&mask); +} + +static int flow_offload_l4proto(const struct flow_offload *flow) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + u8 type = 0; + + switch (protonum) { + case IPPROTO_TCP: + type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; + break; + case IPPROTO_UDP: + type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; + break; + default: + break; + } + + return type; +} + +static void flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry) +{ + u32 mask = ~htonl(0xffff0000); + __be16 port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + offset = 0; /* offsetof(struct tcphdr, source); */ + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + offset = 0; /* offsetof(struct tcphdr, dest); */ + break; + default: + break; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + (u8 *)&port, (u8 *)&mask); +} + +static void flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry) +{ + u32 mask = ~htonl(0xffff); + __be16 port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + offset = 0; /* offsetof(struct tcphdr, source); */ + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + offset = 0; /* offsetof(struct tcphdr, dest); */ + break; + default: + break; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + (u8 *)&port, (u8 *)&mask); +} + +static void flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct flow_action_entry *entry) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; + + switch (protonum) { + case IPPROTO_TCP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + break; + case IPPROTO_UDP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + break; + } +} + +static void flow_offload_redirect(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct flow_action_entry *entry) +{ + struct rtable *rt; + + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = rt->dst.dev; + dev_hold(rt->dst.dev); +} + +int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + int i; + + if (flow_offload_eth_src(net, flow, dir, + &flow_rule->rule->action.entries[0], + &flow_rule->rule->action.entries[1]) < 0) + return -1; + + if (flow_offload_eth_dst(net, flow, dir, + &flow_rule->rule->action.entries[2], + &flow_rule->rule->action.entries[3]) < 0) + return -1; + + i = 4; + if (flow->flags & FLOW_OFFLOAD_SNAT) { + flow_offload_ipv4_snat(net, flow, dir, + &flow_rule->rule->action.entries[i++]); + flow_offload_port_snat(net, flow, dir, + &flow_rule->rule->action.entries[i++]); + } + if (flow->flags & FLOW_OFFLOAD_DNAT) { + flow_offload_ipv4_dnat(net, flow, dir, + &flow_rule->rule->action.entries[i++]); + flow_offload_port_dnat(net, flow, dir, + &flow_rule->rule->action.entries[i++]); + } + if (flow->flags & FLOW_OFFLOAD_SNAT || + flow->flags & FLOW_OFFLOAD_DNAT) + flow_offload_ipv4_checksum(net, flow, + &flow_rule->rule->action.entries[i++]); + + flow_offload_redirect(flow, dir, &flow_rule->rule->action.entries[i++]); + + return i; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route); + +static struct nf_flow_rule * +nf_flow_offload_rule_alloc(struct net *net, + const struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload *flow = offload->flow; + const struct flow_offload_tuple *tuple; + struct nf_flow_rule *flow_rule; + int err = -ENOMEM, num_actions; + + flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + if (!flow_rule) + goto err_flow; + + flow_rule->rule = flow_rule_alloc(10); + if (!flow_rule->rule) + goto err_flow_rule; + + flow_rule->rule->match.dissector = &flow_rule->match.dissector; + flow_rule->rule->match.mask = &flow_rule->match.mask; + flow_rule->rule->match.key = &flow_rule->match.key; + + tuple = &flow->tuplehash[dir].tuple; + err = nf_flow_rule_match(&flow_rule->match, tuple); + if (err < 0) + goto err_flow_match; + + num_actions = flowtable->type->action(net, flow, dir, flow_rule); + if (num_actions < 0) + goto err_flow_match; + + flow_rule->rule->action.num_entries = num_actions; + + return flow_rule; + +err_flow_match: + kfree(flow_rule->rule); +err_flow_rule: + kfree(flow_rule); +err_flow: + return NULL; +} + +static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < flow_rule->rule->action.num_entries; i++) { + entry = &flow_rule->rule->action.entries[i]; + if (entry->id != FLOW_ACTION_REDIRECT) + continue; + + dev_put(entry->dev); + } + kfree(flow_rule->rule); + kfree(flow_rule); +} + +static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) +{ + int i; + + for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) + __nf_flow_offload_destroy(flow_rule[i]); +} + +static int nf_flow_offload_alloc(const struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + struct net *net = read_pnet(&offload->flowtable->net); + + flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_ORIGINAL); + if (!flow_rule[0]) + return -ENOMEM; + + flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_REPLY); + if (!flow_rule[1]) { + __nf_flow_offload_destroy(flow_rule[0]); + return -ENOMEM; + } + + return 0; +} + +static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, + __be16 proto, int priority, + enum flow_cls_command cmd, + const struct flow_offload_tuple *tuple, + struct netlink_ext_ack *extack) +{ + cls_flow->common.protocol = proto; + cls_flow->common.prio = priority; + cls_flow->common.extack = extack; + cls_flow->command = cmd; + cls_flow->cookie = (unsigned long)tuple; +} + +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + int err, i = 0; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_REPLACE, + &offload->flow->tuplehash[dir].tuple, &extack); + cls_flow.rule = flow_rule->rule; + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) { + err = block_cb->cb(TC_SETUP_FT, &cls_flow, + block_cb->cb_priv); + if (err < 0) + continue; + + i++; + } + + return i; +} + +static void flow_offload_tuple_del(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_DESTROY, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv); + + offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD; +} + +static int flow_offload_rule_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + int ok_count = 0; + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + + return 0; +} + +static int flow_offload_work_add(struct flow_offload_work *offload) +{ + struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; + int err; + + err = nf_flow_offload_alloc(offload, flow_rule); + if (err < 0) + return -ENOMEM; + + err = flow_offload_rule_add(offload, flow_rule); + + nf_flow_offload_destroy(flow_rule); + + return err; +} + +static void flow_offload_work_del(struct flow_offload_work *offload) +{ + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); +} + +static void flow_offload_tuple_stats(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir, + struct flow_stats *stats) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_STATS, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv); + memcpy(stats, &cls_flow.stats, sizeof(*stats)); +} + +static void flow_offload_work_stats(struct flow_offload_work *offload) +{ + struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, + lastused + NF_FLOW_TIMEOUT); +} + +static void flow_offload_work_handler(struct work_struct *work) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + int ret; + + spin_lock_bh(&flow_offload_pending_list_lock); + list_replace_init(&flow_offload_pending_list, &offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + switch (offload->cmd) { + case FLOW_CLS_REPLACE: + ret = flow_offload_work_add(offload); + if (ret < 0) + offload->flow->flags &= ~FLOW_OFFLOAD_HW; + break; + case FLOW_CLS_DESTROY: + flow_offload_work_del(offload); + break; + case FLOW_CLS_STATS: + flow_offload_work_stats(offload); + break; + default: + WARN_ON_ONCE(1); + } + list_del(&offload->list); + kfree(offload); + } +} + +static void flow_offload_queue_work(struct flow_offload_work *offload) +{ + spin_lock_bh(&flow_offload_pending_list_lock); + list_add_tail(&offload->list, &flow_offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + schedule_work(&nf_flow_offload_work); +} + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_REPLACE; + offload->flow = flow; + offload->priority = flowtable->priority; + offload->flowtable = flowtable; + flow->flags |= FLOW_OFFLOAD_HW; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_DESTROY; + offload->flow = flow; + offload->flow->flags |= FLOW_OFFLOAD_HW_DYING; + offload->flowtable = flowtable; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + s64 delta; + + delta = flow->timeout - jiffies; + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) || + flow->flags & FLOW_OFFLOAD_HW_DYING) + return; + + offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_STATS; + offload->flow = flow; + offload->flowtable = flowtable; + + flow_offload_queue_work(offload); +} + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) +{ + if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD) + flush_work(&nf_flow_offload_work); +} + +static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + struct flow_block_cb *block_cb, *next; + int err = 0; + + switch (cmd) { + case FLOW_BLOCK_BIND: + list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); + break; + case FLOW_BLOCK_UNBIND: + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo = {}; + int err; + + if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + return 0; + + bo.net = dev_net(dev); + bo.block = &flowtable->flow_block; + bo.command = cmd; + bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo.extack = &extack; + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + return nf_flow_table_block_setup(flowtable, &bo, cmd); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); + +int nf_flow_table_offload_init(void) +{ + INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + + return 0; +} + +void nf_flow_table_offload_exit(void) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + cancel_work_sync(&nf_flow_offload_work); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + list_del(&offload->list); + kfree(offload); + } +} diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d481f9baca2f..2dc636faa322 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,11 +151,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static int nft_netdev_register_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err, j; + + j = 0; + list_for_each_entry(hook, hook_list, list) { + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_register; + + j++; + } + return 0; + +err_register: + list_for_each_entry(hook, hook_list, list) { + if (j-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + } + return err; +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} + +static int nft_register_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); +} + +static void nft_unregister_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + nft_netdev_unregister_hooks(net, &basechain->hook_list); + else + nf_unregister_net_hook(net, &basechain->ops); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -168,14 +221,14 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nf_register_net_hook(net, ops); + return nft_register_basechain_hooks(net, table->family, basechain); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -187,7 +240,7 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, ops); + nft_unregister_basechain_hooks(net, table->family, basechain); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -742,7 +795,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nft_unregister_basechain_hooks(net, table->family, + nft_base_chain(chain)); } } @@ -757,14 +811,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nft_register_basechain_hooks(net, table->family, + nft_base_chain(chain)); if (err < 0) - goto err; + goto err_register_hooks; i++; } return 0; -err: + +err_register_hooks: if (i) nft_table_disable(net, table, i); return err; @@ -1225,6 +1281,46 @@ nla_put_failure: return -ENOSPC; } +static int nft_dump_basechain_hook(struct sk_buff *skb, int family, + const struct nft_base_chain *basechain) +{ + const struct nf_hook_ops *ops = &basechain->ops; + struct nft_hook *hook, *first = NULL; + struct nlattr *nest, *nest_devs; + int n = 0; + + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + + if (family == NFPROTO_NETDEV) { + nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (!first) + first = hook; + + if (nla_put_string(skb, NFTA_DEVICE_NAME, + hook->ops.dev->name)) + goto nla_put_failure; + n++; + } + nla_nest_end(skb, nest_devs); + + if (n == 1 && + nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + + return 0; +nla_put_failure: + return -1; +} + static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, @@ -1253,21 +1349,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; struct nft_stats __percpu *stats; - struct nlattr *nest; - nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); - if (nest == NULL) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) - goto nla_put_failure; - if (basechain->dev_name[0] && - nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) + if (nft_dump_basechain_hook(skb, family, basechain)) goto nla_put_failure; - nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) @@ -1485,6 +1570,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; + struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; @@ -1495,6 +1581,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (ctx->family == NFPROTO_NETDEV) { + list_for_each_entry_safe(hook, next, + &basechain->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); @@ -1508,13 +1601,125 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) } } +static struct nft_hook *nft_netdev_hook_alloc(struct net *net, + const struct nlattr *attr) +{ + struct net_device *dev; + char ifname[IFNAMSIZ]; + struct nft_hook *hook; + int err; + + hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); + if (!hook) { + err = -ENOMEM; + goto err_hook_alloc; + } + + nla_strlcpy(ifname, attr, IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + if (!dev) { + err = -ENOENT; + goto err_hook_dev; + } + hook->ops.dev = dev; + + return hook; + +err_hook_dev: + kfree(hook); +err_hook_alloc: + return ERR_PTR(err); +} + +static bool nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) { + if (this->ops.dev == hook->ops.dev) + return true; + } + + return false; +} + +static int nf_tables_parse_netdev_hooks(struct net *net, + const struct nlattr *attr, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + const struct nlattr *tmp; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err_hook; + } + + hook = nft_netdev_hook_alloc(net, tmp); + if (IS_ERR(hook)) { + err = PTR_ERR(hook); + goto err_hook; + } + if (nft_hook_list_find(hook_list, hook)) { + err = -EEXIST; + goto err_hook; + } + list_add_tail(&hook->list, hook_list); + n++; + + if (n == NFT_NETDEVICE_MAX) { + err = -EFBIG; + goto err_hook; + } + } + if (!n) + return -EINVAL; + + return 0; + +err_hook: + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del(&hook->list); + kfree(hook); + } + return err; +} + struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; - struct net_device *dev; + struct list_head list; }; +static int nft_chain_parse_netdev(struct net *net, + struct nlattr *tb[], + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err; + + if (tb[NFTA_HOOK_DEV]) { + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + list_add_tail(&hook->list, hook_list); + } else if (tb[NFTA_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], + hook_list); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + return 0; +} + static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, @@ -1522,7 +1727,6 @@ static int nft_chain_parse_hook(struct net *net, { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; - struct net_device *dev; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1560,23 +1764,14 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; - hook->dev = NULL; + INIT_LIST_HEAD(&hook->list); if (family == NFPROTO_NETDEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { + err = nft_chain_parse_netdev(net, ha, &hook->list); + if (err < 0) { module_put(type->owner); - return -ENOENT; + return err; } - hook->dev = dev; - } else if (ha[NFTA_HOOK_DEV]) { + } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } @@ -1586,6 +1781,12 @@ static int nft_chain_parse_hook(struct net *net, static void nft_chain_release_hook(struct nft_chain_hook *hook) { + struct nft_hook *h, *next; + + list_for_each_entry_safe(h, next, &hook->list, list) { + list_del(&h->list); + kfree(h); + } module_put(hook->type->owner); } @@ -1610,6 +1811,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha return kvmalloc(alloc, GFP_KERNEL); } +static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, + const struct nft_chain_hook *hook, + struct nft_chain *chain) +{ + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; +} + +static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + struct nft_chain_hook *hook, u32 flags) +{ + struct nft_chain *chain; + struct nft_hook *h; + + basechain->type = hook->type; + INIT_LIST_HEAD(&basechain->hook_list); + chain = &basechain->chain; + + if (family == NFPROTO_NETDEV) { + list_splice_init(&hook->list, &basechain->hook_list); + list_for_each_entry(h, &basechain->hook_list, list) + nft_basechain_hook_init(&h->ops, family, hook, chain); + + basechain->ops.hooknum = hook->num; + basechain->ops.priority = hook->priority; + } else { + nft_basechain_hook_init(&basechain->ops, family, hook, chain); + } + + chain->flags |= NFT_BASE_CHAIN | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + + flow_block_init(&basechain->flow_block); + + return 0; +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1628,7 +1872,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; - struct nf_hook_ops *ops; err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) @@ -1639,9 +1882,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_chain_release_hook(&hook); return -ENOMEM; } - - if (hook.dev != NULL) - strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); + chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1654,24 +1895,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - basechain->type = hook.type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = hook.type->hooks[ops->hooknum]; - ops->dev = hook.dev; - - chain->flags |= NFT_BASE_CHAIN | flags; - basechain->policy = NF_ACCEPT; - if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) - return -EOPNOTSUPP; - - flow_block_init(&basechain->flow_block); + err = nft_basechain_init(basechain, family, &hook, flags); + if (err < 0) { + nft_chain_release_hook(&hook); + kfree(basechain); + return err; + } } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1731,6 +1960,25 @@ err1: return err; } +static bool nft_hook_list_equal(struct list_head *hook_list1, + struct list_head *hook_list2) +{ + struct nft_hook *hook; + int n = 0, m = 0; + + n = 0; + list_for_each_entry(hook, hook_list2, list) { + if (!nft_hook_list_find(hook_list1, hook)) + return false; + + n++; + } + list_for_each_entry(hook, hook_list1, list) + m++; + + return n == m; +} + static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags) { @@ -1762,12 +2010,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - ops = &basechain->ops; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (ctx->family == NFPROTO_NETDEV) { + if (!nft_hook_list_equal(&basechain->hook_list, + &hook.list)) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } else { + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority) { + nft_chain_release_hook(&hook); + return -EBUSY; + } } nft_chain_release_hook(&hook); } @@ -1922,6 +2177,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + flags |= chain->flags & NFT_BASE_CHAIN; return nf_tables_updchain(&ctx, genmask, policy, flags); } @@ -5143,9 +5399,6 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, struct nft_trans *trans; int err; - if (!obj->ops->update) - return -EOPNOTSUPP; - trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) @@ -5582,6 +5835,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, @@ -5628,43 +5882,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static int nf_tables_parse_devices(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct net_device *dev_array[], int *len) -{ - const struct nlattr *tmp; - struct net_device *dev; - char ifname[IFNAMSIZ]; - int rem, n = 0, err; - - nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { - err = -EINVAL; - goto err1; - } - - nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = __dev_get_by_name(ctx->net, ifname); - if (!dev) { - err = -ENOENT; - goto err1; - } - - dev_array[n++] = dev; - if (n == NFT_FLOWTABLE_DEVICE_MAX) { - err = -EFBIG; - goto err1; - } - } - if (!len) - return -EINVAL; - - err = 0; -err1: - *len = n; - return err; -} - static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, @@ -5675,11 +5892,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable *flowtable) { - struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; - struct nf_hook_ops *ops; + struct nft_hook *hook; int hooknum, priority; - int err, n = 0, i; + int err; err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); @@ -5697,27 +5913,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], - dev_array, &n); + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable->hook_list); if (err < 0) return err; - ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - flowtable->hooknum = hooknum; - flowtable->priority = priority; - flowtable->ops = ops; - flowtable->ops_len = n; + flowtable->hooknum = hooknum; + flowtable->data.priority = priority; - for (i = 0; i < n; i++) { - flowtable->ops[i].pf = NFPROTO_NETDEV; - flowtable->ops[i].hooknum = hooknum; - flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data; - flowtable->ops[i].hook = flowtable->data.type->hook; - flowtable->ops[i].dev = dev_array[i]; + list_for_each_entry(hook, &flowtable->hook_list, list) { + hook->ops.pf = NFPROTO_NETDEV; + hook->ops.hooknum = hooknum; + hook->ops.priority = priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -5757,14 +5967,58 @@ nft_flowtable_type_get(struct net *net, u8 family) static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; + list_for_each_entry(hook, &flowtable->hook_list, list) { + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); + } +} + +static int nft_register_flowtable_net_hooks(struct net *net, + struct nft_table *table, + struct nft_flowtable *flowtable) +{ + struct nft_hook *hook, *hook2, *next; + struct nft_flowtable *ft; + int err, i = 0; + + list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(ft, &table->flowtables, list) { + list_for_each_entry(hook2, &ft->hook_list, list) { + if (hook->ops.dev == hook2->ops.dev && + hook->ops.pf == hook2->ops.pf) { + err = -EBUSY; + goto err_unregister_net_hooks; + } + } + } + + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_BIND); + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_unregister_net_hooks; + + i++; + } + + return 0; + +err_unregister_net_hooks: + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (i-- <= 0) + break; - nf_unregister_net_hook(net, &flowtable->ops[i]); + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); } + + return err; } static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, @@ -5775,12 +6029,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; - struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_hook *hook, *next; struct nft_table *table; struct nft_ctx ctx; - int err, i, k; + int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || @@ -5819,6 +6074,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { @@ -5832,6 +6088,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err2; } + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flowtable->data.flags = + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + goto err3; + } + + write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) @@ -5842,43 +6106,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err4; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; - - list_for_each_entry(ft, &table->flowtables, list) { - for (k = 0; k < ft->ops_len; k++) { - if (!ft->ops[k].dev) - continue; - - if (flowtable->ops[i].dev == ft->ops[k].dev && - flowtable->ops[i].pf == ft->ops[k].pf) { - err = -EBUSY; - goto err5; - } - } - } - - err = nf_register_net_hook(net, &flowtable->ops[i]); - if (err < 0) - goto err5; - } + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + if (err < 0) + goto err4; err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err6; + goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err6: - i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[k]); - - kfree(flowtable->ops); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + nf_unregister_net_hook(net, &hook->ops); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } err4: flowtable->data.type->free(&flowtable->data); err3: @@ -5945,8 +6190,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; + struct nft_hook *hook; struct nlmsghdr *nlh; - int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); @@ -5962,25 +6207,23 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), - NFTA_FLOWTABLE_PAD)) + NFTA_FLOWTABLE_PAD) || + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || - nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; - for (i = 0; i < flowtable->ops_len; i++) { - const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); - - if (dev && - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6171,7 +6414,12 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - kfree(flowtable->ops); + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_del_rcu(&hook->list); + kfree(hook); + } kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); module_put(flowtable->data.type->owner); @@ -6211,14 +6459,15 @@ nla_put_failure: static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev != dev) + list_for_each_entry(hook, &flowtable->hook_list, list) { + if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->ops[i].dev = NULL; + nf_unregister_net_hook(dev_net(dev), &hook->ops); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); break; } } @@ -6499,7 +6748,8 @@ static void nft_obj_commit_update(struct nft_trans *trans) obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); - obj->ops->update(obj, newobj); + if (obj->ops->update) + obj->ops->update(obj, newobj); kfree(newobj); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index e546f759b7a7..cdea3010c7a0 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -132,13 +132,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, common->extack = extack; } -static int nft_setup_cb_call(struct nft_base_chain *basechain, - enum tc_setup_type type, void *type_data) +static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, + struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { + list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -155,32 +155,44 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } +static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, + const struct nft_base_chain *basechain, + const struct nft_rule *rule, + const struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + memset(cls_flow, 0, sizeof(*cls_flow)); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow->common, proto, + basechain->ops.priority, &extack); + cls_flow->command = command; + cls_flow->cookie = (unsigned long) rule; + if (flow) + cls_flow->rule = flow->rule; +} + static int nft_flow_offload_rule(struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { - struct flow_cls_offload cls_flow = {}; + struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, command); - if (flow) - proto = flow->proto; - - nft_flow_offload_common_init(&cls_flow.common, proto, - basechain->ops.priority, &extack); - cls_flow.command = command; - cls_flow.cookie = (unsigned long) rule; - if (flow) - cls_flow.rule = flow->rule; - - return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + &basechain->flow_block.cb_list); } static int nft_flow_offload_bind(struct flow_block_offload *bo, @@ -194,6 +206,16 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; + struct flow_cls_offload cls_flow; + struct nft_chain *chain; + struct nft_rule *rule; + + chain = &basechain->chain; + list_for_each_entry(rule, &chain->rules, list) { + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, + FLOW_CLS_DESTROY); + nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); + } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); @@ -224,20 +246,30 @@ static int nft_block_setup(struct nft_base_chain *basechain, return err; } +static void nft_flow_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + memset(bo, 0, sizeof(*bo)); + bo->net = net; + bo->block = &basechain->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; int err; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) @@ -253,17 +285,12 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; if (!chain) return; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); @@ -274,15 +301,10 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { - struct flow_block_offload bo = {}; struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); flow_indr_block_call(dev, &bo, cmd); @@ -294,32 +316,73 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK -static int nft_flow_offload_chain(struct nft_chain *chain, - u8 *ppolicy, +static int nft_chain_offload_cmd(struct nft_base_chain *basechain, + struct net_device *dev, + enum flow_block_command cmd) +{ + int err; + + if (dev->netdev_ops->ndo_setup_tc) + err = nft_block_offload_cmd(basechain, dev, cmd); + else + err = nft_indr_block_offload_cmd(basechain, dev, cmd); + + return err; +} + +static int nft_flow_block_chain(struct nft_base_chain *basechain, + const struct net_device *this_dev, + enum flow_block_command cmd) +{ + struct net_device *dev; + struct nft_hook *hook; + int err, i = 0; + + list_for_each_entry(hook, &basechain->hook_list, list) { + dev = hook->ops.dev; + if (this_dev && this_dev != dev) + continue; + + err = nft_chain_offload_cmd(basechain, dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; + + return err; + } + i++; + } + + return 0; + +err_flow_block: + list_for_each_entry(hook, &basechain->hook_list, list) { + if (i-- <= 0) + break; + + dev = hook->ops.dev; + nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + } + return err; +} + +static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; - struct net_device *dev; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - dev = basechain->ops.dev; - if (!dev) - return -EOPNOTSUPP; - policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; - if (dev->netdev_ops->ndo_setup_tc) - return nft_block_offload_cmd(basechain, dev, cmd); - else - return nft_indr_block_offload_cmd(basechain, dev, cmd); + return nft_flow_block_chain(basechain, NULL, cmd); } int nft_flow_rule_offload_commit(struct net *net) @@ -334,7 +397,8 @@ int nft_flow_rule_offload_commit(struct net *net) switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); @@ -347,7 +411,7 @@ int nft_flow_rule_offload_commit(struct net *net) policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, - FLOW_BLOCK_BIND); + FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) @@ -385,6 +449,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) { struct nft_base_chain *basechain; struct net *net = dev_net(dev); + struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; @@ -397,8 +462,16 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; + found = NULL; basechain = nft_base_chain(chain); - if (strncmp(basechain->dev_name, dev->name, IFNAMSIZ)) + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = hook; + break; + } + if (!found) continue; return chain; @@ -426,18 +499,6 @@ static void nft_indr_block_cb(struct net_device *dev, mutex_unlock(&net->nft.commit_mutex); } -static void nft_offload_chain_clean(struct nft_chain *chain) -{ - struct nft_rule *rule; - - list_for_each_entry(rule, &chain->rules, list) { - nft_flow_offload_rule(chain, rule, - NULL, FLOW_CLS_DESTROY); - } - - nft_flow_offload_chain(chain, NULL, FLOW_BLOCK_UNBIND); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -448,7 +509,9 @@ static int nft_offload_netdev_event(struct notifier_block *this, mutex_lock(&net->nft.commit_mutex); chain = __nft_offload_get_chain(dev); if (chain) - nft_offload_chain_clean(chain); + nft_flow_block_chain(nft_base_chain(chain), dev, + FLOW_BLOCK_UNBIND); + mutex_unlock(&net->nft.commit_mutex); return NOTIFY_DONE; diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 974300178fa9..02afa752dd2e 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -134,12 +134,13 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || - priv->sreg != priv->dreg) + priv->sreg != priv->dreg || priv->len != reg->len) return -EOPNOTSUPP; - memcpy(&ctx->regs[priv->dreg].mask, &priv->mask, sizeof(priv->mask)); + memcpy(®->mask, &priv->mask, sizeof(priv->mask)); return 0; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..c78d01bc02e9 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + struct nft_hook *hook, *found = NULL; + int n = 0; - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - /* UNREGISTER events are also happpening on netns exit. - * - * Altough nf_tables core releases all tables/chains, only - * this event handler provides guarantee that - * basechain.ops->dev is still accessible, so we cannot - * skip exiting net namespaces. - */ - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; + if (event != NETDEV_UNREGISTER) + return; - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev == dev) + found = hook; + + n++; } + if (!found) + return; + + if (n > 1) { + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); + kfree_rcu(found, rcu); + return; + } + + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ + __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index bd173b1824c6..0744b2bb46da 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -116,7 +116,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; - if (priv->op != NFT_CMP_EQ) + if (priv->op != NFT_CMP_EQ || reg->len != priv->len) return -EOPNOTSUPP; memcpy(key + reg->offset, &priv->data, priv->len); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index f29bbc74c4bf..dd82ff2ee19f 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -115,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (nft_flow_route(pkt, ct, &route, dir) < 0) goto err_flow_route; - flow = flow_offload_alloc(ct, &route); + flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; + if (flow_offload_route_init(flow, &route) < 0) + goto err_flow_add; + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 22a80eb60222..5cb2d8908d2a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -161,13 +161,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; + default: + return -EOPNOTSUPP; } return 0; @@ -181,14 +189,23 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, daddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, protocol): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -208,14 +225,23 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, daddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, nexthdr): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -255,10 +281,16 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; @@ -277,10 +309,16 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index be7798a50546..713fb38541df 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) return 0; /* Error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index efccd1ac9a66..0522b2b1fd95 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -458,10 +458,63 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static struct genl_dumpit_info *genl_dumpit_info_alloc(void) +{ + return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); +} + +static void genl_dumpit_info_free(const struct genl_dumpit_info *info) +{ + kfree(info); +} + +static struct nlattr ** +genl_family_rcv_msg_attrs_parse(const struct genl_family *family, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, + enum genl_validate_flags no_strict_flag, + bool parallel) +{ + enum netlink_validation validate = ops->validate & no_strict_flag ? + NL_VALIDATE_LIBERAL : + NL_VALIDATE_STRICT; + struct nlattr **attrbuf; + int err; + + if (!family->maxattr) + return NULL; + + if (parallel) { + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); + } else { + attrbuf = family->attrbuf; + } + + err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, + family->policy, validate, extack); + if (err && parallel) { + kfree(attrbuf); + return ERR_PTR(err); + } + return attrbuf; +} + +static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, + struct nlattr **attrbuf, + bool parallel) +{ + if (parallel) + kfree(attrbuf); +} + static int genl_lock_start(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc = 0; if (ops->start) { @@ -474,8 +527,7 @@ static int genl_lock_start(struct netlink_callback *cb) static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc; genl_lock(); @@ -486,8 +538,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; int rc = 0; if (ops->done) { @@ -495,120 +547,111 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); return rc; } -static int genl_family_rcv_msg(const struct genl_family *family, - struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int genl_parallel_done(struct netlink_callback *cb) { - const struct genl_ops *ops; - struct net *net = sock_net(skb->sk); - struct genl_info info; - struct genlmsghdr *hdr = nlmsg_data(nlh); - struct nlattr **attrbuf; - int hdrlen, err; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; + int rc = 0; - /* this family doesn't exist in this netns */ - if (!family->netnsok && !net_eq(net, &init_net)) - return -ENOENT; + if (ops->done) + rc = ops->done(cb); + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); + return rc; +} - hdrlen = GENL_HDRLEN + family->hdrsize; - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; +static int genl_family_rcv_msg_dumpit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; + int err; - ops = genl_get_cmd(hdr->cmd, family); - if (ops == NULL) + if (!ops->dumpit) return -EOPNOTSUPP; - if ((ops->flags & GENL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if ((ops->flags & GENL_UNS_ADMIN_PERM) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - int rc; - - if (ops->dumpit == NULL) - return -EOPNOTSUPP; - - if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) { - int hdrlen = GENL_HDRLEN + family->hdrsize; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; - if (family->maxattr) { - unsigned int validate = NL_VALIDATE_STRICT; - - if (ops->validate & - GENL_DONT_VALIDATE_DUMP_STRICT) - validate = NL_VALIDATE_LIBERAL; - rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), - family->maxattr, - family->policy, - validate, extack); - if (rc) - return rc; - } - } + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - /* we have const, but the netlink API doesn't */ - .data = (void *)ops, - .start = genl_lock_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - }; + attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT, + true); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + /* Allocate dumpit info. It is going to be freed by done() callback. */ + info = genl_dumpit_info_alloc(); + if (!info) { + genl_family_rcv_msg_attrs_free(family, attrs, true); + return -ENOMEM; + } - genl_unlock(); - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); + info->family = family; + info->ops = ops; + info->attrs = attrs; - } else { - struct netlink_dump_control c = { - .module = family->module, - .start = ops->start, - .dump = ops->dumpit, - .done = ops->done, - }; + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = genl_lock_start, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_unlock(); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); - return rc; + } else { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = ops->start, + .dump = ops->dumpit, + .done = genl_parallel_done, + }; + + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } - if (ops->doit == NULL) - return -EOPNOTSUPP; - - if (family->maxattr && family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (attrbuf == NULL) - return -ENOMEM; - } else - attrbuf = family->attrbuf; + return err; +} - if (attrbuf) { - enum netlink_validation validate = NL_VALIDATE_STRICT; +static int genl_family_rcv_msg_doit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct nlattr **attrbuf; + struct genl_info info; + int err; - if (ops->validate & GENL_DONT_VALIDATE_STRICT) - validate = NL_VALIDATE_LIBERAL; + if (!ops->doit) + return -EOPNOTSUPP; - err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - family->policy, validate, extack); - if (err < 0) - goto out; - } + attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_STRICT, + family->parallel_ops); + if (IS_ERR(attrbuf)) + return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; @@ -632,12 +675,49 @@ static int genl_family_rcv_msg(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - if (family->parallel_ops) - kfree(attrbuf); + genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); return err; } +static int genl_family_rcv_msg(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + const struct genl_ops *ops; + struct net *net = sock_net(skb->sk); + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen; + + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, + ops, hdrlen, net); + else + return genl_family_rcv_msg_doit(family, skb, nlh, extack, + ops, hdrlen, net); +} + static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1088,25 +1168,6 @@ problem: subsys_initcall(genl_init); -/** - * genl_family_attrbuf - return family's attrbuf - * @family: the family - * - * Return the family's attrbuf, while validating that it's - * actually valid to access it. - * - * You cannot use this function with a family that has parallel_ops - * and you can only use it within (pre/post) doit/dumpit callbacks. - */ -struct nlattr **genl_family_attrbuf(const struct genl_family *family) -{ - if (!WARN_ON(family->parallel_ops)) - lockdep_assert_held(&genl_mutex); - - return family->attrbuf; -} -EXPORT_SYMBOL(genl_family_attrbuf); - static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index c4f54ad2b98a..58d5373c513c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -64,28 +64,6 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* - * NETROM network devices are virtual network devices encapsulating NETROM - * frames into AX.25 which will be sent through an AX.25 device, so form a - * special "super class" of normal net devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key nr_netdev_xmit_lock_key; -static struct lock_class_key nr_netdev_addr_lock_key; - -static void nr_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); -} - -static void nr_set_lockdep_key(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); -} - -/* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) @@ -1414,7 +1392,6 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } - nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ccdd790e163a..28604414dec1 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -554,11 +554,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 17e6ca62f1be..eee0dddb7749 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -102,22 +102,14 @@ nla_put_failure: static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { - struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; - int rc; u32 idx; - rc = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, - nfc_genl_policy, NULL); - if (rc < 0) - return ERR_PTR(rc); - - if (!attrbuf[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) @@ -1099,7 +1091,6 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) local = nfc_llcp_find_local(dev); if (!local) { - nfc_put_device(dev); rc = -ENODEV; goto exit; } @@ -1159,7 +1150,6 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) local = nfc_llcp_find_local(dev); if (!local) { - nfc_put_device(dev); rc = -ENODEV; goto exit; } @@ -1697,7 +1687,8 @@ static const struct genl_ops nfc_genl_ops[] = { }, { .cmd = NFC_CMD_GET_TARGET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 3572e11b6f21..12936c151cc0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -165,7 +165,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype); + err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, + skb->mac_len); if (err) return err; @@ -178,7 +179,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_pop(skb, ethertype); + err = skb_mpls_pop(skb, ethertype, skb->mac_len); if (err) return err; @@ -199,7 +200,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (err) return err; - flow_key->mpls.top_lse = lse; + flow_key->mpls.lse[0] = lse; return 0; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 05249eb45082..df9c80bf621d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -971,6 +971,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ct = nf_ct_get(skb, &ctinfo); if (ct) { + bool add_helper = false; + /* Packets starting a new connection must be NATted before the * helper, so that the helper knows about the NAT. We enforce * this by delaying both NAT and helper calls for unconfirmed @@ -988,16 +990,17 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Userspace may decide to perform a ct lookup without a helper - * specified followed by a (recirculate and) commit with one. - * Therefore, for unconfirmed connections which we will commit, - * we need to attach the helper here. + * specified followed by a (recirculate and) commit with one, + * or attach a helper in a later commit. Therefore, for + * connections which we will commit, we may need to attach + * the helper here. */ - if (!nf_ct_is_confirmed(ct) && info->commit && - info->helper && !nfct_help(ct)) { + if (info->commit && info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) return err; + add_helper = true; /* helper installed, add seqadj if NAT is required */ if (info->nat && !nfct_seqadj(ct)) { @@ -1007,11 +1010,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") for a - * confirmed connection, or + * - nf_conntrack_in() was executed above ("!cached") or a + * helper was just attached ("add_helper") for a confirmed + * connection, or * - When committing an unconfirmed connection. */ - if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : + info->commit) && ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f30e406fbec5..2088619c03f0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -227,7 +227,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -1575,6 +1576,31 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) return 0; } +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) +{ + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1583,7 +1609,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) @@ -1596,35 +1622,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_free_reply; + goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) - goto err_free_dp; + goto err_destroy_dp; - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; err = ovs_meters_init(dp); if (err) - goto err_destroy_ports_array; + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1656,6 +1673,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } + ovs_unlock(); goto err_destroy_meters; } @@ -1672,17 +1690,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) return 0; err_destroy_meters: - ovs_unlock(); ovs_meters_exit(dp); -err_destroy_ports_array: +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); -err_free_dp: +err_destroy_dp: kfree(dp); -err_free_reply: +err_destroy_reply: kfree_skb(reply); err: return err; @@ -1881,7 +1898,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, - u32 flags, u8 cmd) + u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; @@ -1902,7 +1919,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, goto nla_put_failure; if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev)); + int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) goto nla_put_failure; @@ -1943,11 +1960,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, + GFP_KERNEL); BUG_ON(retval < 0); return skb; @@ -2089,7 +2107,7 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW); + OVS_VPORT_CMD_NEW, GFP_KERNEL); new_headroom = netdev_get_fwd_headroom(vport->dev); @@ -2150,7 +2168,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_SET); + OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); @@ -2190,7 +2208,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_DEL); + OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ @@ -2237,7 +2255,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_GET); + OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); @@ -2273,7 +2291,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_VPORT_CMD_GET) < 0) + OVS_VPORT_CMD_GET, + GFP_ATOMIC) < 0) goto out; j++; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 38147e6a20f5..9d375e74b607 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -637,27 +637,35 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - size_t stack_len = MPLS_HLEN; + u8 label_count = 1; + memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; - error = check_header(skb, skb->mac_len + stack_len); + error = check_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (unlikely(error)) return 0; memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); - if (stack_len == MPLS_HLEN) - memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + if (label_count <= MPLS_LABEL_DEPTH) + memcpy(&key->mpls.lse[label_count - 1], &lse, + MPLS_HLEN); - skb_set_inner_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (lse & htonl(MPLS_LS_S_MASK)) break; - stack_len += MPLS_HLEN; + label_count++; } + if (label_count > MPLS_LABEL_DEPTH) + label_count = MPLS_LABEL_DEPTH; + + key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index b830d5ff7af4..fd8ed766bdd1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -30,6 +30,7 @@ enum sw_flow_mac_proto { MAC_PROTO_ETHERNET, }; #define SW_FLOW_KEY_INVALID 0x80 +#define MPLS_LABEL_DEPTH 3 /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length @@ -85,9 +86,6 @@ struct sw_flow_key { */ union { struct { - __be32 top_lse; /* top label stack entry */ - } mpls; - struct { u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ u8 tos; /* IP ToS. */ u8 ttl; /* IP TTL/hop limit. */ @@ -135,6 +133,11 @@ struct sw_flow_key { } nd; }; } ipv6; + struct { + u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ + __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ + } mpls; + struct ovs_key_nsh nsh; /* network service header */ }; struct { @@ -166,7 +169,6 @@ struct sw_flow_key_range { struct sw_flow_mask { int ref_count; struct rcu_head rcu; - struct list_head list; struct sw_flow_key_range range; struct sw_flow_key key; }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d7559c64795d..65c2e3458ff5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -424,7 +424,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, - [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, @@ -1628,10 +1628,25 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; + u32 hdr_len; + u32 label_count, label_count_mask, i; mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); - SW_FLOW_KEY_PUT(match, mpls.top_lse, - mpls_key->mpls_lse, is_mask); + hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); + label_count = hdr_len / sizeof(struct ovs_key_mpls); + + if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || + hdr_len % sizeof(struct ovs_key_mpls)) + return -EINVAL; + + label_count_mask = GENMASK(label_count - 1, 0); + + for (i = 0 ; i < label_count; i++) + SW_FLOW_KEY_PUT(match, mpls.lse[i], + mpls_key[i].mpls_lse, is_mask); + + SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, + label_count_mask, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_MPLS); } @@ -2114,13 +2129,18 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); } else if (eth_p_mpls(swkey->eth.type)) { + u8 i, num_labels; struct ovs_key_mpls *mpls_key; - nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + num_labels = hweight_long(output->mpls.num_labels_mask); + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, + num_labels * sizeof(*mpls_key)); if (!nla) goto nla_put_failure; + mpls_key = nla_data(nla); - mpls_key->mpls_lse = output->mpls.top_lse; + for (i = 0; i < num_labels; i++) + mpls_key[i].mpls_lse = output->mpls.lse[i]; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -2406,13 +2426,14 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log); + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -2463,7 +2484,7 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return err; err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2478,7 +2499,7 @@ static int validate_and_copy_clone(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { int start, err; u32 exec; @@ -2498,7 +2519,7 @@ static int validate_and_copy_clone(struct net *net, return err; err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2864,6 +2885,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last) { const struct nlattr *acts_if_greater, *acts_if_lesser_eq; @@ -2912,7 +2934,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2925,7 +2947,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2952,7 +2974,8 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; @@ -3065,25 +3088,36 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, !eth_p_mpls(eth_type))) return -EINVAL; eth_type = mpls->mpls_ethertype; + mpls_label_count++; break; } - case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_MPLS: { + __be16 proto; if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; - /* Disallow subsequent L2.5+ set and mpls_pop actions - * as there is no check here to ensure that the new - * eth_type is valid and thus set actions could - * write off the end of the packet or otherwise - * corrupt it. + /* Disallow subsequent L2.5+ set actions and mpls_pop + * actions once the last MPLS label in the packet is + * is popped as there is no check here to ensure that + * the new eth type is valid and thus set actions could + * write off the end of the packet or otherwise corrupt + * it. * * Support for these actions is planned using packet * recirculation. */ - eth_type = htons(0); + proto = nla_get_be16(a); + mpls_label_count--; + + if (!eth_p_mpls(proto) || !mpls_label_count) + eth_type = htons(0); + else + eth_type = proto; + break; + } case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, @@ -3106,6 +3140,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_sample(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3176,6 +3211,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_clone(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3188,8 +3224,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_check_pkt_len(net, a, key, sfa, eth_type, - vlan_tci, log, - last); + vlan_tci, + mpls_label_count, + log, last); if (err) return err; skip_copy = true; @@ -3219,14 +3256,18 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { int err; + u32 mpls_label_count = 0; *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); + if (eth_p_mpls(key->eth.type)) + mpls_label_count = hweight_long(key->mpls.num_labels_mask); + (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, log); + key->eth.vlan.tci, mpls_label_count, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf3582c5ed70..5904e93e5765 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -34,8 +34,13 @@ #include <net/ndisc.h> #define TBL_MIN_BUCKETS 1024 +#define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_HASH_SHIFT 8 +#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) +#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) + static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; @@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static struct mask_array *tbl_mask_array_alloc(int size) +{ + struct mask_array *new; + + size = max(MASK_ARRAY_SIZE_MIN, size); + new = kzalloc(sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + if (!new) + return NULL; + + new->count = 0; + new->max = size; + + return new; +} + +static int tbl_mask_array_realloc(struct flow_table *tbl, int size) +{ + struct mask_array *old; + struct mask_array *new; + + new = tbl_mask_array_alloc(size); + if (!new) + return -ENOMEM; + + old = ovsl_dereference(tbl->mask_array); + if (old) { + int i; + + for (i = 0; i < old->max; i++) { + if (ovsl_dereference(old->masks[i])) + new->masks[new->count++] = old->masks[i]; + } + } + + rcu_assign_pointer(tbl->mask_array, new); + kfree_rcu(old, rcu); + + return 0; +} + +static int tbl_mask_array_add_mask(struct flow_table *tbl, + struct sw_flow_mask *new) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int err, ma_count = READ_ONCE(ma->count); + + if (ma_count >= ma->max) { + err = tbl_mask_array_realloc(tbl, ma->max + + MASK_ARRAY_SIZE_MIN); + if (err) + return err; + + ma = ovsl_dereference(tbl->mask_array); + } + + BUG_ON(ovsl_dereference(ma->masks[ma_count])); + + rcu_assign_pointer(ma->masks[ma_count], new); + WRITE_ONCE(ma->count, ma_count +1); + + return 0; +} + +static void tbl_mask_array_del_mask(struct flow_table *tbl, + struct sw_flow_mask *mask) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i, ma_count = READ_ONCE(ma->count); + + /* Remove the deleted mask pointers from the array */ + for (i = 0; i < ma_count; i++) { + if (mask == ovsl_dereference(ma->masks[i])) + goto found; + } + + BUG(); + return; + +found: + WRITE_ONCE(ma->count, ma_count -1); + + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); + RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + + kfree_rcu(mask, rcu); + + /* Shrink the mask array if necessary. */ + if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && + ma_count <= (ma->max / 3)) + tbl_mask_array_realloc(tbl, ma->max / 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) + tbl_mask_array_del_mask(tbl, mask); + } +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_array *ma; - ti = table_instance_alloc(TBL_MIN_BUCKETS); + table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * + MC_HASH_ENTRIES, + __alignof__(struct mask_cache_entry)); + if (!table->mask_cache) + return -ENOMEM; + ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); + if (!ma) + goto free_mask_cache; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) - return -ENOMEM; + goto free_mask_array; ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ufid_ti) @@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); - INIT_LIST_HEAD(&table->mask_list); + rcu_assign_pointer(table->mask_array, ma); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); +free_mask_array: + kfree(ma); +free_mask_cache: + free_percpu(table->mask_cache); return -ENOMEM; } @@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) __table_instance_destroy(ti); } -static void table_instance_destroy(struct table_instance *ti, +static void table_instance_flow_free(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow, + bool count) +{ + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + if (count) + table->count--; + + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + + if (count) + table->ufid_count--; + } + + flow_mask_remove(table, flow->mask); +} + +static void table_instance_destroy(struct flow_table *table, + struct table_instance *ti, struct table_instance *ufid_ti, bool deferred) { @@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti, struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; - int ver = ti->node_ver; - int ufid_ver = ufid_ti->node_ver; - hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { - hlist_del_rcu(&flow->flow_table.node[ver]); - if (ovs_identifier_is_ufid(&flow->id)) - hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + hlist_for_each_entry_safe(flow, n, head, + flow_table.node[ti->node_ver]) { + + table_instance_flow_free(table, ti, ufid_ti, + flow, false); ovs_flow_free(flow, deferred); } } @@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table) struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - table_instance_destroy(ti, ufid_ti, false); + free_percpu(table->mask_cache); + kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + table_instance_destroy(table, ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(old_ti, old_ufid_ti, true); + table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); return 0; err_free_ti: @@ -370,13 +520,10 @@ err_free_ti: static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { - int key_start = range->start; - int key_end = range->end; - const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; + const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } @@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; @@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); + (*n_mask_hit)++; + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) @@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, return NULL; } -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, - const struct sw_flow_key *key, - u32 *n_mask_hit) +/* Flow lookup does full lookup on flow table. It starts with + * mask from index passed in *index. + */ +static struct sw_flow *flow_lookup(struct flow_table *tbl, + struct table_instance *ti, + struct mask_array *ma, + const struct sw_flow_key *key, + u32 *n_mask_hit, + u32 *index) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow *flow; struct sw_flow_mask *mask; + int i; + + if (likely(*index < ma->max)) { + mask = rcu_dereference_ovsl(ma->masks[*index]); + if (mask) { + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) + return flow; + } + } + + for (i = 0; i < ma->max; i++) { + + if (i == *index) + continue; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { /* Found */ + *index = i; + return flow; + } + } + + return NULL; +} + +/* + * mask_cache maps flow to probable mask. This cache is not tightly + * coupled cache, It means updates to mask list can result in inconsistent + * cache entry in mask cache. + * This is per cpu cache and is divided in MC_HASH_SEGS segments. + * In case of a hash collision the entry is hashed in next segment. + * */ +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 skb_hash, + u32 *n_mask_hit) +{ + struct mask_array *ma = rcu_dereference(tbl->mask_array); + struct table_instance *ti = rcu_dereference(tbl->ti); + struct mask_cache_entry *entries, *ce; struct sw_flow *flow; + u32 hash; + int seg; *n_mask_hit = 0; - list_for_each_entry_rcu(mask, &tbl->mask_list, list) { - (*n_mask_hit)++; - flow = masked_flow_lookup(ti, key, mask); - if (flow) /* Found */ + if (unlikely(!skb_hash)) { + u32 mask_index = 0; + + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + } + + /* Pre and post recirulation flows usually have the same skb_hash + * value. To avoid hash collisions, rehash the 'skb_hash' with + * 'recirc_id'. */ + if (key->recirc_id) + skb_hash = jhash_1word(skb_hash, key->recirc_id); + + ce = NULL; + hash = skb_hash; + entries = this_cpu_ptr(tbl->mask_cache); + + /* Find the cache entry 'ce' to operate on. */ + for (seg = 0; seg < MC_HASH_SEGS; seg++) { + int index = hash & (MC_HASH_ENTRIES - 1); + struct mask_cache_entry *e; + + e = &entries[index]; + if (e->skb_hash == skb_hash) { + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, + &e->mask_index); + if (!flow) + e->skb_hash = 0; return flow; + } + + if (!ce || e->skb_hash < ce->skb_hash) + ce = e; /* A better replacement cache candidate. */ + + hash >>= MC_HASH_SHIFT; } - return NULL; + + /* Cache miss, do full lookup. */ + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + if (flow) + ce->skb_hash = skb_hash; + + return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 index = 0; - return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); - struct sw_flow_mask *mask; - struct sw_flow *flow; + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i; /* Always called under ovs-mutex. */ - list_for_each_entry(mask, &tbl->mask_list, list) { - flow = masked_flow_lookup(ti, match->key, mask); + for (i = 0; i < ma->max; i++) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 __always_unused n_mask_hit; + struct sw_flow_mask *mask; + struct sw_flow *flow; + + mask = ovsl_dereference(ma->masks[i]); + if (!mask) + continue; + + flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && - ovs_flow_cmp_unmasked_key(flow, match)) + ovs_flow_cmp_unmasked_key(flow, match)) { return flow; + } } + return NULL; } @@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, int ovs_flow_tbl_num_masks(const struct flow_table *table) { - struct sw_flow_mask *mask; - int num = 0; - - list_for_each_entry(mask, &table->mask_list, list) - num++; - - return num; + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + return READ_ONCE(ma->count); } static struct table_instance *table_instance_expand(struct table_instance *ti, @@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti, return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } -/* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - if (mask) { - /* ovs-lock is required to protect mask-refcount and - * mask list. - */ - ASSERT_OVSL(); - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - kfree_rcu(mask, rcu); - } - } -} - /* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { @@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - table->count--; - if (ovs_identifier_is_ufid(&flow->id)) { - hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - table->ufid_count--; - } - - /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be - * accessible as long as the RCU read lock is held. - */ - flow_mask_remove(table, flow->mask); + table_instance_flow_free(table, ti, ufid_ti, flow, true); } static struct sw_flow_mask *mask_alloc(void) @@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a, static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { - struct list_head *ml; + struct mask_array *ma; + int i; + + ma = ovsl_dereference(tbl->mask_array); + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *t; + t = ovsl_dereference(ma->masks[i]); - list_for_each(ml, &tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (mask_equal(mask, m)) - return m; + if (t && mask_equal(mask, t)) + return t; } return NULL; @@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exsits. */ @@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, return -ENOMEM; mask->key = new->key; mask->range = new->range; - list_add_rcu(&mask->list, &tbl->mask_list); + + /* Add mask to mask-list. */ + if (tbl_mask_array_add_mask(tbl, mask)) { + kfree(mask); + return -ENOMEM; + } } else { BUG_ON(!mask->ref_count); mask->ref_count++; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index bc52045b63ff..8a5cea6ae111 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -22,6 +22,17 @@ #include "flow.h" +struct mask_cache_entry { + u32 skb_hash; + u32 mask_index; +}; + +struct mask_array { + struct rcu_head rcu; + int count, max; + struct sw_flow_mask __rcu *masks[]; +}; + struct table_instance { struct hlist_head *buckets; unsigned int n_buckets; @@ -34,7 +45,8 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct list_head mask_list; + struct mask_cache_entry __percpu *mask_cache; + struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; unsigned int ufid_count; @@ -60,8 +72,9 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, - const struct sw_flow_key *, - u32 *n_mask_hit); + const struct sw_flow_key *, + u32 skb_hash, + u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 21c90d3a7ebf..58a7b8312c28 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -137,7 +137,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_NO_QUEUE; netdev->needs_free_netdev = true; - netdev->priv_destructor = internal_dev_destructor; + netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -159,7 +159,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) struct internal_dev *internal_dev; struct net_device *dev; int err; - bool free_vport = true; vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { @@ -190,10 +189,9 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) rtnl_lock(); err = register_netdevice(vport->dev); - if (err) { - free_vport = false; + if (err) goto error_unlock; - } + vport->dev->priv_destructor = internal_dev_destructor; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); @@ -207,8 +205,7 @@ error_unlock: error_free_netdev: free_netdev(dev); error_free_vport: - if (free_vport) - ovs_vport_free(vport); + ovs_vport_free(vport); error: return ERR_PTR(err); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3fc38d16c456..5da9392b03d6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -403,8 +403,9 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) ids = rcu_dereference(vport->upcall_portids); - if (ids->n_ids == 1 && ids->ids[0] == 0) - return 0; + /* If there is only one portid, select it in the fast-path. */ + if (ids->n_ids == 1) + return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 82a50e850245..53c1d41fb1c9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1295,15 +1295,21 @@ static void packet_sock_destruct(struct sock *sk) static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - u32 rxhash; + u32 *history = po->rollover->history; + u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) - if (po->rollover->history[i] == rxhash) + if (READ_ONCE(history[i]) == rxhash) count++; - po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + victim = prandom_u32() % ROLLOVER_HLEN; + + /* Avoid dirtying the cache line if possible */ + if (READ_ONCE(history[victim]) != rxhash) + WRITE_ONCE(history[victim], rxhash); + return count > (ROLLOVER_HLEN >> 1); } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 96ea9f254ae9..76d499f6af9a 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -338,9 +338,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock, if (sk->sk_state == TCP_CLOSE) return EPOLLERR; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; - if (!skb_queue_empty(&pn->ctrlreq_queue)) + if (!skb_queue_empty_lockless(&pn->ctrlreq_queue)) mask |= EPOLLPRI; if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return EPOLLHUP; diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index e35869e81766..15ce9b642b25 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -111,15 +111,11 @@ static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait) static int qrtr_tun_release(struct inode *inode, struct file *filp) { struct qrtr_tun *tun = filp->private_data; - struct sk_buff *skb; qrtr_endpoint_unregister(&tun->ep); /* Discard all SKBs */ - while (!skb_queue_empty(&tun->queue)) { - skb = skb_dequeue(&tun->queue); - kfree_skb(skb); - } + skb_queue_purge(&tun->queue); kfree(tun); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9de2ae22d583..3fd5f40189bd 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> @@ -107,6 +108,7 @@ static void rds_ib_dev_free(struct work_struct *work) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); + dma_pool_destroy(rds_ibdev->rid_hdrs_pool); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); @@ -182,6 +184,12 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->pd = NULL; goto put_dev; } + rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name, + device->dma_device, + sizeof(struct rds_header), + L1_CACHE_BYTES, 0); + if (!rds_ibdev->rid_hdrs_pool) + goto put_dev; rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); diff --git a/net/rds/ib.h b/net/rds/ib.h index f2b558e8b5ea..6e6f24753998 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -165,8 +165,8 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; struct rm_data_op *i_data_op; - struct rds_header *i_send_hdrs; - dma_addr_t i_send_hdrs_dma; + struct rds_header **i_send_hdrs; + dma_addr_t *i_send_hdrs_dma; struct rds_ib_send_work *i_sends; atomic_t i_signaled_sends; @@ -175,8 +175,8 @@ struct rds_ib_connection { struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - dma_addr_t i_recv_hdrs_dma; + struct rds_header **i_recv_hdrs; + dma_addr_t *i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; @@ -246,6 +246,7 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; + struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ bool use_fastreg; unsigned int max_mrs; @@ -381,7 +382,11 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); - +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs); +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs); #define rds_ib_conn_error(conn, fmt...) \ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 233f1368162b..6b345c858dba 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/slab.h> @@ -439,6 +440,68 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) rds_ibdev->vector_load[index]--; } +/* Allocate DMA coherent memory to be used to store struct rds_header for + * sending/receiving packets. The pointers to the DMA memory and the + * associated DMA addresses are stored in two arrays. + * + * @ibdev: the IB device + * @pool: the DMA memory pool + * @dma_addrs: pointer to the array for storing DMA addresses + * @num_hdrs: number of headers to allocate + * + * It returns the pointer to the array storing the DMA memory pointers. On + * error, NULL pointer is returned. + */ +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs) +{ + struct rds_header **hdrs; + dma_addr_t *hdr_daddrs; + u32 i; + + hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdrs) + return NULL; + + hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdr_daddrs) { + kvfree(hdrs); + return NULL; + } + + for (i = 0; i < num_hdrs; i++) { + hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]); + if (!hdrs[i]) { + rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i); + return NULL; + } + } + + *dma_addrs = hdr_daddrs; + return hdrs; +} + +/* Free the DMA memory used to store struct rds_header. + * + * @pool: the DMA memory pool + * @hdrs: pointer to the array storing DMA memory pointers + * @dma_addrs: pointer to the array storing DMA addresses + * @num_hdars: number of headers to free. + */ +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs) +{ + u32 i; + + for (i = 0; i < num_hdrs; i++) + dma_pool_free(pool, hdrs[i], dma_addrs[i]); + kvfree(hdrs); + kvfree(dma_addrs); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -451,6 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; int ret, fr_queue_space; + struct dma_pool *pool; /* * It's normal to see a null device if an incoming connection races @@ -541,31 +605,28 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_cq_out; } - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); + pool = rds_ibdev->rid_hdrs_pool; + ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); if (!ic->i_send_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); + rdsdebug("DMA send hdrs alloc failed\n"); goto qp_out; } - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); + ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); if (!ic->i_recv_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); + rdsdebug("DMA recv hdrs alloc failed\n"); goto send_hdrs_dma_out; } - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); + ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL, + &ic->i_ack_dma); if (!ic->i_ack) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); + rdsdebug("DMA ack header alloc failed\n"); goto recv_hdrs_dma_out; } @@ -596,17 +657,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn) sends_out: vfree(ic->i_sends); + ack_dma_out: - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + recv_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, ic->i_recv_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + send_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, ic->i_send_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + qp_out: rdma_destroy_qp(ic->i_cm_id); recv_cq_out: @@ -984,8 +1051,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_cm_id ? ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); err = rdma_disconnect(ic->i_cm_id); if (err) { @@ -1035,24 +1100,39 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ib_destroy_cq(ic->i_recv_cq); } - /* then free the resources that ib callbacks use */ - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + if (ic->rds_ibdev) { + struct dma_pool *pool; + + pool = ic->rds_ibdev->rid_hdrs_pool; + + /* then free the resources that ib callbacks use */ + if (ic->i_send_hdrs) { + rds_dma_hdrs_free(pool, ic->i_send_hdrs, + ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + } + + if (ic->i_recv_hdrs) { + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, + ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + } + + if (ic->i_ack) { + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + } + } else { + WARN_ON(ic->i_send_hdrs); + WARN_ON(ic->i_send_hdrs_dma); + WARN_ON(ic->i_recv_hdrs); + WARN_ON(ic->i_recv_hdrs_dma); + WARN_ON(ic->i_ack); + } if (ic->i_sends) rds_ib_send_clear_ring(ic); @@ -1071,9 +1151,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_pd = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; } BUG_ON(ic->rds_ibdev); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a0f99bbf362c..694d411dc72f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -61,7 +61,7 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.num_sge = RDS_IB_RECV_SGE; sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -343,7 +343,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, WARN_ON(ret != 1); sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs]; sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; @@ -861,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -993,10 +993,11 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), + wc->vendor_err); } /* rds_ib_process_recv() doesn't always consume the frag, and diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dfe6237dafe2..d1cc1d7778d8 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -201,7 +201,8 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_send_hdrs_dma[i]; + sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -300,10 +301,10 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), wc->vendor_err); } } @@ -631,11 +632,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_queued = jiffies; send->s_op = NULL; - send->s_sge[0].addr = ic->i_send_hdrs_dma - + (pos * sizeof(struct rds_header)); + send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; + send->s_sge[0].length = sizeof(struct rds_header); - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, + sizeof(struct rds_header)); + /* Set up the data, if present */ if (i < work_alloc @@ -674,7 +677,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); if (ic->i_flowctl && adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; + struct rds_header *hdr = ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ hdr->h_credit = adv_credits; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index f0e9ccf472a9..46b8ff24020d 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -65,28 +65,6 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; /* - * ROSE network devices are virtual network devices encapsulating ROSE - * frames into AX.25 which will be sent through an AX.25 device, so form a - * special "super class" of normal net devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key rose_netdev_xmit_lock_key; -static struct lock_class_key rose_netdev_addr_lock_key; - -static void rose_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); -} - -static void rose_set_lockdep_key(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); -} - -/* * Convert a ROSE address into text. */ char *rose2asc(char *buf, const rose_address *addr) @@ -928,7 +906,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); out_release: release_sock(sk); @@ -1033,7 +1011,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); rose_insert_socket(make); @@ -1533,7 +1511,6 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } - rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1091bf35a199..7c7d10f2e0c1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -556,6 +556,7 @@ struct rxrpc_call { struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ + const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ @@ -600,6 +601,7 @@ struct rxrpc_call { int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + bool rx_pkt_last; /* Current recvmsg packet is last */ /* Rx/Tx circular buffer, depending on phase. * diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 00c095d74145..135bf5cd8dd5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -84,7 +84,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), here); } @@ -97,7 +97,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; - trace_rxrpc_call(call, rxrpc_call_new_service, + trace_rxrpc_call(call->debug_id, rxrpc_call_new_service, atomic_read(&call->usage), here, (const void *)user_call_ID); @@ -307,6 +307,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_see_call(call); call->conn = conn; + call->security = conn->security; call->peer = rxrpc_get_peer(conn->params.peer); call->cong_cwnd = call->peer->cong_cwnd; return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 32d8dc677142..a31c18c09894 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -240,7 +240,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (p->intr) __set_bit(RXRPC_CALL_IS_INTR, &call->flags); call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, + atomic_read(&call->usage), here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we @@ -290,8 +291,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; - trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), - here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_connected, + atomic_read(&call->usage), here, NULL); rxrpc_start_call_timer(call); @@ -313,8 +314,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); - trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), - here, ERR_PTR(ret)); + trace_rxrpc_call(call->debug_id, rxrpc_call_error, + atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); @@ -376,7 +377,8 @@ bool rxrpc_queue_call(struct rxrpc_call *call) if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -391,7 +393,8 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -406,7 +409,8 @@ void rxrpc_see_call(struct rxrpc_call *call) if (call) { int n = atomic_read(&call->usage); - trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n, + here, NULL); } } @@ -418,7 +422,7 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(call->debug_id, op, n, here, NULL); } /* @@ -445,7 +449,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_release, + atomic_read(&call->usage), here, (const void *)call->flags); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); @@ -488,10 +493,10 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - if (conn) { + if (conn) rxrpc_disconnect_call(call); - conn->security->free_call_crypto(call); - } + if (call->security) + call->security->free_call_crypto(call); rxrpc_cleanup_ring(call); _leave(""); @@ -534,12 +539,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); + unsigned int debug_id = call->debug_id; int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(debug_id, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3f1da1b49f69..376370cd9285 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -212,7 +212,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); - trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, + atomic_read(&conn->usage), __builtin_return_address(0)); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); @@ -352,6 +353,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, if (cp->exclusive) { call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; _leave(" = 0 [exclusive %d]", candidate->debug_id); @@ -403,6 +405,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, candidate_published: set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; spin_unlock(&local->client_conns_lock); @@ -425,6 +428,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); call->conn = conn; + call->security = conn->security; call->security_ix = conn->security_ix; call->service_id = conn->service_id; list_add_tail(&call->chan_wait_link, &conn->waiting_calls); @@ -985,11 +989,12 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) void rxrpc_put_client_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; do { n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); if (n > 0) return; ASSERTCMP(n, >=, 0); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ed05b6922132..38d718e90dc6 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -269,7 +269,7 @@ bool rxrpc_queue_conn(struct rxrpc_connection *conn) if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, n + 1, here); else rxrpc_put_connection(conn); return true; @@ -284,7 +284,7 @@ void rxrpc_see_connection(struct rxrpc_connection *conn) if (conn) { int n = atomic_read(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here); } } @@ -296,7 +296,7 @@ void rxrpc_get_connection(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here); } /* @@ -310,7 +310,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) if (conn) { int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) - trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n + 1, here); else conn = NULL; } @@ -333,10 +333,11 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, void rxrpc_put_service_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); if (n == 1) rxrpc_set_service_reap_timer(conn->params.local->rxnet, @@ -420,7 +421,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index b30e13f6d95f..123d6ceab15c 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -134,7 +134,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), __builtin_return_address(0)); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index c97ebdc043e4..48f67a9b1037 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -147,10 +147,16 @@ void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local; struct rxrpc_peer *peer; struct sk_buff *skb; + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } _enter("%p{%d}", sk, local->debug_id); /* Clear the outstanding error value on the socket so that it doesn't @@ -160,6 +166,7 @@ void rxrpc_error_report(struct sock *sk) skb = sock_dequeue_err_skb(sk); if (!skb) { + rcu_read_unlock(); _leave("UDP socket errqueue empty"); return; } @@ -167,11 +174,11 @@ void rxrpc_error_report(struct sock *sk) serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); + rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); return; } - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 9c3ac96f71cb..452163eadb98 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -209,6 +209,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { + const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); @@ -216,7 +217,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { atomic_set(&peer->usage, 1); - peer->local = local; + peer->local = rxrpc_get_local(local); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); @@ -230,6 +231,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } _leave(" = %p", peer); @@ -307,7 +309,6 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); - peer->local = local; rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); @@ -382,7 +383,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) int n; n = atomic_inc_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here); return peer; } @@ -396,7 +397,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) if (peer) { int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) - trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here); else peer = NULL; } @@ -417,6 +418,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } @@ -426,11 +428,13 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id; int n; if (peer) { + debug_id = peer->debug_id; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) __rxrpc_put_peer(peer); } @@ -443,13 +447,15 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id = peer->debug_id; int n; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 3b0becb12041..8578c39ec839 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -251,8 +251,8 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, seq += subpacket; } - return call->conn->security->verify_packet(call, skb, offset, len, - seq, cksum); + return call->security->verify_packet(call, skb, offset, len, + seq, cksum); } /* @@ -267,11 +267,13 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, */ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, u8 *_annotation, - unsigned int *_offset, unsigned int *_len) + unsigned int *_offset, unsigned int *_len, + bool *_last) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len; + bool last = false; int ret; u8 annotation = *_annotation; u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET; @@ -281,6 +283,8 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, len = skb->len - offset; if (subpacket < sp->nr_subpackets - 1) len = RXRPC_JUMBO_DATALEN; + else if (sp->rx_flags & RXRPC_SKB_INCL_LAST) + last = true; if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { ret = rxrpc_verify_packet(call, skb, annotation, offset, len); @@ -291,7 +295,8 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, *_offset = offset; *_len = len; - call->conn->security->locate_data(call, skb, _offset, _len); + *_last = last; + call->security->locate_data(call, skb, _offset, _len); return 0; } @@ -309,7 +314,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; size_t remain; - bool last; + bool rx_pkt_last; unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; @@ -319,6 +324,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; + rx_pkt_last = call->rx_pkt_last; if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { seq = call->rx_hard_ack; @@ -329,6 +335,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; seq = hard_ack + 1; + while (top = smp_load_acquire(&call->rx_top), before_eq(seq, top) ) { @@ -356,7 +363,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_offset == 0) { ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); + &rx_pkt_offset, &rx_pkt_len, + &rx_pkt_last); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { @@ -396,13 +404,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } /* The whole packet has been transferred. */ - last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); rx_pkt_offset = 0; rx_pkt_len = 0; - if (last) { + if (rx_pkt_last) { ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); ret = 1; goto out; @@ -415,6 +422,7 @@ out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; + call->rx_pkt_last = rx_pkt_last; } done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6a1547b270fe..813fd6888142 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -419,7 +419,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - ret = conn->security->secure_packet( + ret = call->security->secure_packet( call, skb, skb->mark, skb->head); if (ret < 0) goto out; @@ -661,6 +661,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) case RXRPC_CALL_SERVER_PREALLOC: case RXRPC_CALL_SERVER_SECURING: case RXRPC_CALL_SERVER_ACCEPTING: + rxrpc_put_call(call, rxrpc_call_put); ret = -EBUSY; goto error_release_sock; default: diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2558f00f6b3e..7fc1e2c1b656 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_PKT64 */ + + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ @@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats) + int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; + p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -451,6 +454,17 @@ err1: } EXPORT_SYMBOL(tcf_idr_create); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags) +{ + /* Set cpustats according to actions flags. */ + return tcf_idr_create(tn, index, est, a, ops, bind, + !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); +} +EXPORT_SYMBOL(tcf_idr_create_from_flags); + void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + if (a->tcfa_flags) { + struct nla_bitfield32 flags = { a->tcfa_flags, + a->tcfa_flags, }; + + if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) + goto nla_put_failure; + } + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -831,13 +853,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { - [TCA_ACT_KIND] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, + [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, + [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_flags_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -846,6 +870,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { + struct nla_bitfield32 flags = { 0, 0 }; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -865,8 +890,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - nla_strlcpy(act_name, kind, IFNAMSIZ); - + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); + goto err_out; + } if (tb[TCA_ACT_COOKIE]) { cookie = nla_memdup_cookie(tb); if (!cookie) { @@ -875,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); @@ -913,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, extack); + rtnl_held, tp, flags.value, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, extack); + tp, flags.value, extack); if (err < 0) goto err_mod; @@ -974,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err = PTR_ERR(act); goto err; } - act->order = i; sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; @@ -988,6 +1016,29 @@ err: return err; } +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw) +{ + if (a->cpu_bstats) { + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (drop) + this_cpu_ptr(a->cpu_qstats)->drops += packets; + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + return; + } + + _bstats_update(&a->tcfa_bstats, bytes, packets); + if (drop) + a->tcfa_qstats.drops += packets; + if (hw) + _bstats_update(&a->tcfa_bstats_hw, bytes, packets); +} +EXPORT_SYMBOL(tcf_action_update_stats); + int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { @@ -1352,11 +1403,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { size_t attr_size = 0; - int ret = 0; + int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, - &attr_size, true, extack); + for (loop = 0; loop < 10; loop++) { + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, + actions, &attr_size, true, extack); + if (ret != -EAGAIN) + break; + } + if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); @@ -1406,11 +1462,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, */ if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; -replay: ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, extack); - if (ret == -EAGAIN) - goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 04b7bd4ec751..46f47e58b3be 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, - &act_bpf_ops, bind, true); + &act_bpf_ops, bind, true, 0); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2b43cacf82af..43a243081e7d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false); + &act_connmark_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d3cfad88dc3a..16e67e1c1db1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_new; @@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_csum_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_csum_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); - bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); + tcf_action_update_bstats(&p->common, skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) @@ -624,7 +624,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&p->common); action = TC_ACT_SHOT; goto out; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index fcc46025e790..68d6af56b243 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -465,11 +465,11 @@ out_push: skb_push_rcsum(skb, nh_ofs); out: - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + tcf_action_update_bstats(&c->common, skb); return retval; drop: - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + tcf_action_inc_drop_qstats(&c->common); return TC_ACT_SHOT; } @@ -656,7 +656,7 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); @@ -688,8 +688,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return err; if (!err) { - err = tcf_idr_create(tn, index, est, a, - &act_ct_ops, bind, true); + err = tcf_idr_create_from_flags(tn, index, est, a, + &act_ct_ops, bind, flags); if (err) { tcf_idr_cleanup(tn, index); return err; @@ -905,11 +905,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, { struct tcf_ct *c = to_ct(a); - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 0dbcfd1dca7b..b1e601007242 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); @@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false); + &act_ctinfo_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 324f1d1f6d47..416065772719 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gact_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gact_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, action = gact_rand[ptype](gact); } #endif - bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); + tcf_action_update_bstats(&gact->common, skb); if (action == TC_ACT_SHOT) - qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&gact->common); tcf_lastuse_update(&gact->tcf_tm); @@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, - packets); - if (action == TC_ACT_SHOT) - this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - - if (hw) - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), - bytes, packets); - + tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3a31e241c647..d562c88cccbe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, - bind, true); + bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 214a03d405cf..400a2cfe8452 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -95,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind, - struct tcf_proto *tp) + struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -144,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, - false); + false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -205,19 +205,19 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool unlocked, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9ce073a05414..b6e1b5bbb4da 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } - ret = tcf_idr_create(tn, index, est, a, - &act_mirred_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -231,7 +231,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, } tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + tcf_action_update_bstats(&m->common, skb); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); @@ -289,8 +289,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - res->qstats = this_cpu_ptr(m->common.cpu_qstats); - skb_tc_reinsert(skb, res); + if (skb_tc_reinsert(skb, res)) + tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } @@ -303,7 +303,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (err) { out: - qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); + tcf_action_inc_overlimit_qstats(&m->common); if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } @@ -318,10 +318,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -484,7 +481,11 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, &mirred_net_ops); + err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); + if (err) + unregister_netdevice_notifier(&mirred_device_notifier); + + return err; } static void __exit mirred_cleanup_module(void) diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index e168df0e008a..4d8c822b6aca 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -55,7 +55,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret; + int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -63,8 +63,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. */ - if (skb_at_tc_ingress(skb)) + if (skb_at_tc_ingress(skb)) { skb_push_rcsum(skb, skb->mac_len); + mac_len = skb->mac_len; + } else { + mac_len = skb_network_header(skb) - skb_mac_header(skb); + } ret = READ_ONCE(m->tcf_action); @@ -72,12 +76,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: - if (skb_mpls_pop(skb, p->tcfm_proto)) + if (skb_mpls_pop(skb, p->tcfm_proto, mac_len)) goto drop; break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); - if (skb_mpls_push(skb, new_lse, p->tcfm_proto)) + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len)) goto drop; break; case TCA_MPLS_ACT_MODIFY: @@ -127,7 +131,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); struct nlattr *tb[TCA_MPLS_MAX + 1]; @@ -220,7 +225,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_mpls_ops, bind, true); + &act_mpls_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index ea4c5359e7df..855a6fa16a62 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false); + &act_nat_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, icmph = (void *)(skb_network_header(skb) + ihl); - if ((icmph->type != ICMP_DEST_UNREACH) && - (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) + if (!icmp_is_err(icmph->type)) break; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index cdfaa79382a2..d5eff6ac17a9 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -137,7 +137,8 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; @@ -190,7 +191,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false); + &act_pedit_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); goto out_free; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 89c04c52af3d..d96271590268 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -47,7 +47,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; @@ -87,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, - &act_police_ops, bind, true); + &act_police_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -294,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a, struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -345,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(police->tcf_tm.expires); + tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 514456a0b9a8..29b23bfaf10d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_sample_ops, bind, true); + &act_sample_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6120e56117ca..9813ca4006dd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ - pr_info("simple: %s_%d\n", + pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_simp_ops, bind, false); + &act_simp_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6a8d3337c577..5f7ca7f89ca2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbedit_ops, bind, true); + &act_skbedit_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 888437f97ba6..39e6d94cfafb 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); @@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbmod_ops, bind, true); + &act_skbmod_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2f83a79f76aa..cb34e5d57aaa 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -31,7 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); - bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { @@ -208,7 +208,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); @@ -347,8 +347,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_tunnel_key_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_tunnel_key_ops, bind, + act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 08aaf719a70f..b6939abc61eb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, u16 tci; tcf_lastuse_update(&v->tcf_tm); - bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); + tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -88,7 +88,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } @@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_vlan_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -307,10 +308,7 @@ static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 64584a1df425..20d60b8fcb70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/jhash.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -47,6 +48,62 @@ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); +static u32 destroy_obj_hashfn(const struct tcf_proto *tp) +{ + return jhash_3words(tp->chain->index, tp->prio, + (__force __u32)tp->protocol, 0); +} + +static void tcf_proto_signal_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node, + destroy_obj_hashfn(tp)); + mutex_unlock(&block->proto_destroy_lock); +} + +static bool tcf_proto_cmp(const struct tcf_proto *tp1, + const struct tcf_proto *tp2) +{ + return tp1->chain->index == tp2->chain->index && + tp1->prio == tp2->prio && + tp1->protocol == tp2->protocol; +} + +static bool tcf_proto_exists_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + u32 hash = destroy_obj_hashfn(tp); + struct tcf_proto *iter; + bool found = false; + + rcu_read_lock(); + hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter, + destroy_ht_node, hash) { + if (tcf_proto_cmp(tp, iter)) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + +static void +tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + if (hash_hashed(&tp->destroy_ht_node)) + hash_del_rcu(&tp->destroy_ht_node); + mutex_unlock(&block->proto_destroy_lock); +} + /* Find classifier type by string name */ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) @@ -162,11 +219,22 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) return TC_H_MAJ(first); } +static bool tcf_proto_check_kind(struct nlattr *kind, char *name) +{ + if (kind) + return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ; + memset(name, 0, IFNAMSIZ); + return false; +} + static bool tcf_proto_is_unlocked(const char *kind) { const struct tcf_proto_ops *ops; bool ret; + if (strlen(kind) == 0) + return false; + ops = tcf_proto_lookup_ops(kind, false, NULL); /* On error return false to take rtnl lock. Proto lookup/create * functions will perform lookup again and properly handle errors. @@ -223,9 +291,11 @@ static void tcf_proto_get(struct tcf_proto *tp) static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) + bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + if (sig_destroy) + tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); @@ -235,7 +305,7 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { if (refcount_dec_and_test(&tp->refcnt)) - tcf_proto_destroy(tp, rtnl_held, extack); + tcf_proto_destroy(tp, rtnl_held, true, extack); } static int walker_check_empty(struct tcf_proto *tp, void *fh, @@ -359,6 +429,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain) static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); + mutex_destroy(&block->proto_destroy_lock); kfree_rcu(block, rcu); } @@ -534,6 +605,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_dereference(chain->filter_chain, chain); + while (tp) { + tp_next = rcu_dereference_protected(tp->next, 1); + tcf_proto_signal_destroying(chain, tp); + tp = tp_next; + } + tp = tcf_chain_dereference(chain->filter_chain, chain); RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); chain->flushing = true; @@ -833,6 +910,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); + mutex_init(&block->proto_destroy_lock); init_rwsem(&block->cb_lock); flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); @@ -1610,6 +1688,12 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_lock(&chain->filter_chain_lock); + if (tcf_proto_exists_destroying(chain, tp_new)) { + mutex_unlock(&chain->filter_chain_lock); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); + return ERR_PTR(-EAGAIN); + } + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false); if (!tp) @@ -1617,10 +1701,10 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_unlock(&chain->filter_chain_lock); if (tp) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = tp; } else if (err) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = ERR_PTR(err); } @@ -1658,6 +1742,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, return; } + tcf_proto_signal_destroying(chain, tp); next = tcf_chain_dereference(chain_info.next, chain); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); @@ -1843,6 +1928,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -1899,13 +1985,19 @@ replay: if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } + /* Take rtnl mutex if rtnl_held was set to true on previous iteration, * block is shared (no qdisc found), qdisc is not unlocked, classifier * type is not specified, classifier is not unlocked. */ if (rtnl_held || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2063,6 +2155,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2102,13 +2195,18 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc * found), qdisc is not unlocked, classifier type is not specified, * classifier is not unlocked. */ if (!prio || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2164,6 +2262,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -EINVAL; goto errout_locked; } else if (t->tcm_handle == 0) { + tcf_proto_signal_destroying(chain, tp); tcf_chain_tp_remove(chain, &chain_info, tp); mutex_unlock(&chain->filter_chain_lock); @@ -2216,6 +2315,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2252,12 +2352,17 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not * unlocked, classifier type is not specified, classifier is not * unlocked. */ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bf10bdaf5012..8229ed4a67be 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -162,16 +162,20 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf.name = obj->bpf_name; cls_bpf.exts_integrated = obj->exts_integrated; - if (oldprog) + if (oldprog && prog) err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &oldprog->gen_flags, &oldprog->in_hw_count, &prog->gen_flags, &prog->in_hw_count, true); - else + else if (prog) err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &prog->gen_flags, &prog->in_hw_count, true); + else + err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf, + skip_sw, &oldprog->gen_flags, + &oldprog->in_hw_count, true); if (prog && err) { cls_bpf_offload_cmd(tp, oldprog, prog, extack); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 82bd14e7ac93..d99966a55c84 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued) *err = -1; return; } - dst->value = sk->sk_wmem_queued; + dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) @@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl) *err = -1; return; } - dst->value = sk->sk_ack_backlog; + dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) @@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl) *err = -1; return; } - dst->value = sk->sk_max_ack_backlog; + dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) @@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat) *err = -1; return; } - dst->value = sk->sk_rcvlowat; + dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 81d58b280612..1047825d9f48 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1390,8 +1390,7 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { - [TCA_KIND] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, + [TCA_KIND] = { .type = NLA_STRING }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index cebfb65d8556..b1da5589a0c6 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -177,7 +177,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) { + if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 98dd87ce1510..b1c7e726ce5d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -530,8 +530,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } - if (time_next_packet && - (s64)(now - time_next_packet - q->ce_threshold) > 0) { + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index c261c0a18868..968519ff36e9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -14,7 +14,6 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 17bd8f539bc7..5ab696efca95 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -382,13 +382,8 @@ void __qdisc_run(struct Qdisc *q) int packets; while (qdisc_restart(q, &packets)) { - /* - * Ordered by possible occurrence: Postpone processing if - * 1. we've exceeded packet quota - * 2. another process needs the CPU; - */ quota -= packets; - if (quota <= 0 || need_resched()) { + if (quota <= 0) { __netif_schedule(q); break; } @@ -657,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else { - qdisc->empty = true; + WRITE_ONCE(qdisc->empty, true); } return skb; @@ -799,9 +794,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); -static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; - struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -854,17 +846,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -875,6 +859,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); + if (sch != &noop_qdisc) { + lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); + lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); + lockdep_set_class(&sch->running, &dev->qdisc_running_key); + } + return sch; errout1: kfree(p); @@ -1043,6 +1033,8 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; + else if(dev->type == ARPHRD_CAN) + ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) { @@ -1217,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head) /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { - while (some_qdisc_is_busy(dev)) - yield(); + while (some_qdisc_is_busy(dev)) { + /* wait_event() would avoid this sleep-loop but would + * require expensive checks in the fast paths of packet + * processing which isn't worth it. + */ + schedule_timeout_uninterruptible(1); + } /* The new qdisc is assigned at this point so we can safely * unwind stale skb lists and qdisc statistics */ diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 23cd1c873a2c..be35f03b657b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -5,11 +5,11 @@ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ -#include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> +#include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -126,7 +126,7 @@ struct wdrr_bucket { struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; - u32 perturbation; /* hash perturbation */ + siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit @@ -264,7 +264,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_get_hash_perturb(skb, q->perturbation); + hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; @@ -582,7 +582,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0e44039e729c..42e557d48e4e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); + skb = NULL; goto finish_segs; } @@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, finish_segs: if (segs) { unsigned int len, last_len; - int nb = 0; + int nb; - len = skb->len; + len = skb ? skb->len : 0; + nb = skb ? 1 : 0; while (segs) { skb2 = segs->next; @@ -612,7 +614,10 @@ finish_segs: } segs = skb2; } - qdisc_tree_reduce_backlog(sch, -nb, prev_len - len); + /* Parent qdiscs accounted for 1 skb of size @prev_len */ + qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); + } else if (!skb) { + return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index d448fe3068e5..4074c50ac3d7 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -18,7 +18,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/random.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -45,7 +45,7 @@ struct sfb_bucket { * (Section 4.4 of SFB reference : moving hash functions) */ struct sfb_bins { - u32 perturbation; /* jhash perturbation */ + siphash_key_t perturbation; /* siphash key */ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; }; @@ -217,7 +217,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { - q->bins[slot].perturbation = prandom_u32(); + get_random_bytes(&q->bins[slot].perturbation, + sizeof(q->bins[slot].perturbation)); } static void sfb_swap_slot(struct sfb_sched_data *q) @@ -314,9 +315,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation); } else { - sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); } @@ -352,7 +353,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Inelastic flow */ if (q->double_buffering) { sfbhash = skb_get_hash_perturb(skb, - q->bins[slot].perturbation); + &q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 68404a9d2ce4..c787d4d46017 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -14,7 +14,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> @@ -117,7 +117,7 @@ struct sfq_sched_data { u8 headdrop; u8 maxdepth; /* limit of packets per flow */ - u32 perturbation; + siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ @@ -157,7 +157,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); + return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -607,9 +607,11 @@ static void sfq_perturbation(struct timer_list *t) struct sfq_sched_data *q = from_timer(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + siphash_key_t nkey; + get_random_bytes(&nkey, sizeof(nkey)); spin_lock(root_lock); - q->perturbation = prandom_u32(); + q->perturbation = nkey; if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); @@ -688,7 +690,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); } sch_tree_unlock(sch); kfree(p); @@ -745,7 +747,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt, q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; - q->perturbation = prandom_u32(); + get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { int err = sfq_change(sch, opt); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 68b543f85a96..7cd68628c637 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1152,7 +1152,7 @@ EXPORT_SYMBOL_GPL(taprio_offload_free); * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ -void taprio_offload_config_changed(struct taprio_sched *q) +static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; @@ -1224,8 +1224,6 @@ static int taprio_enable_offload(struct net_device *dev, goto done; } - taprio_offload_config_changed(q); - done: taprio_offload_free(offload); @@ -1341,6 +1339,10 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } + + /* Everything went ok, return success. */ + err = 0; + out: return err; } @@ -1501,6 +1503,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); + + if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) + taprio_offload_config_changed(q); } new_admin = NULL; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index d2ffc9a0ba3a..8f8d18abd013 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -86,6 +86,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; asoc->pf_retrans = sp->pf_retrans; + asoc->ps_retrans = sp->ps_retrans; + asoc->pf_expose = sp->pf_expose; asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); @@ -324,7 +326,7 @@ void sctp_association_free(struct sctp_association *asoc) * socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); } /* Mark as dead, so other users can know this structure is @@ -429,6 +431,8 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; + sctp_ulpevent_nofity_peer_addr_change(transport, + SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ memcpy(&asoc->peer.primary_addr, &transport->ipaddr, @@ -569,6 +573,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -624,6 +629,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, /* And the partial failure retrans threshold */ peer->pf_retrans = asoc->pf_retrans; + /* And the primary path switchover retrans threshold */ + peer->ps_retrans = asoc->ps_retrans; /* Initialize the peer's SACK delay timeout based on the * association configured value. @@ -707,6 +714,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { sctp_assoc_set_primary(asoc, peer); @@ -781,9 +790,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error) { - struct sctp_ulpevent *event; - struct sockaddr_storage addr; - int spc_state = 0; + int spc_state = SCTP_ADDR_AVAILABLE; bool ulp_notify = true; /* Record the transition on the transport. */ @@ -793,19 +800,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to heartbeat success, report the SCTP_ADDR_CONFIRMED * state to the user, otherwise report SCTP_ADDR_AVAILABLE. */ - if (SCTP_UNCONFIRMED == transport->state && - SCTP_HEARTBEAT_SUCCESS == error) - spc_state = SCTP_ADDR_CONFIRMED; - else - spc_state = SCTP_ADDR_AVAILABLE; - /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1 MTU, see SCTP - * Quick failover draft section 5.1, point 5 - */ - if (transport->state == SCTP_PF) { + if (transport->state == SCTP_PF && + asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; - transport->cwnd = asoc->pathmtu; - } + else if (transport->state == SCTP_UNCONFIRMED && + error == SCTP_HEARTBEAT_SUCCESS) + spc_state = SCTP_ADDR_CONFIRMED; + transport->state = SCTP_ACTIVE; break; @@ -814,19 +815,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to inactive state. Also, release the cached route since * there may be a better route next time. */ - if (transport->state != SCTP_UNCONFIRMED) + if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; - else { + spc_state = SCTP_ADDR_UNREACHABLE; + } else { sctp_transport_dst_release(transport); ulp_notify = false; } - - spc_state = SCTP_ADDR_UNREACHABLE; break; case SCTP_TRANSPORT_PF: transport->state = SCTP_PF; - ulp_notify = false; + if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) + ulp_notify = false; + else + spc_state = SCTP_ADDR_POTENTIALLY_FAILED; break; default: @@ -836,16 +839,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, /* Generate and send a SCTP_PEER_ADDR_CHANGE notification * to the user. */ - if (ulp_notify) { - memset(&addr, 0, sizeof(struct sockaddr_storage)); - memcpy(&addr, &transport->ipaddr, - transport->af_specific->sockaddr_len); - - event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, - 0, spc_state, error, GFP_ATOMIC); - if (event) - asoc->stream.si->enqueue_event(&asoc->ulpq, event); - } + if (ulp_notify) + sctp_ulpevent_nofity_peer_addr_change(transport, + spc_state, error); /* Select new active and retran paths. */ sctp_select_active_and_retran_path(asoc); @@ -1077,7 +1073,7 @@ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) /* Decrement the backlog value for a TCP-style socket. */ if (sctp_style(oldsk, TCP)) - oldsk->sk_ack_backlog--; + sk_acceptq_removed(oldsk); /* Release references to the old endpoint and the sock. */ sctp_endpoint_put(assoc->ep); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc0405c79dfc..cc3ce5d80b08 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; - int error = 0, notify; - - /* If we failed, we may need to notify. */ - notify = msg->send_failed ? -1 : 0; + int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); - /* Check whether we _really_ need to notify. */ - if (notify < 0) { - asoc = chunk->asoc; - if (msg->send_error) - error = msg->send_error; - else - error = asoc->outqueue.error; - - notify = sctp_ulpevent_type_enabled(asoc->subscribe, - SCTP_SEND_FAILED); + + if (!msg->send_failed) { + sctp_chunk_put(chunk); + continue; } - /* Generate a SEND FAILED event only if enabled. */ - if (notify > 0) { - int sent; - if (chunk->has_tsn) - sent = SCTP_DATA_SENT; - else - sent = SCTP_DATA_UNSENT; + asoc = chunk->asoc; + error = msg->send_error ?: asoc->outqueue.error; + sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED_EVENT)) { + ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, + sent, error, + GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_chunk_put(chunk); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index fc9a4c6629ce..8a15146faaeb 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) @@ -425,8 +425,8 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index ea53049d1db6..9d05b2e7bce2 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -164,7 +164,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog++; + sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until diff --git a/net/sctp/input.c b/net/sctp/input.c index 5a070fb5b278..2277981559d0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -243,7 +243,7 @@ int sctp_rcv(struct sk_buff *skb) bh_lock_sock(sk); } - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); @@ -321,8 +321,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) local_bh_disable(); bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; @@ -336,7 +336,13 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) if (backloged) return 0; } else { - sctp_inq_push(inqueue, chunk); + if (!sctp_newsk_ready(sk)) { + if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) + return 0; + sctp_chunk_free(chunk); + } else { + sctp_inq_push(inqueue, chunk); + } } done: @@ -358,7 +364,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) struct sctp_ep_common *rcvr = chunk->rcvr; int ret; - ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf); + ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 08d14d86ecfb..fbbf19128c2d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1217,9 +1217,15 @@ static int __net_init sctp_defaults_init(struct net *net) /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; + /* Disable of Primary Path Switchover by default */ + net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; + /* Enable pf state by default */ net->sctp.pf_enable = 1; + /* Ignore pf exposure feature by default */ + net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e41ed2e0ae7d..48d63956a68c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2155,7 +2155,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_SET_PRIMARY: if (ep->asconf_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2166,11 +2166,11 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association @@ -2187,7 +2187,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or @@ -2203,7 +2203,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - @@ -2226,7 +2226,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, retval = SCTP_IERROR_ABORT; } break; -fallthrough: +unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e52b2128e43b..acd737d4c0e0 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -567,6 +567,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, SCTP_FAILED_THRESHOLD); } + if (transport->error_count > transport->ps_retrans && + asoc->peer.primary_path == transport && + asoc->peer.active_path != transport) + sctp_assoc_set_primary(asoc, asoc->peer.active_path); + /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 939b8d2595bc..83e4ca1fabda 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3943,18 +3943,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, char __user *optval, - unsigned int optlen) + unsigned int optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int len; - if (optlen < sizeof(struct sctp_paddrthlds)) + len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (optlen < len) return -EINVAL; - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, - sizeof(struct sctp_paddrthlds))) + if (copy_from_user(&val, optval, len)) return -EFAULT; + if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + return -EINVAL; + if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); @@ -3963,6 +3967,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; return 0; @@ -3978,17 +3984,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, transports) { if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; } if (val.spt_pathmaxrxt) asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + asoc->ps_retrans = val.spt_pathcpthld; asoc->pf_retrans = val.spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val.spt_pathmaxrxt) sp->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + sp->ps_retrans = val.spt_pathcpthld; sp->pf_retrans = val.spt_pathpfthld; } @@ -4589,6 +4601,40 @@ out: return retval; } +static int sctp_setsockopt_pf_expose(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + if (asoc) + asoc->pf_expose = params.assoc_value; + else + sctp_sk(sk)->pf_expose = params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4744,7 +4790,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); @@ -4798,6 +4849,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5041,6 +5095,8 @@ static int sctp_init_sock(struct sock *sk) sp->hbinterval = net->sctp.hb_interval; sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; + sp->ps_retrans = net->sctp.ps_retrans; + sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; @@ -5521,8 +5577,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); - if (!transport) - return -EINVAL; + if (!transport) { + retval = -EINVAL; + goto out; + } + + if (transport->state == SCTP_PF && + transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { + retval = -EACCES; + goto out; + } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; @@ -7170,18 +7234,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, - int len, - int __user *optlen) + char __user *optval, int len, + int __user *optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int min; - if (len < sizeof(struct sctp_paddrthlds)) + min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (len < min) return -EINVAL; - len = sizeof(struct sctp_paddrthlds); - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len)) + len = min; + if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { @@ -7192,6 +7257,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; + val.spt_pathcpthld = trans->ps_retrans; goto out; } @@ -7204,11 +7270,13 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; + val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; + val.spt_pathcpthld = sp->ps_retrans; } out: @@ -7900,6 +7968,45 @@ out: return retval; } +static int sctp_getsockopt_pf_expose(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->pf_expose + : sctp_sk(sk)->pf_expose; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8049,7 +8156,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen); + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); @@ -8112,6 +8224,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -8376,7 +8491,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } } - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); return sctp_hash_endpoint(ep); } @@ -8430,7 +8545,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) @@ -8476,7 +8591,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask = 0; /* Is there any exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) @@ -8485,7 +8600,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* The association is either gone or not ready. */ @@ -8871,7 +8986,7 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (sk_can_busy_loop(sk)) { sk_busy_loop(sk, noblock); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) continue; } @@ -9306,7 +9421,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; - newinet->inet_id = asoc->next_tsn ^ jiffies; + newinet->inet_id = prandom_u32(); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; @@ -9500,7 +9615,7 @@ struct proto sctp_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp_sock), .useroffset = offsetof(struct sctp_sock, subscribe), .usersize = offsetof(struct sctp_sock, initmsg) - @@ -9542,7 +9657,7 @@ struct proto sctpv6_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 238cf1737576..4740aa70e652 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ static int rto_alpha_min = 0; static int rto_beta_min = 0; static int rto_alpha_max = 1000; static int rto_beta_max = 1000; +static int pf_expose_max = SCTP_PF_EXPOSE_MAX; +static int ps_retrans_max = SCTP_PS_RETRANS_MAX; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -212,7 +214,16 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, + .extra2 = &init_net.sctp.ps_retrans, + }, + { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, }, { .procname = "sndbuf_policy", @@ -318,6 +329,15 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pf_expose", + .data = &init_net.sctp.pf_expose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &pf_expose_max, + }, { /* sentinel */ } }; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e0cc1edf49a0..c82dbdcf13f2 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -238,7 +238,7 @@ fail: * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( +static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) @@ -336,6 +336,22 @@ fail: return NULL; } +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error) +{ + struct sctp_association *asoc = transport->asoc; + struct sockaddr_storage addr; + struct sctp_ulpevent *event; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, + error, GFP_ATOMIC); + if (event) + asoc->stream.si->enqueue_event(&asoc->ulpq, event); +} + /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the @@ -511,6 +527,45 @@ fail: return NULL; } +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, gfp_t gfp) +{ + struct sctp_send_failed_event *ssf; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int len; + + skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); + if (!skb) + return NULL; + + len = ntohs(chunk->chunk_hdr->length); + len -= sctp_datachk_len(&asoc->stream); + + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + ssf = skb_push(skb, sizeof(*ssf)); + ssf->ssf_type = SCTP_SEND_FAILED_EVENT; + ssf->ssf_flags = flags; + ssf->ssf_length = sizeof(*ssf) + len; + skb_trim(skb, ssf->ssf_length); + ssf->ssf_error = error; + + ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; + ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; + ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; + ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; + ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; + + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + + return event; +} + /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5b932583e407..b7d9fd285c71 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -123,6 +123,12 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_restore_fallback_changes(struct smc_sock *smc) +{ + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; +} + static int __smc_release(struct smc_sock *smc) { struct sock *sk = &smc->sk; @@ -141,6 +147,7 @@ static int __smc_release(struct smc_sock *smc) } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); + smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); @@ -167,6 +174,7 @@ static int smc_release(struct socket *sock) if (!sk) goto out; + sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ @@ -189,6 +197,7 @@ static int smc_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -700,8 +709,6 @@ static int __smc_connect(struct smc_sock *smc) int smc_type; int rc = 0; - sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); @@ -846,6 +853,8 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; + + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (schedule_work(&smc->connect_work)) smc->connect_nonblock = 1; @@ -970,12 +979,14 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; __smc_release(smc); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } @@ -1291,8 +1302,8 @@ static void smc_listen_work(struct work_struct *work) /* check if RDMA is available */ if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ /* prepare RDMA check */ - memset(&ini, 0, sizeof(ini)); ini.is_smcd = false; + ini.ism_dev = NULL; ini.ib_lcl = &pclc->lcl; rc = smc_find_rdma_device(new_smc, &ini); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 878313f8d6c1..be11ba41190f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,6 +188,7 @@ struct smc_connection { * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..7dc07ec2379b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (!conn->alert_token_local) + if (conn->killed) /* abnormal termination */ rc = -EPIPE; return rc; @@ -328,7 +328,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) struct smcd_cdc_msg cdc; struct smc_sock *smc; - if (!conn) + if (!conn || conn->killed) return; data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fc06720b53c1..d34e5adce2eb 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -13,6 +13,7 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_tx.h" @@ -65,8 +66,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - (sk->sk_err == ECONNABORTED) || - (sk->sk_err == ECONNRESET), + sk->sk_err == ECONNABORTED || + sk->sk_err == ECONNRESET || + smc->conn.killed, &wait); if (rc) break; @@ -95,11 +97,13 @@ static int smc_close_final(struct smc_connection *conn) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } -static int smc_close_abort(struct smc_connection *conn) +int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; @@ -109,19 +113,15 @@ static int smc_close_abort(struct smc_connection *conn) /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ -static void smc_close_active_abort(struct smc_sock *smc) +void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; - - struct smc_cdc_conn_state_flags *txflags = - &smc->conn.local_tx_ctrl.conn_state_flags; + bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); - } + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: @@ -129,35 +129,29 @@ static void smc_close_active_abort(struct smc_sock *smc) release_sock(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); + sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - if (!smc_cdc_rxed_any_close(&smc->conn)) - sk->sk_state = SMC_PEERABORTWAIT; - else - sk->sk_state = SMC_CLOSED; release_sock(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: - if (!txflags->peer_conn_closed) { - /* just SHUTDOWN_SEND done */ - sk->sk_state = SMC_PEERABORTWAIT; - } else { - sk->sk_state = SMC_CLOSED; - } + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_CLOSED; break; - case SMC_PEERFINCLOSEWAIT: - sock_put(sk); /* passive closing */ - break; case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: @@ -166,6 +160,12 @@ static void smc_close_active_abort(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -215,8 +215,6 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -229,8 +227,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } sk->sk_state = SMC_CLOSED; break; @@ -246,8 +242,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_final(conn); - if (rc) - break; if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; @@ -263,8 +257,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -272,10 +264,12 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - smc_close_abort(conn); + rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; @@ -344,12 +338,6 @@ static void smc_close_passive_work(struct work_struct *work) lock_sock(sk); old_state = sk->sk_state; - if (!conn->alert_token_local) { - /* abnormal termination */ - smc_close_active_abort(smc); - goto wakeup; - } - rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ @@ -451,8 +439,6 @@ again: goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: @@ -466,8 +452,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index e0e3b5df25d2..634fea2b7c95 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -24,5 +24,7 @@ int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4ca50ddf8d16..0d92456729ab 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -42,20 +42,40 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - mod_delayed_work(system_wq, &lgr->free_work, - (!lgr->is_smcd && lgr->role == SMC_CLNT) ? - SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); + if (!lgr->freeing && !lgr->freefast) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { - mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); + if (!lgr->freeing && !lgr->freefast) { + lgr->freefast = 1; + mod_delayed_work(system_wq, &lgr->free_work, + SMC_LGR_FREE_DELAY_FAST); + } } /* Register connection's alert token in our lookup structure. @@ -134,6 +154,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } /* Send delete link, either as client to request the initiation @@ -157,48 +178,62 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); + spinlock_t *lgr_lock; + struct smc_link *lnk; bool conns; - spin_lock_bh(&smc_lgr_list.lock); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); return; } - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); /* remove from smc_lgr_list */ - spin_unlock_bh(&smc_lgr_list.lock); + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && !smc_link_send_delete(lnk)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); + spin_unlock_bh(lgr_lock); return; } } + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); - if (!delayed_work_pending(&lgr->free_work)) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); + smc_lgr_free(lgr); +} - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); - } +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); + + smc_lgr_terminate(lgr); } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; + struct list_head *lgr_list; struct smc_link *lnk; + spinlock_t *lgr_lock; u8 rndvec[3]; int rc = 0; int i; @@ -213,10 +248,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; - goto out; + goto ism_put_vlan; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; + lgr->terminating = 0; + lgr->freefast = 0; + lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); @@ -228,13 +266,18 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ + get_device(&ini->ism_dev->dev); lgr->peer_gid = ini->ism_gid; lgr->smcd = ini->ism_dev; + lgr_list = &ini->ism_dev->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; } else { /* SMC-R specific settings */ + get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); @@ -245,6 +288,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; if (!ini->ib_dev->initialized) @@ -274,9 +319,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto destroy_qp; } smc->conn.lgr = lgr; - spin_lock_bh(&smc_lgr_list.lock); - list_add(&lgr->list, &smc_lgr_list.list); - spin_unlock_bh(&smc_lgr_list.lock); + spin_lock_bh(lgr_lock); + list_add(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); return 0; destroy_qp: @@ -289,6 +334,9 @@ clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) @@ -306,7 +354,7 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd) { + if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], @@ -337,9 +385,10 @@ void smc_conn_free(struct smc_connection *conn) } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn, lgr); /* allow buffer reuse */ - conn->lgr = NULL; + if (!list_empty(&lgr->list)) { + smc_lgr_unregister_conn(conn); + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); @@ -430,23 +479,50 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) + if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - else + put_device(&lgr->smcd->dev); + } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + } kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + struct list_head *lgr_list; + spinlock_t *lgr_lock; + + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); +} + +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); } -/* terminate linkgroup abnormally */ +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_close_abort(conn); + conn->killed = 1; + smc_sk_wake_ups(smc); + smc_lgr_unregister_conn(conn); + smc->sk.sk_err = ECONNABORTED; + smc_close_active_abort(smc); +} + +/* terminate link group */ static void __smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; @@ -456,52 +532,65 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ lgr->terminating = 1; - if (!list_empty(&lgr->list)) /* forget lgr */ - list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - write_lock_bh(&lgr->conns_lock); + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { + read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); /* sock_put in close work */ - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - __smc_lgr_unregister_conn(conn); - conn->lgr = NULL; - write_unlock_bh(&lgr->conns_lock); - if (!schedule_work(&conn->close_work)) - sock_put(&smc->sk); - write_lock_bh(&lgr->conns_lock); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } - write_unlock_bh(&lgr->conns_lock); + read_unlock_bh(&lgr->conns_lock); if (!lgr->is_smcd) wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work(lgr); + smc_lgr_schedule_free_work_fast(lgr); } +/* unlink and terminate link group */ void smc_lgr_terminate(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->terminating) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); __smc_lgr_terminate(lgr); - spin_unlock_bh(&smc_lgr_list.lock); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); } spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr); + } } /* Called when SMC-D device is terminated or peer is lost */ @@ -511,20 +600,19 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd && lgr->smcd == dev && - (!peer_gid || lgr->peer_gid == peer_gid) && + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - __smc_lgr_terminate(lgr); list_move(&lgr->list, &lgr_free_list); } } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); + __smc_lgr_terminate(lgr); cancel_delayed_work_sync(&lgr->free_work); if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ smc_ism_signal_shutdown(lgr); @@ -558,7 +646,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) } rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -604,10 +692,14 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; + spinlock_t *lgr_lock; int rc = 0; + lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->srv_first_contact) @@ -615,8 +707,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto create; /* determine if an existing link group can be reused */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry(lgr, &smc_lgr_list.list, list) { + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : @@ -636,7 +728,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } write_unlock_bh(&lgr->conns_lock); } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -1024,16 +1116,45 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return 0; } +static void smc_core_going_away(void) +{ + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; + + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); + } + spin_unlock(&smc_ib_devices.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; + } + spin_unlock(&smcd_dev_list.lock); +} + /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_freeing_list); + struct smcd_dev *smcd; + + smc_core_going_away(); spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); spin_unlock_bh(&smc_lgr_list.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + list_splice_init(&smcd->lgr_list, &lgr_freeing_list); + spin_unlock(&smcd_dev_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); if (!lgr->is_smcd) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c00ac61dc129..e6fd1ed42064 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -202,8 +202,11 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + u8 freefast : 1; /* free worker scheduled fast */ + u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ union { @@ -280,6 +283,12 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + if (!lgr->terminating) + schedule_work(&lgr->terminate_work); +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d14ca4af6f94..af05daeb0538 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -242,8 +242,12 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); - if (!smc_ib_port_active(smcibdev, port_idx + 1)) + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); smc_port_terminate(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + } } } @@ -259,8 +263,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: /* terminate all ports on device */ - for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); + } schedule_work(&smcibdev->port_event_work); break; case IB_EVENT_PORT_ERR: @@ -269,6 +275,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, port_idx = ibevent->element.port_num - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + if (ibevent->event == IB_EVENT_PORT_ERR) + set_bit(port_idx, smcibdev->ports_going_away); + else if (ibevent->event == IB_EVENT_PORT_ACTIVE) + clear_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -307,6 +317,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) port_idx = ibevent->element.qp->port - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index da60ab9e8d70..6a0069db6cae 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -47,6 +47,7 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e89e918b88e0..ee7340898cb4 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -286,7 +286,9 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) { @@ -313,6 +315,7 @@ void smcd_unregister_dev(struct smcd_dev *smcd) spin_lock(&smcd_dev_list.lock); list_del(&smcd->list); spin_unlock(&smcd_dev_list.lock); + smcd->going_away = 1; flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); @@ -342,6 +345,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) { struct smc_ism_event_work *wrk; + if (smcd->going_away) + return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4fd60c522802..e1918ffaf125 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_terminate_sched(lgr); } } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index bab2da8cf17a..82dedf052d86 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -376,8 +376,6 @@ static int smc_pnet_fill_entry(struct net *net, return 0; error: - if (pnetelem->ndev) - dev_put(pnetelem->ndev); return rc; } @@ -718,7 +716,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) int i, nest_lvl; rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -781,6 +779,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -820,6 +819,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -846,7 +846,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { - if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away) { ini->ism_dev = ismdev; break; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 413a6abf227e..39d7b34d06d2 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; int rc; @@ -210,9 +212,10 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || + cflags->peer_conn_abort || sk->sk_shutdown & RCV_SHUTDOWN || - fcrit(conn) || - smc_cdc_rxed_any_close_or_senddone(conn), + conn->killed || + fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -262,6 +265,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -303,16 +318,20 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; - if (atomic_read(&conn->bytes_to_rcv)) + if (conn->killed) + break; + + if (smc_rx_recvmsg_data_available(smc)) goto copy; - else if (conn->urg_state == SMC_URG_VALID) - /* we received a single urgent Byte - skip */ - smc_rx_update_cons(smc, 0); - if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; break; + } if (read_done) { if (sk->sk_err || diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6c8f09c1ce51..824f096ee7de 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -86,6 +86,7 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; @@ -155,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; @@ -282,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + if (rc) smc_lgr_terminate(lgr); - } return rc; } @@ -495,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; rc = 0; - if (conn->alert_token_local) /* connection healthy */ - mod_delayed_work(system_wq, &conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } return rc; } @@ -547,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -573,9 +576,7 @@ void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); - if (smc->sk.sk_err || - !conn->alert_token_local || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + if (smc->sk.sk_err) goto out; rc = smc_tx_sndbuf_nonempty(conn); @@ -608,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && - conn->alert_token_local) { /* connection healthy */ + !conn->killed) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 253aa75dc2b6..50743dc56c86 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -101,7 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -191,7 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -247,7 +247,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } return rc; } @@ -272,7 +272,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -373,7 +373,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 339e8c077c2d..195b40c5dae4 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -220,7 +220,7 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs) goto out; spin_lock_bh(&xprt->bc_pa_lock); - xprt->bc_alloc_max -= max_reqs; + xprt->bc_alloc_max -= min(max_reqs, xprt->bc_alloc_max); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { dprintk("RPC: req=%p\n", req); list_del(&req->rq_bc_pa_list); @@ -307,8 +307,8 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) */ dprintk("RPC: Last session removed req=%p\n", req); xprt_free_allocation(req); - return; } + xprt_put(xprt); } /* @@ -339,7 +339,7 @@ found: spin_unlock(&xprt->bc_pa_lock); if (new) { if (req != new) - xprt_free_bc_rqst(new); + xprt_free_allocation(new); break; } else if (req) break; @@ -368,6 +368,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); dprintk("RPC: add callback request to list\n"); + xprt_get(xprt); spin_lock(&bc_serv->sv_cb_lock); list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); wake_up(&bc_serv->sv_cb_waitq); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8a45b3ccc313..41df4c507193 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1943,6 +1943,11 @@ static void xprt_destroy_cb(struct work_struct *work) rpc_destroy_wait_queue(&xprt->backlog); kfree(xprt->servername); /* + * Destroy any existing back channel + */ + xprt_destroy_backchannel(xprt, UINT_MAX); + + /* * Tear down transport state and free the rpc_xprt */ xprt->ops->destroy(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 50e075fcdd8f..b458bf53ca69 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -163,6 +163,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock(&xprt->bc_pa_lock); + xprt_put(xprt); } static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) @@ -259,6 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; + xprt_get(xprt); spin_lock(&bc_serv->sv_cb_lock); list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); spin_unlock(&bc_serv->sv_cb_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index b83e16ade4d2..716b61a701a8 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -35,6 +35,21 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y +config TIPC_CRYPTO + bool "TIPC encryption support" + depends on TIPC + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + help + Saying Y here will enable support for TIPC encryption. + All TIPC messages will be encrypted/decrypted by using the currently most + advanced algorithm: AEAD AES-GCM (like IPSec or TLS) before leaving/ + entering the TIPC stack. + Key setting from user-space is performed via netlink by a user program + (e.g. the iproute2 'tipc' tool). + bool + default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index c86aba0282af..11255e970dd4 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -16,6 +16,7 @@ CFLAGS_trace.o += -I$(src) tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o +tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o obj-$(CONFIG_TIPC_DIAG) += diag.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6ef1abdd525f..f41096a759fa 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -84,7 +84,7 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) */ int tipc_bcast_get_mtu(struct net *net) { - return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; + return tipc_link_mss(tipc_bc_sndlink(net)); } void tipc_bcast_disable_rcast(struct net *net) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 0214aa1c4427..d7ec26bd739d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "udp_media.h" #include "trace.h" +#include "crypto.h" #define MAX_ADDR_STR 60 @@ -315,6 +316,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->net_plane = bearer_id + 'A'; b->priority = prio; test_and_set_bit_lock(0, &b->up); + refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -351,6 +353,17 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +bool tipc_bearer_hold(struct tipc_bearer *b) +{ + return (b && refcount_inc_not_zero(&b->refcnt)); +} + +void tipc_bearer_put(struct tipc_bearer *b) +{ + if (b && refcount_dec_and_test(&b->refcnt)) + kfree_rcu(b, rcu); +} + /** * bearer_disable * @@ -369,7 +382,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) if (b->disc) tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); - kfree_rcu(b, rcu); + tipc_bearer_put(b); tipc_mon_delete(net, bearer_id); } @@ -504,10 +517,15 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, rcu_read_lock(); b = bearer_get(net, bearer_id); - if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) - b->media->send_msg(net, skb, b, dest); - else + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dest, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dest); + } else { kfree_skb(skb); + } rcu_read_unlock(); } @@ -515,7 +533,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, */ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst) + struct tipc_media_addr *dst, + struct tipc_node *__dnode) { struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -529,10 +548,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) - b->media->send_msg(net, skb, b, dst); - else + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); + } else { kfree_skb(skb); + } } rcu_read_unlock(); } @@ -543,6 +567,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq) { struct tipc_net *tn = tipc_net(net); + struct tipc_media_addr *dst; int net_id = tn->net_id; struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -557,7 +582,12 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, msg_set_non_seq(hdr, 1); msg_set_mc_netid(hdr, net_id); __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, &b->bcast_addr); + dst = &b->bcast_addr; +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } @@ -584,6 +614,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index ea0f3c49cbed..d0c79cc6c0c2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -165,6 +165,7 @@ struct tipc_bearer { struct tipc_discoverer *disc; char net_plane; unsigned long up; + refcount_t refcnt; }; struct tipc_bearer_names { @@ -210,6 +211,8 @@ int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); +bool tipc_bearer_hold(struct tipc_bearer *b); +void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); @@ -229,7 +232,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst); + struct tipc_media_addr *dst, + struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); diff --git a/net/tipc/core.c b/net/tipc/core.c index 23cb379a93d6..fc01a13d7462 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -44,6 +44,7 @@ #include "socket.h" #include "bcast.h" #include "node.h" +#include "crypto.h" #include <linux/module.h> @@ -68,6 +69,11 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); +#ifdef CONFIG_TIPC_CRYPTO + err = tipc_crypto_start(&tn->crypto_tx, net, NULL); + if (err) + goto out_crypto; +#endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -93,6 +99,11 @@ out_bclink: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tn->crypto_tx); +out_crypto: +#endif return err; } @@ -103,8 +114,20 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tipc_net(net)->crypto_tx); +#endif +} + +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); } +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -151,6 +174,10 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -158,6 +185,8 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -177,6 +206,7 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); diff --git a/net/tipc/core.h b/net/tipc/core.h index 60d829581068..775848a5f27e 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,7 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> +#include <net/netns/hash.h> struct tipc_node; struct tipc_bearer; @@ -67,6 +68,9 @@ struct tipc_link; struct tipc_name_table; struct tipc_topsrv; struct tipc_monitor; +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto; +#endif #define TIPC_MOD_VER "2.0.0" @@ -128,6 +132,11 @@ struct tipc_net { /* Tracing of node internal messages */ struct packet_type loopback_pt; + +#ifdef CONFIG_TIPC_CRYPTO + /* TX crypto handler */ + struct tipc_crypto *crypto_tx; +#endif }; static inline struct tipc_net *tipc_net(struct net *net) @@ -185,6 +194,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c new file mode 100644 index 000000000000..990a872cec46 --- /dev/null +++ b/net/tipc/crypto.c @@ -0,0 +1,1986 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <crypto/aead.h> +#include <crypto/aes.h> +#include "crypto.h" + +#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */ +#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */ +#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ +#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */ +#define TIPC_MAX_TFMS_DEF 10 +#define TIPC_MAX_TFMS_LIM 1000 + +/** + * TIPC Key ids + */ +enum { + KEY_UNUSED = 0, + KEY_MIN, + KEY_1 = KEY_MIN, + KEY_2, + KEY_3, + KEY_MAX = KEY_3, +}; + +/** + * TIPC Crypto statistics + */ +enum { + STAT_OK, + STAT_NOK, + STAT_ASYNC, + STAT_ASYNC_OK, + STAT_ASYNC_NOK, + STAT_BADKEYS, /* tx only */ + STAT_BADMSGS = STAT_BADKEYS, /* rx only */ + STAT_NOKEYS, + STAT_SWITCHES, + + MAX_STATS, +}; + +/* TIPC crypto statistics' header */ +static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", + "async_nok", "badmsgs", "nokeys", + "switches"}; + +/* Max TFMs number per key */ +int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; + +/** + * struct tipc_key - TIPC keys' status indicator + * + * 7 6 5 4 3 2 1 0 + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * key: | (reserved)|passive idx| active idx|pending idx| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + */ +struct tipc_key { +#define KEY_BITS (2) +#define KEY_MASK ((1 << KEY_BITS) - 1) + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 pending:2, + active:2, + passive:2, /* rx only */ + reserved:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:2, + passive:2, /* rx only */ + active:2, + pending:2; +#else +#error "Please fix <asm/byteorder.h>" +#endif + } __packed; + u8 keys; + }; +}; + +/** + * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + */ +struct tipc_tfm { + struct crypto_aead *tfm; + struct list_head list; +}; + +/** + * struct tipc_aead - TIPC AEAD key structure + * @tfm_entry: per-cpu pointer to one entry in TFM list + * @crypto: TIPC crypto owns this key + * @cloned: reference to the source key in case cloning + * @users: the number of the key users (TX/RX) + * @salt: the key's SALT value + * @authsize: authentication tag size (max = 16) + * @mode: crypto mode is applied to the key + * @hint[]: a hint for user key + * @rcu: struct rcu_head + * @seqno: the key seqno (cluster scope) + * @refcnt: the key reference counter + */ +struct tipc_aead { +#define TIPC_AEAD_HINT_LEN (5) + struct tipc_tfm * __percpu *tfm_entry; + struct tipc_crypto *crypto; + struct tipc_aead *cloned; + atomic_t users; + u32 salt; + u8 authsize; + u8 mode; + char hint[TIPC_AEAD_HINT_LEN + 1]; + struct rcu_head rcu; + + atomic64_t seqno ____cacheline_aligned; + refcount_t refcnt ____cacheline_aligned; + +} ____cacheline_aligned; + +/** + * struct tipc_crypto_stats - TIPC Crypto statistics + */ +struct tipc_crypto_stats { + unsigned int stat[MAX_STATS]; +}; + +/** + * struct tipc_crypto - TIPC TX/RX crypto structure + * @net: struct net + * @node: TIPC node (RX) + * @aead: array of pointers to AEAD keys for encryption/decryption + * @peer_rx_active: replicated peer RX active key index + * @key: the key states + * @working: the crypto is working or not + * @stats: the crypto statistics + * @sndnxt: the per-peer sndnxt (TX) + * @timer1: general timer 1 (jiffies) + * @timer2: general timer 1 (jiffies) + * @lock: tipc_key lock + */ +struct tipc_crypto { + struct net *net; + struct tipc_node *node; + struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + atomic_t peer_rx_active; + struct tipc_key key; + u8 working:1; + struct tipc_crypto_stats __percpu *stats; + + atomic64_t sndnxt ____cacheline_aligned; + unsigned long timer1; + unsigned long timer2; + spinlock_t lock; /* crypto lock */ + +} ____cacheline_aligned; + +/* struct tipc_crypto_tx_ctx - TX context for callbacks */ +struct tipc_crypto_tx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; + struct tipc_media_addr dst; +}; + +/* struct tipc_crypto_rx_ctx - RX context for callbacks */ +struct tipc_crypto_rx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; +}; + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); +static inline void tipc_aead_put(struct tipc_aead *aead); +static void tipc_aead_free(struct rcu_head *rp); +static int tipc_aead_users(struct tipc_aead __rcu *aead); +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode); +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg); +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode); +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err); +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b); +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err); +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx); +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending); +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos); +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb); +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr); +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err); +static void tipc_crypto_do_cmd(struct net *net, int cmd); +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf); +#endif + +#define key_next(cur) ((cur) % KEY_MAX + 1) + +#define tipc_aead_rcu_ptr(rcu_ptr, lock) \ + rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ + rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ +do { \ + typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ + lockdep_is_held(lock)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + tipc_aead_put(__tmp); \ +} while (0) + +#define tipc_crypto_key_detach(rcu_ptr, lock) \ + tipc_aead_rcu_replace((rcu_ptr), NULL, lock) + +/** + * tipc_aead_key_validate - Validate a AEAD user key + */ +int tipc_aead_key_validate(struct tipc_aead_key *ukey) +{ + int keylen; + + /* Check if algorithm exists */ + if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { + pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name); + return -ENODEV; + } + + /* Currently, we only support the "gcm(aes)" cipher algorithm */ + if (strcmp(ukey->alg_name, "gcm(aes)")) + return -ENOTSUPP; + + /* Check if key size is correct */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && + keylen != TIPC_AES_GCM_KEY_SIZE_192 && + keylen != TIPC_AES_GCM_KEY_SIZE_256)) + return -EINVAL; + + return 0; +} + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) + tmp = NULL; + rcu_read_unlock(); + + return tmp; +} + +static inline void tipc_aead_put(struct tipc_aead *aead) +{ + if (aead && refcount_dec_and_test(&aead->refcnt)) + call_rcu(&aead->rcu, tipc_aead_free); +} + +/** + * tipc_aead_free - Release AEAD key incl. all the TFMs in the list + * @rp: rcu head pointer + */ +static void tipc_aead_free(struct rcu_head *rp) +{ + struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); + struct tipc_tfm *tfm_entry, *head, *tmp; + + if (aead->cloned) { + tipc_aead_put(aead->cloned); + } else { + head = *this_cpu_ptr(aead->tfm_entry); + list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { + crypto_free_aead(tfm_entry->tfm); + list_del(&tfm_entry->list); + kfree(tfm_entry); + } + /* Free the head */ + crypto_free_aead(head->tfm); + list_del(&head->list); + kfree(head); + } + free_percpu(aead->tfm_entry); + kfree(aead); +} + +static int tipc_aead_users(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + int users = 0; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + users = atomic_read(&tmp->users); + rcu_read_unlock(); + + return users; +} + +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&tmp->users, 1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) +{ + struct tipc_aead *tmp; + int cur; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) { + do { + cur = atomic_read(&tmp->users); + if (cur == val) + break; + } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); + } + rcu_read_unlock(); +} + +/** + * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + */ +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) +{ + struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry); + + *tfm_entry = list_next_entry(*tfm_entry, list); + return (*tfm_entry)->tfm; +} + +/** + * tipc_aead_init - Initiate TIPC AEAD + * @aead: returned new TIPC AEAD key handle pointer + * @ukey: pointer to user key data + * @mode: the key mode + * + * Allocate a (list of) new cipher transformation (TFM) with the specific user + * key data if valid. The number of the allocated TFMs can be set via the sysfs + * "net/tipc/max_tfms" first. + * Also, all the other AEAD data are also initialized. + * + * Return: 0 if the initiation is successful, otherwise: < 0 + */ +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_tfm *tfm_entry, *head; + struct crypto_aead *tfm; + struct tipc_aead *tmp; + int keylen, err, cpu; + int tfm_cnt = 0; + + if (unlikely(*aead)) + return -EEXIST; + + /* Allocate a new AEAD */ + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (unlikely(!tmp)) + return -ENOMEM; + + /* The key consists of two parts: [AES-KEY][SALT] */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + + /* Allocate per-cpu TFM entry pointer */ + tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); + if (!tmp->tfm_entry) { + kzfree(tmp); + return -ENOMEM; + } + + /* Make a list of TFMs with the user key data */ + do { + tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + break; + } + + if (unlikely(!tfm_cnt && + crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { + crypto_free_aead(tfm); + err = -ENOTSUPP; + break; + } + + err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); + err |= crypto_aead_setkey(tfm, ukey->key, keylen); + if (unlikely(err)) { + crypto_free_aead(tfm); + break; + } + + tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); + if (unlikely(!tfm_entry)) { + crypto_free_aead(tfm); + err = -ENOMEM; + break; + } + INIT_LIST_HEAD(&tfm_entry->list); + tfm_entry->tfm = tfm; + + /* First entry? */ + if (!tfm_cnt) { + head = tfm_entry; + for_each_possible_cpu(cpu) { + *per_cpu_ptr(tmp->tfm_entry, cpu) = head; + } + } else { + list_add_tail(&tfm_entry->list, &head->list); + } + + } while (++tfm_cnt < sysctl_tipc_max_tfms); + + /* Not any TFM is allocated? */ + if (!tfm_cnt) { + free_percpu(tmp->tfm_entry); + kzfree(tmp); + return err; + } + + /* Copy some chars from the user key as a hint */ + memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN); + tmp->hint[TIPC_AEAD_HINT_LEN] = '\0'; + + /* Initialize the other data */ + tmp->mode = mode; + tmp->cloned = NULL; + tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); + atomic_set(&tmp->users, 0); + atomic64_set(&tmp->seqno, 0); + refcount_set(&tmp->refcnt, 1); + + *aead = tmp; + return 0; +} + +/** + * tipc_aead_clone - Clone a TIPC AEAD key + * @dst: dest key for the cloning + * @src: source key to clone from + * + * Make a "copy" of the source AEAD key data to the dest, the TFMs list is + * common for the keys. + * A reference to the source is hold in the "cloned" pointer for the later + * freeing purposes. + * + * Note: this must be done in cluster-key mode only! + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) +{ + struct tipc_aead *aead; + int cpu; + + if (!src) + return -ENOKEY; + + if (src->mode != CLUSTER_KEY) + return -EINVAL; + + if (unlikely(*dst)) + return -EEXIST; + + aead = kzalloc(sizeof(*aead), GFP_ATOMIC); + if (unlikely(!aead)) + return -ENOMEM; + + aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); + if (unlikely(!aead->tfm_entry)) { + kzfree(aead); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + *per_cpu_ptr(aead->tfm_entry, cpu) = + *per_cpu_ptr(src->tfm_entry, cpu); + } + + memcpy(aead->hint, src->hint, sizeof(src->hint)); + aead->mode = src->mode; + aead->salt = src->salt; + aead->authsize = src->authsize; + atomic_set(&aead->users, 0); + atomic64_set(&aead->seqno, 0); + refcount_set(&aead->refcnt, 1); + + WARN_ON(!refcount_inc_not_zero(&src->refcnt)); + aead->cloned = src; + + *dst = aead; + return 0; +} + +/** + * tipc_aead_mem_alloc - Allocate memory for AEAD request operations + * @tfm: cipher handle to be registered with the request + * @crypto_ctx_size: size of crypto context for callback + * @iv: returned pointer to IV data + * @req: returned pointer to AEAD request data + * @sg: returned pointer to SG lists + * @nsg: number of SG lists to be allocated + * + * Allocate memory to store the crypto context data, AEAD request, IV and SG + * lists, the memory layout is as follows: + * crypto_ctx || iv || aead_req || sg[] + * + * Return: the pointer to the memory areas in case of success, otherwise NULL + */ +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg) +{ + unsigned int iv_size, req_size; + unsigned int len; + u8 *mem; + + iv_size = crypto_aead_ivsize(tfm); + req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + + len = crypto_ctx_size; + len += iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += nsg * sizeof(**sg); + + mem = kmalloc(len, GFP_ATOMIC); + if (!mem) + return NULL; + + *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, + crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + + return (void *)mem; +} + +/** + * tipc_aead_encrypt - Encrypt a message + * @aead: TIPC AEAD key for the message encryption + * @skb: the input/output skb + * @b: TIPC bearer where the message will be delivered after the encryption + * @dst: the destination media address + * @__dnode: TIPC dest node if "known" + * + * Return: + * 0 : if the encryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the encryption has failed + */ +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct crypto_aead *tfm = tipc_aead_tfm_next(aead); + struct tipc_crypto_tx_ctx *tx_ctx; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, len, tailen, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + /* Make sure message len at least 4-byte aligned */ + len = ALIGN(skb->len, 4); + tailen = len - skb->len + aead->authsize; + + /* Expand skb tail for authentication tag: + * As for simplicity, we'd have made sure skb having enough tailroom + * for authentication tag @skb allocation. Even when skb is nonlinear + * but there is no frag_list, it should be still fine! + * Otherwise, we must cow it to be a writable buffer with the tailroom. + */ +#ifdef TIPC_CRYPTO_DEBUG + SKB_LINEAR_ASSERT(skb); + if (tailen > skb_tailroom(skb)) { + pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n", + skb_tailroom(skb), tailen); + } +#endif + + if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) { + nsg = 1; + trailer = skb; + } else { + /* TODO: We could avoid skb_cow_data() if skb has no frag_list + * e.g. by skb_fill_page_desc() to add another page to the skb + * with the wanted tailen... However, page skbs look not often, + * so take it easy now! + * Cloned skbs e.g. from link_xmit() seems no choice though :( + */ + nsg = skb_cow_data(skb, tailen, &trailer); + if (unlikely(nsg < 0)) { + pr_err("TX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + pskb_put(skb, trailer, tailen); + + /* Allocate memory for the AEAD operation */ + ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); + goto exit; + } + + /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] + * In case we're in cluster-key mode, SALT is varied by xor-ing with + * the source address (or w0 of id), otherwise with the dest address + * if dest is known. + */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (__dnode) + salt ^= tipc_node_get_addr(__dnode); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_encrypt_done, skb); + tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; + tx_ctx->aead = aead; + tx_ctx->bearer = b; + memcpy(&tx_ctx->dst, dst, sizeof(*dst)); + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do encrypt */ + rc = crypto_aead_encrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = tx_ctx->bearer; + struct tipc_aead *aead = tx_ctx->aead; + struct tipc_crypto *tx = aead->crypto; + struct net *net = tx->net; + + switch (err) { + case 0: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); + if (likely(test_bit(0, &b->up))) + b->media->send_msg(net, skb, b, &tx_ctx->dst); + else + kfree_skb(skb); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); + kfree_skb(skb); + break; + } + + kfree(tx_ctx); + tipc_bearer_put(b); + tipc_aead_put(aead); +} + +/** + * tipc_aead_decrypt - Decrypt an encrypted message + * @net: struct net + * @aead: TIPC AEAD for the message decryption + * @skb: the input/output skb + * @b: TIPC bearer where the message has been received + * + * Return: + * 0 : if the decryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the decryption has failed + */ +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b) +{ + struct tipc_crypto_rx_ctx *rx_ctx; + struct aead_request *req; + struct crypto_aead *tfm; + struct sk_buff *unused; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + if (unlikely(!aead)) + return -ENOKEY; + + /* Cow skb data if needed */ + if (likely(!skb_cloned(skb) && + (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { + nsg = 1 + skb_shinfo(skb)->nr_frags; + } else { + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + /* Allocate memory for the AEAD operation */ + tfm = tipc_aead_tfm_next(aead); + ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); + goto exit; + } + + /* Reconstruct IV: */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (ehdr->destined) + salt ^= tipc_own_addr(net); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_decrypt_done, skb); + rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; + rx_ctx->aead = aead; + rx_ctx->bearer = b; + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do decrypt */ + rc = crypto_aead_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = rx_ctx->bearer; + struct tipc_aead *aead = rx_ctx->aead; + struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; + struct net *net = aead->crypto->net; + + switch (err) { + case 0: + this_cpu_inc(stats->stat[STAT_ASYNC_OK]); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); + break; + } + + kfree(rx_ctx); + tipc_crypto_rcv_complete(net, aead, b, &skb, err); + if (likely(skb)) { + if (likely(test_bit(0, &b->up))) + tipc_rcv(net, skb, b); + else + kfree_skb(skb); + } + + tipc_bearer_put(b); +} + +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) +{ + return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; +} + +/** + * tipc_ehdr_validate - Validate an encryption message + * @skb: the message buffer + * + * Returns "true" if this is a valid encryption message, otherwise "false" + */ +bool tipc_ehdr_validate(struct sk_buff *skb) +{ + struct tipc_ehdr *ehdr; + int ehsz; + + if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) + return false; + + ehdr = (struct tipc_ehdr *)skb->data; + if (unlikely(ehdr->version != TIPC_EVERSION)) + return false; + ehsz = tipc_ehdr_size(ehdr); + if (unlikely(!pskb_may_pull(skb, ehsz))) + return false; + if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) + return false; + if (unlikely(!ehdr->tx_key)) + return false; + + return true; +} + +/** + * tipc_ehdr_build - Build TIPC encryption message header + * @net: struct net + * @aead: TX AEAD key to be used for the message encryption + * @tx_key: key id used for the message encryption + * @skb: input/output message skb + * @__rx: RX crypto handle if dest is "known" + * + * Return: the header size if the building is successful, otherwise < 0 + */ +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_ehdr *ehdr; + u32 user = msg_user(hdr); + u64 seqno; + int ehsz; + + /* Make room for encryption header */ + ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; + WARN_ON(skb_headroom(skb) < ehsz); + ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); + + /* Obtain a seqno first: + * Use the key seqno (= cluster wise) if dest is unknown or we're in + * cluster key mode, otherwise it's better for a per-peer seqno! + */ + if (!__rx || aead->mode == CLUSTER_KEY) + seqno = atomic64_inc_return(&aead->seqno); + else + seqno = atomic64_inc_return(&__rx->sndnxt); + + /* Revoke the key if seqno is wrapped around */ + if (unlikely(!seqno)) + return tipc_crypto_key_revoke(net, tx_key); + + /* Word 1-2 */ + ehdr->seqno = cpu_to_be64(seqno); + + /* Words 0, 3- */ + ehdr->version = TIPC_EVERSION; + ehdr->user = 0; + ehdr->keepalive = 0; + ehdr->tx_key = tx_key; + ehdr->destined = (__rx) ? 1 : 0; + ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->reserved_1 = 0; + ehdr->reserved_2 = 0; + + switch (user) { + case LINK_CONFIG: + ehdr->user = LINK_CONFIG; + memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); + break; + default: + if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { + ehdr->user = LINK_PROTOCOL; + ehdr->keepalive = msg_is_keepalive(hdr); + } + ehdr->addr = hdr->hdr[3]; + break; + } + + return ehsz; +} + +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending) +{ +#ifdef TIPC_CRYPTO_DEBUG + struct tipc_key old = c->key; + char buf[32]; +#endif + + c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | + ((new_active & KEY_MASK) << (KEY_BITS)) | + ((new_pending & KEY_MASK)); + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("%s(%s): key changing %s ::%pS\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + tipc_key_change_dump(old, c->key, buf), + __builtin_return_address(0)); +#endif +} + +/** + * tipc_crypto_key_init - Initiate a new user / AEAD key + * @c: TIPC crypto to which new key is attached + * @ukey: the user key + * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * + * A new TIPC AEAD key will be allocated and initiated with the specified user + * key, then attached to the TIPC crypto. + * + * Return: new key id in case of success, otherwise: < 0 + */ +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_aead *aead = NULL; + int rc = 0; + + /* Initiate with the new user key */ + rc = tipc_aead_init(&aead, ukey, mode); + + /* Attach it to the crypto */ + if (likely(!rc)) { + rc = tipc_crypto_key_attach(c, aead, 0); + if (rc < 0) + tipc_aead_free(&aead->rcu); + } + + pr_info("%s(%s): key initiating, rc %d!\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + rc); + + return rc; +} + +/** + * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto + * @c: TIPC crypto to which the new AEAD key is attached + * @aead: the new AEAD key pointer + * @pos: desired slot in the crypto key array, = 0 if any! + * + * Return: new key id in case of success, otherwise: -EBUSY + */ +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos) +{ + u8 new_pending, new_passive, new_key; + struct tipc_key key; + int rc = -EBUSY; + + spin_lock_bh(&c->lock); + key = c->key; + if (key.active && key.passive) + goto exit; + if (key.passive && !tipc_aead_users(c->aead[key.passive])) + goto exit; + if (key.pending) { + if (pos) + goto exit; + if (tipc_aead_users(c->aead[key.pending]) > 0) + goto exit; + /* Replace it */ + new_pending = key.pending; + new_passive = key.passive; + new_key = new_pending; + } else { + if (pos) { + if (key.active && pos != key_next(key.active)) { + new_pending = key.pending; + new_passive = pos; + new_key = new_passive; + goto attach; + } else if (!key.active && !key.passive) { + new_pending = pos; + new_passive = key.passive; + new_key = new_pending; + goto attach; + } + } + new_pending = key_next(key.active ?: key.passive); + new_passive = key.passive; + new_key = new_pending; + } + +attach: + aead->crypto = c; + tipc_crypto_key_set_state(c, new_passive, key.active, new_pending); + tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); + + c->working = 1; + c->timer1 = jiffies; + c->timer2 = jiffies; + rc = new_key; + +exit: + spin_unlock_bh(&c->lock); + return rc; +} + +void tipc_crypto_key_flush(struct tipc_crypto *c) +{ + int k; + + spin_lock_bh(&c->lock); + c->working = 0; + tipc_crypto_key_set_state(c, 0, 0, 0); + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_crypto_key_detach(c->aead[k], &c->lock); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + spin_unlock_bh(&c->lock); +} + +/** + * tipc_crypto_key_try_align - Align RX keys if possible + * @rx: RX crypto handle + * @new_pending: new pending slot if aligned (= TX key from peer) + * + * Peer has used an unknown key slot, this only happens when peer has left and + * rejoned, or we are newcomer. + * That means, there must be no active key but a pending key at unaligned slot. + * If so, we try to move the pending key to the new slot. + * Note: A potential passive key can exist, it will be shifted correspondingly! + * + * Return: "true" if key is successfully aligned, otherwise "false" + */ +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) +{ + struct tipc_aead *tmp1, *tmp2 = NULL; + struct tipc_key key; + bool aligned = false; + u8 new_passive = 0; + int x; + + spin_lock(&rx->lock); + key = rx->key; + if (key.pending == new_pending) { + aligned = true; + goto exit; + } + if (key.active) + goto exit; + if (!key.pending) + goto exit; + if (tipc_aead_users(rx->aead[key.pending]) > 0) + goto exit; + + /* Try to "isolate" this pending key first */ + tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); + if (!refcount_dec_if_one(&tmp1->refcnt)) + goto exit; + rcu_assign_pointer(rx->aead[key.pending], NULL); + + /* Move passive key if any */ + if (key.passive) { + tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); + x = (key.passive - key.pending + new_pending) % KEY_MAX; + new_passive = (x <= 0) ? x + KEY_MAX : x; + } + + /* Re-allocate the key(s) */ + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + rcu_assign_pointer(rx->aead[new_pending], tmp1); + if (new_passive) + rcu_assign_pointer(rx->aead[new_passive], tmp2); + refcount_set(&tmp1->refcnt, 1); + aligned = true; + pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node)); + +exit: + spin_unlock(&rx->lock); + return aligned; +} + +/** + * tipc_crypto_key_pick_tx - Pick one TX key for message decryption + * @tx: TX crypto handle + * @rx: RX crypto handle (can be NULL) + * @skb: the message skb which will be decrypted later + * + * This function looks up the existing TX keys and pick one which is suitable + * for the message decryption, that must be a cluster key and not used before + * on the same message (i.e. recursive). + * + * Return: the TX AEAD key handle in case of success, otherwise NULL + */ +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); + struct tipc_aead *aead = NULL; + struct tipc_key key = tx->key; + u8 k, i = 0; + + /* Initialize data if not yet */ + if (!skb_cb->tx_clone_deferred) { + skb_cb->tx_clone_deferred = 1; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + } + + skb_cb->tx_clone_ctx.rx = rx; + if (++skb_cb->tx_clone_ctx.recurs > 2) + return NULL; + + /* Pick one TX key */ + spin_lock(&tx->lock); + do { + k = (i == 0) ? key.pending : + ((i == 1) ? key.active : key.passive); + if (!k) + continue; + aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); + if (!aead) + continue; + if (aead->mode != CLUSTER_KEY || + aead == skb_cb->tx_clone_ctx.last) { + aead = NULL; + continue; + } + /* Ok, found one cluster key */ + skb_cb->tx_clone_ctx.last = aead; + WARN_ON(skb->next); + skb->next = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb->next)) + pr_warn("Failed to clone skb for next round if any\n"); + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); + break; + } while (++i < 3); + spin_unlock(&tx->lock); + + return aead; +} + +/** + * tipc_crypto_key_synch: Synch own key data according to peer key status + * @rx: RX crypto handle + * @new_rx_active: latest RX active key from peer + * @hdr: TIPCv2 message + * + * This function updates the peer node related data as the peer RX active key + * has changed, so the number of TX keys' users on this node are increased and + * decreased correspondingly. + * + * The "per-peer" sndnxt is also reset when the peer key has switched. + */ +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr) +{ + struct net *net = rx->net; + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + u8 cur_rx_active; + + /* TX might be even not ready yet */ + if (unlikely(!tx->key.active && !tx->key.pending)) + return; + + cur_rx_active = atomic_read(&rx->peer_rx_active); + if (likely(cur_rx_active == new_rx_active)) + return; + + /* Make sure this message destined for this node */ + if (unlikely(msg_short(hdr) || + msg_destnode(hdr) != tipc_own_addr(net))) + return; + + /* Peer RX active key has changed, try to update owns' & TX users */ + if (atomic_cmpxchg(&rx->peer_rx_active, + cur_rx_active, + new_rx_active) == cur_rx_active) { + if (new_rx_active) + tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX); + if (cur_rx_active) + tipc_aead_users_dec(tx->aead[cur_rx_active], 0); + + atomic64_set(&rx->sndnxt, 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n", + tipc_own_id_string(net), cur_rx_active, + new_rx_active, tipc_node_get_id_str(rx->node)); +#endif + } +} + +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_key key; + + spin_lock(&tx->lock); + key = tx->key; + WARN_ON(!key.active || tx_key != key.active); + + /* Free the active key */ + tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + spin_unlock(&tx->lock); + + pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net)); + return -EKEYREVOKED; +} + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node) +{ + struct tipc_crypto *c; + + if (*crypto) + return -EEXIST; + + /* Allocate crypto */ + c = kzalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return -ENOMEM; + + /* Allocate statistic structure */ + c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); + if (!c->stats) { + kzfree(c); + return -ENOMEM; + } + + c->working = 0; + c->net = net; + c->node = node; + tipc_crypto_key_set_state(c, 0, 0, 0); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + c->timer1 = jiffies; + c->timer2 = jiffies; + spin_lock_init(&c->lock); + *crypto = c; + + return 0; +} + +void tipc_crypto_stop(struct tipc_crypto **crypto) +{ + struct tipc_crypto *c, *tx, *rx; + bool is_rx; + u8 k; + + if (!*crypto) + return; + + rcu_read_lock(); + /* RX stopping? => decrease TX key users if any */ + is_rx = !!((*crypto)->node); + if (is_rx) { + rx = *crypto; + tx = tipc_net(rx->net)->crypto_tx; + k = atomic_read(&rx->peer_rx_active); + if (k) { + tipc_aead_users_dec(tx->aead[k], 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + } + } + + /* Release AEAD keys */ + c = *crypto; + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_aead_put(rcu_dereference(c->aead[k])); + rcu_read_unlock(); + + pr_warn("%s(%s) has been purged, node left!\n", + (is_rx) ? "RX" : "TX", + (is_rx) ? tipc_node_get_id_str((*crypto)->node) : + tipc_own_id_string((*crypto)->net)); + + /* Free this crypto statistics */ + free_percpu(c->stats); + + *crypto = NULL; + kzfree(c); +} + +void tipc_crypto_timeout(struct tipc_crypto *rx) +{ + struct tipc_net *tn = tipc_net(rx->net); + struct tipc_crypto *tx = tn->crypto_tx; + struct tipc_key key; + u8 new_pending, new_passive; + int cmd; + + /* TX key activating: + * The pending key (users > 0) -> active + * The active key if any (users == 0) -> free + */ + spin_lock(&tx->lock); + key = tx->key; + if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) + goto s1; + if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) + goto s1; + if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM)) + goto s1; + + tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); + if (key.active) + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); + pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net), + key.pending); + +s1: + spin_unlock(&tx->lock); + + /* RX key activating: + * The pending key (users > 0) -> active + * The active key if any -> passive, freed later + */ + spin_lock(&rx->lock); + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) + goto s2; + + new_pending = (key.passive && + !tipc_aead_users(rx->aead[key.passive])) ? + key.passive : 0; + new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive); + tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending); + this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); + pr_info("RX(%s): key %d is activated!\n", + tipc_node_get_id_str(rx->node), key.pending); + goto s5; + +s2: + /* RX key "faulty" switching: + * The faulty pending key (users < -30) -> passive + * The passive key (users = 0) -> pending + * Note: This only happens after RX deactivated - s3! + */ + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30) + goto s3; + if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0) + goto s3; + + new_pending = key.passive; + new_passive = key.pending; + tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending); + goto s5; + +s3: + /* RX key deactivating: + * The passive key if any -> pending + * The active key -> passive (users = 0) / pending + * The pending key if any -> passive (users = 0) + */ + key = rx->key; + if (!key.active) + goto s4; + if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM)) + goto s4; + + new_pending = (key.passive) ?: key.active; + new_passive = (key.passive) ? key.active : key.pending; + tipc_aead_users_set(rx->aead[new_pending], 0); + if (new_passive) + tipc_aead_users_set(rx->aead[new_passive], 0); + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + pr_info("RX(%s): key %d is deactivated!\n", + tipc_node_get_id_str(rx->node), key.active); + goto s5; + +s4: + /* RX key passive -> freed: */ + key = rx->key; + if (!key.passive || !tipc_aead_users(rx->aead[key.passive])) + goto s5; + if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM)) + goto s5; + + tipc_crypto_key_set_state(rx, 0, key.active, key.pending); + tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); + pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node), + key.passive); + +s5: + spin_unlock(&rx->lock); + + /* Limit max_tfms & do debug commands if needed */ + if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) + return; + + cmd = sysctl_tipc_max_tfms; + sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; + tipc_crypto_do_cmd(rx->net, cmd); +} + +/** + * tipc_crypto_xmit - Build & encrypt TIPC message for xmit + * @net: struct net + * @skb: input/output message skb pointer + * @b: bearer used for xmit later + * @dst: destination media address + * @__dnode: destination node for reference if any + * + * First, build an encryption message header on the top of the message, then + * encrypt the original TIPC message by using the active or pending TX key. + * If the encryption is successful, the encrypted skb is returned directly or + * via the callback. + * Otherwise, the skb is freed! + * + * Return: + * 0 : the encryption has succeeded (or no encryption) + * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * -ENOKEK : the encryption has failed due to no key + * -EKEYREVOKED : the encryption has failed due to key revoked + * -ENOMEM : the encryption has failed due to no memory + * < 0 : the encryption has failed due to other reasons + */ +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats = tx->stats; + struct tipc_key key = tx->key; + struct tipc_aead *aead = NULL; + struct sk_buff *probe; + int rc = -ENOKEY; + u8 tx_key; + + /* No encryption? */ + if (!tx->working) + return 0; + + /* Try with the pending key if available and: + * 1) This is the only choice (i.e. no active key) or; + * 2) Peer has switched to this key (unicast only) or; + * 3) It is time to do a pending key probe; + */ + if (unlikely(key.pending)) { + tx_key = key.pending; + if (!key.active) + goto encrypt; + if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) + goto encrypt; + if (TIPC_SKB_CB(*skb)->probe) + goto encrypt; + if (!__rx && + time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) { + tx->timer2 = jiffies; + probe = skb_clone(*skb, GFP_ATOMIC); + if (probe) { + TIPC_SKB_CB(probe)->probe = 1; + tipc_crypto_xmit(net, &probe, b, dst, __dnode); + if (probe) + b->media->send_msg(net, probe, b, dst); + } + } + } + /* Else, use the active key if any */ + if (likely(key.active)) { + tx_key = key.active; + goto encrypt; + } + goto exit; + +encrypt: + aead = tipc_aead_get(tx->aead[tx_key]); + if (unlikely(!aead)) + goto exit; + rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); + if (likely(rc > 0)) + rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); + +exit: + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) + this_cpu_inc(stats->stat[STAT_NOKEYS]); + else if (rc == -EKEYREVOKED) + this_cpu_inc(stats->stat[STAT_BADKEYS]); + kfree_skb(*skb); + *skb = NULL; + break; + } + + tipc_aead_put(aead); + return rc; +} + +/** + * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer + * @net: struct net + * @rx: RX crypto handle + * @skb: input/output message skb pointer + * @b: bearer where the message has been received + * + * If the decryption is successful, the decrypted skb is returned directly or + * as the callback, the encryption header and auth tag will be trimed out + * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). + * Otherwise, the skb will be freed! + * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX + * cluster key(s) can be taken for decryption (- recursive). + * + * Return: + * 0 : the decryption has successfully completed + * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * -ENOKEY : the decryption has failed due to no key + * -EBADMSG : the decryption has failed due to bad message + * -ENOMEM : the decryption has failed due to no memory + * < 0 : the decryption has failed due to other reasons + */ +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats; + struct tipc_aead *aead = NULL; + struct tipc_key key; + int rc = -ENOKEY; + u8 tx_key = 0; + + /* New peer? + * Let's try with TX key (i.e. cluster mode) & verify the skb first! + */ + if (unlikely(!rx)) + goto pick_tx; + + /* Pick RX key according to TX key, three cases are possible: + * 1) The current active key (likely) or; + * 2) The pending (new or deactivated) key (if any) or; + * 3) The passive or old active key (i.e. users > 0); + */ + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; + key = rx->key; + if (likely(tx_key == key.active)) + goto decrypt; + if (tx_key == key.pending) + goto decrypt; + if (tx_key == key.passive) { + rx->timer2 = jiffies; + if (tipc_aead_users(rx->aead[key.passive]) > 0) + goto decrypt; + } + + /* Unknown key, let's try to align RX key(s) */ + if (tipc_crypto_key_try_align(rx, tx_key)) + goto decrypt; + +pick_tx: + /* No key suitable? Try to pick one from TX... */ + aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + if (aead) + goto decrypt; + goto exit; + +decrypt: + rcu_read_lock(); + if (!aead) + aead = tipc_aead_get(rx->aead[tx_key]); + rc = tipc_aead_decrypt(net, aead, *skb, b); + rcu_read_unlock(); + +exit: + stats = ((rx) ?: tx)->stats; + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) { + kfree_skb(*skb); + *skb = NULL; + if (rx) + tipc_node_put(rx->node); + this_cpu_inc(stats->stat[STAT_NOKEYS]); + return rc; + } else if (rc == -EBADMSG) { + this_cpu_inc(stats->stat[STAT_BADMSGS]); + } + break; + } + + tipc_crypto_rcv_complete(net, aead, b, skb, rc); + return rc; +} + +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); + struct tipc_crypto *rx = aead->crypto; + struct tipc_aead *tmp = NULL; + struct tipc_ehdr *ehdr; + struct tipc_node *n; + u8 rx_key_active; + bool destined; + + /* Is this completed by TX? */ + if (unlikely(!rx->node)) { + rx = skb_cb->tx_clone_ctx.rx; +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", + (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, + (*skb)->next, skb_cb->flags); + pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", + skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, + aead->crypto->aead[1], aead->crypto->aead[2], + aead->crypto->aead[3]); +#endif + if (unlikely(err)) { + if (err == -EBADMSG && (*skb)->next) + tipc_rcv(net, (*skb)->next, b); + goto free_skb; + } + + if (likely((*skb)->next)) { + kfree_skb((*skb)->next); + (*skb)->next = NULL; + } + ehdr = (struct tipc_ehdr *)(*skb)->data; + if (!rx) { + WARN_ON(ehdr->user != LINK_CONFIG); + n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, + true); + rx = tipc_node_crypto_rx(n); + if (unlikely(!rx)) + goto free_skb; + } + + /* Skip cloning this time as we had a RX pending key */ + if (rx->key.pending) + goto rcv; + if (tipc_aead_clone(&tmp, aead) < 0) + goto rcv; + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + tipc_aead_free(&tmp->rcu); + goto rcv; + } + tipc_aead_put(aead); + aead = tipc_aead_get(tmp); + } + + if (unlikely(err)) { + tipc_aead_users_dec(aead, INT_MIN); + goto free_skb; + } + + /* Set the RX key's user */ + tipc_aead_users_set(aead, 1); + +rcv: + /* Mark this point, RX works */ + rx->timer1 = jiffies; + + /* Remove ehdr & auth. tag prior to tipc_rcv() */ + ehdr = (struct tipc_ehdr *)(*skb)->data; + destined = ehdr->destined; + rx_key_active = ehdr->rx_key_active; + skb_pull(*skb, tipc_ehdr_size(ehdr)); + pskb_trim(*skb, (*skb)->len - aead->authsize); + + /* Validate TIPCv2 message */ + if (unlikely(!tipc_msg_validate(skb))) { + pr_err_ratelimited("Packet dropped after decryption!\n"); + goto free_skb; + } + + /* Update peer RX active key & TX users */ + if (destined) + tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb)); + + /* Mark skb decrypted */ + skb_cb->decrypted = 1; + + /* Clear clone cxt if any */ + if (likely(!skb_cb->tx_clone_deferred)) + goto exit; + skb_cb->tx_clone_deferred = 0; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + goto exit; + +free_skb: + kfree_skb(*skb); + *skb = NULL; + +exit: + tipc_aead_put(aead); + if (rx) + tipc_node_put(rx->node); +} + +static void tipc_crypto_do_cmd(struct net *net, int cmd) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_crypto *tx = tn->crypto_tx, *rx; + struct list_head *p; + unsigned int stat; + int i, j, cpu; + char buf[200]; + + /* Currently only one command is supported */ + switch (cmd) { + case 0xfff1: + goto print_stats; + default: + return; + } + +print_stats: + /* Print a header */ + pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); + + /* Print key status */ + pr_info("Key status:\n"); + pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), + tipc_crypto_key_dump(tx, buf)); + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), + tipc_crypto_key_dump(rx, buf)); + } + rcu_read_unlock(); + + /* Print crypto statistics */ + for (i = 0, j = 0; i < MAX_STATS; i++) + j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); + pr_info("\nCounter %s", buf); + + memset(buf, '-', 115); + buf[115] = '\0'; + pr_info("%s\n", buf); + + j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + j = scnprintf(buf, 200, "RX(%7.7s) ", + tipc_node_get_id_str(rx->node)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", + stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + } + rcu_read_unlock(); + + pr_info("\n======================== Done ========================\n"); +} + +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) +{ + struct tipc_key key = c->key; + struct tipc_aead *aead; + int k, i = 0; + char *s; + + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); + + rcu_read_lock(); + aead = rcu_dereference(c->aead[k]); + if (aead) + i += scnprintf(buf + i, 200 - i, + "{\"%s...\", \"%s\"}/%d:%d", + aead->hint, + (aead->mode == CLUSTER_KEY) ? "c" : "p", + atomic_read(&aead->users), + refcount_read(&aead->refcnt)); + rcu_read_unlock(); + i += scnprintf(buf + i, 200 - i, "\n"); + } + + if (c->node) + i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", + atomic_read(&c->peer_rx_active)); + + return buf; +} + +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf) +{ + struct tipc_key *key = &old; + int k, i = 0; + char *s; + + /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ +again: + i += scnprintf(buf + i, 32 - i, "["); + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key->passive) + s = "pas"; + else if (k == key->active) + s = "act"; + else if (k == key->pending) + s = "pen"; + else + s = "-"; + i += scnprintf(buf + i, 32 - i, + (k != KEY_MAX) ? "%s " : "%s", s); + } + if (key != &new) { + i += scnprintf(buf + i, 32 - i, "] -> "); + key = &new; + goto again; + } + i += scnprintf(buf + i, 32 - i, "]"); + return buf; +} +#endif diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h new file mode 100644 index 000000000000..c3de769f49e8 --- /dev/null +++ b/net/tipc/crypto.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * net/tipc/crypto.h: Include file for TIPC crypto + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifdef CONFIG_TIPC_CRYPTO +#ifndef _TIPC_CRYPTO_H +#define _TIPC_CRYPTO_H + +#include "core.h" +#include "node.h" +#include "msg.h" +#include "bearer.h" + +#define TIPC_EVERSION 7 + +/* AEAD aes(gcm) */ +#define TIPC_AES_GCM_KEY_SIZE_128 16 +#define TIPC_AES_GCM_KEY_SIZE_192 24 +#define TIPC_AES_GCM_KEY_SIZE_256 32 + +#define TIPC_AES_GCM_SALT_SIZE 4 +#define TIPC_AES_GCM_IV_SIZE 12 +#define TIPC_AES_GCM_TAG_SIZE 16 + +/** + * TIPC crypto modes: + * - CLUSTER_KEY: + * One single key is used for both TX & RX in all nodes in the cluster. + * - PER_NODE_KEY: + * Each nodes in the cluster has one TX key, for RX a node needs to know + * its peers' TX key for the decryption of messages from those nodes. + */ +enum { + CLUSTER_KEY = 1, + PER_NODE_KEY = (1 << 1), +}; + +extern int sysctl_tipc_max_tfms __read_mostly; + +/** + * TIPC encryption message format: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 + * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w1:| Seqno | + * w2:| (8 octets) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w3:\ Prevnode \ + * / (4 or 16 octets) / + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Encrypted complete TIPC V2 header and user data / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * | AuthTag | + * | (16 octets) | + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Word0: + * Ver : = 7 i.e. TIPC encryption message version + * User : = 7 (for LINK_PROTOCOL); = 13 (for LINK_CONFIG) or = 0 + * D : The destined bit i.e. the message's destination node is + * "known" or not at the message encryption + * TX : TX key used for the message encryption + * RX : Currently RX active key corresponding to the destination + * node's TX key (when the "D" bit is set) + * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * Rsvd : Reserved bit, field + * Word1-2: + * Seqno : The 64-bit sequence number of the encrypted message, also + * part of the nonce used for the message encryption/decryption + * Word3-: + * Prevnode: The source node address, or ID in case LINK_CONFIG only + * AuthTag : The authentication tag for the message integrity checking + * generated by the message encryption + */ +struct tipc_ehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 destined:1, + user:4, + version:3; + __u8 reserved_1:3, + keepalive:1, + rx_key_active:2, + tx_key:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:3, + user:4, + destined:1; + __u8 tx_key:2, + rx_key_active:2, + keepalive:1, + reserved_1:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __be16 reserved_2; + } __packed; + __be32 w0; + }; + __be64 seqno; + union { + __be32 addr; + __u8 id[NODE_ID_LEN]; /* For a LINK_CONFIG message only! */ + }; +#define EHDR_SIZE (offsetof(struct tipc_ehdr, addr) + sizeof(__be32)) +#define EHDR_CFG_SIZE (sizeof(struct tipc_ehdr)) +#define EHDR_MIN_SIZE (EHDR_SIZE) +#define EHDR_MAX_SIZE (EHDR_CFG_SIZE) +#define EMSG_OVERHEAD (EHDR_SIZE + TIPC_AES_GCM_TAG_SIZE) +} __packed; + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node); +void tipc_crypto_stop(struct tipc_crypto **crypto); +void tipc_crypto_timeout(struct tipc_crypto *rx); +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode); +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b); +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode); +void tipc_crypto_key_flush(struct tipc_crypto *c); +int tipc_aead_key_validate(struct tipc_aead_key *ukey); +bool tipc_ehdr_validate(struct sk_buff *skb); + +#endif /* _TIPC_CRYPTO_H */ +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..b043e8c6397a 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -242,7 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, - &maddr, &respond, &dupl_addr); + msg_peer_net_hash(hdr), &maddr, &respond, + &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) diff --git a/net/tipc/link.c b/net/tipc/link.c index 999eab592de8..fb72031228c9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "monitor.h" #include "trace.h" +#include "crypto.h" #include <linux/pkt_sched.h> @@ -397,6 +398,15 @@ int tipc_link_mtu(struct tipc_link *l) return l->mtu; } +int tipc_link_mss(struct tipc_link *l) +{ +#ifdef CONFIG_TIPC_CRYPTO + return l->mtu - INT_H_SIZE - EMSG_OVERHEAD; +#else + return l->mtu - INT_H_SIZE; +#endif +} + u16 tipc_link_rcv_nxt(struct tipc_link *l) { return l->rcv_nxt; @@ -940,16 +950,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); - unsigned int maxwin = l->window; - int imp = msg_importance(hdr); - unsigned int mtu = l->mtu; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff *skb, *_skb; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - struct sk_buff_head *transmq = &l->transmq; - struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, **tskb; int pkt_cnt = skb_queue_len(list); + int imp = msg_importance(hdr); + unsigned int mss = tipc_link_mss(l); + unsigned int maxwin = l->window; + unsigned int mtu = l->mtu; + bool new_bundle; int rc = 0; if (unlikely(msg_size(hdr) > mtu)) { @@ -975,20 +987,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, } /* Prepare each packet for sending, and add to relevant queue: */ - while (skb_queue_len(list)) { - skb = skb_peek(list); - hdr = buf_msg(skb); - msg_set_seqno(hdr, seqno); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - + while ((skb = __skb_dequeue(list))) { if (likely(skb_queue_len(transmq) < maxwin)) { + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { + kfree_skb(skb); __skb_queue_purge(list); return -ENOBUFS; } - __skb_dequeue(list); __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) @@ -1000,22 +1010,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - tskb = &l->backlog[imp].target_bskb; - if (tipc_msg_bundle(*tskb, hdr, mtu)) { - kfree_skb(__skb_dequeue(list)); - l->stats.sent_bundled++; - continue; - } - if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) { - kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, *tskb); - l->backlog[imp].len++; - l->stats.sent_bundled++; - l->stats.sent_bundles++; + if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb, + mss, l->addr, &new_bundle)) { + if (skb) { + /* Keep a ref. to the skb for next try */ + l->backlog[imp].target_bskb = skb; + l->backlog[imp].len++; + __skb_queue_tail(backlogq, skb); + } else { + if (new_bundle) { + l->stats.sent_bundles++; + l->stats.sent_bundled++; + } + l->stats.sent_bundled++; + } continue; } l->backlog[imp].target_bskb = NULL; - l->backlog[imp].len += skb_queue_len(list); + l->backlog[imp].len += (1 + skb_queue_len(list)); + __skb_queue_tail(backlogq, skb); skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; @@ -1084,7 +1097,7 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, return false; if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + - msecs_to_jiffies(r->tolerance))) + msecs_to_jiffies(r->tolerance * 10))) return false; hdr = buf_msg(skb); @@ -1151,7 +1164,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1427,8 +1440,7 @@ next_gap_ack: if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, - GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); @@ -1728,21 +1740,6 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; __skb_queue_head_init(&tnlq); - __skb_queue_head_init(&tmpxq); - __skb_queue_head_init(&frags); - - /* At least one packet required for safe algorithm => add dummy */ - skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), - 0, 0, TIPC_ERR_NO_PORT); - if (!skb) { - pr_warn("%sunable to create tunnel packet\n", link_co_err); - return; - } - __skb_queue_tail(&tnlq, skb); - tipc_link_xmit(l, &tnlq, &tmpxq); - __skb_queue_purge(&tmpxq); - /* Link Synching: * From now on, send only one single ("dummy") SYNCH message * to peer. The SYNCH message does not contain any data, just @@ -1768,6 +1765,20 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; } + __skb_queue_head_init(&tmpxq); + __skb_queue_head_init(&frags); + /* At least one packet required for safe algorithm => add dummy */ + skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), + 0, 0, TIPC_ERR_NO_PORT); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, &tmpxq); + __skb_queue_purge(&tmpxq); + /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); @@ -1873,7 +1884,7 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, tipc_link_create_dummy_tnl_msg(tnl, xmitq); - /* This failover link enpoint was never established before, + /* This failover link endpoint was never established before, * so it has not received anything from peer. * Otherwise, it must be a normal failover situation or the * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes diff --git a/net/tipc/link.h b/net/tipc/link.h index adcad65e761c..c09e9d49d0a3 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -141,6 +141,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); +int tipc_link_mss(struct tipc_link *l); void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 6a6eae88442f..58708b4c7719 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -665,6 +665,21 @@ void tipc_mon_delete(struct net *net, int bearer_id) kfree(mon); } +void tipc_mon_reinit_self(struct net *net) +{ + struct tipc_monitor *mon; + int bearer_id; + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + mon = tipc_monitor(net, bearer_id); + if (!mon) + continue; + write_lock_bh(&mon->lock); + mon->self->addr = tipc_own_addr(net); + write_unlock_bh(&mon->lock); + } +} + int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index 2a21b93e0d04..ed63d2e650b0 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -77,6 +77,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id); int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node); +void tipc_mon_reinit_self(struct net *net); extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 922d262e153f..0d515d20b056 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -39,10 +39,16 @@ #include "msg.h" #include "addr.h" #include "name_table.h" +#include "crypto.h" #define MAX_FORWARD_SIZE 1024 +#ifdef CONFIG_TIPC_CRYPTO +#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) +#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_TAILROOM 16 +#endif static unsigned int align(unsigned int i) { @@ -61,7 +67,11 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; +#ifdef CONFIG_TIPC_CRYPTO + unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; +#else unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; +#endif skb = alloc_skb_fclone(buf_size, gfp); if (skb) { @@ -173,7 +183,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { - TIPC_SKB_CB(head)->validated = false; + TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; @@ -190,6 +200,59 @@ err: return 0; } +/** + * tipc_msg_append(): Append data to tail of an existing buffer queue + * @hdr: header to be used + * @m: the data to be appended + * @mss: max allowable size of buffer + * @dlen: size of data to be appended + * @txq: queue to appand to + * Returns the number og 1k blocks appended or errno value + */ +int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq) +{ + struct sk_buff *skb, *prev; + int accounted, total, curr; + int mlen, cpy, rem = dlen; + struct tipc_msg *hdr; + + skb = skb_peek_tail(txq); + accounted = skb ? msg_blocks(buf_msg(skb)) : 0; + total = accounted; + + while (rem) { + if (!skb || skb->len >= mss) { + prev = skb; + skb = tipc_buf_acquire(mss, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + skb_orphan(skb); + skb_trim(skb, MIN_H_SIZE); + hdr = buf_msg(skb); + skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); + msg_set_hdr_sz(hdr, MIN_H_SIZE); + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; + if (prev) + msg_set_ack_required(buf_msg(prev), 0); + msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); + cpy = min_t(int, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; + } + return total - accounted; +} + /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least @@ -218,6 +281,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; @@ -239,7 +303,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(skb->len < msz)) return false; - TIPC_SKB_CB(skb)->validated = true; + TIPC_SKB_CB(skb)->validated = 1; return true; } @@ -419,48 +483,98 @@ error: } /** - * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @skb: the buffer to append to ("bundle") - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer - * Consumes buffer if successful - * Returns true if bundling could be performed, otherwise false + * tipc_msg_bundle - Append contents of a buffer to tail of an existing one + * @bskb: the bundle buffer to append to + * @msg: message to be appended + * @max: max allowable size for the bundle buffer + * + * Returns "true" if bundling has been performed, otherwise "false" */ -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) +static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, + u32 max) { - struct tipc_msg *bmsg; - unsigned int bsz; - unsigned int msz = msg_size(msg); - u32 start, pad; - u32 max = mtu - INT_H_SIZE; + struct tipc_msg *bmsg = buf_msg(bskb); + u32 msz, bsz, offset, pad; - if (likely(msg_user(msg) == MSG_FRAGMENTER)) - return false; - if (!skb) - return false; - bmsg = buf_msg(skb); + msz = msg_size(msg); bsz = msg_size(bmsg); - start = align(bsz); - pad = start - bsz; + offset = align(bsz); + pad = offset - bsz; - if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) + if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; - if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + if (unlikely(max < (offset + msz))) return false; - if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) + + skb_put(bskb, pad + msz); + skb_copy_to_linear_data_offset(bskb, offset, msg, msz); + msg_set_size(bmsg, offset + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + return true; +} + +/** + * tipc_msg_try_bundle - Try to bundle a new message to the last one + * @tskb: the last/target message to which the new one will be appended + * @skb: the new message skb pointer + * @mss: max message size (header inclusive) + * @dnode: destination node for the message + * @new_bundle: if this call made a new bundle or not + * + * Return: "true" if the new message skb is potential for bundling this time or + * later, in the case a bundling has been done this time, the skb is consumed + * (the skb pointer = NULL). + * Otherwise, "false" if the skb cannot be bundled at all. + */ +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle) +{ + struct tipc_msg *msg, *inner, *outer; + u32 tsz; + + /* First, check if the new buffer is suitable for bundling */ + msg = buf_msg(*skb); + if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (unlikely(skb_tailroom(skb) < (pad + msz))) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; - if (unlikely(max < (start + msz))) + if (msg_user(msg) == BCAST_PROTOCOL) return false; - if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && - (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + if (mss <= INT_H_SIZE + msg_size(msg)) return false; - skb_put(skb, pad + msz); - skb_copy_to_linear_data_offset(skb, start, msg, msz); - msg_set_size(bmsg, start + msz); - msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + /* Ok, but the last/target buffer can be empty? */ + if (unlikely(!tskb)) + return true; + + /* Is it a bundle already? Try to bundle the new message to it */ + if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { + *new_bundle = false; + goto bundle; + } + + /* Make a new bundle of the two messages if possible */ + tsz = msg_size(buf_msg(tskb)); + if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + return true; + if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, + GFP_ATOMIC))) + return true; + inner = buf_msg(tskb); + skb_push(tskb, INT_H_SIZE); + outer = buf_msg(tskb); + tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, + dnode); + msg_set_importance(outer, msg_importance(inner)); + msg_set_size(outer, INT_H_SIZE + tsz); + msg_set_msgcnt(outer, 1); + *new_bundle = true; + +bundle: + if (likely(tipc_msg_bundle(tskb, msg, mss))) { + consume_skb(*skb); + *skb = NULL; + } return true; } @@ -510,49 +624,6 @@ none: } /** - * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain, where head is the buffer to replace/append - * @skb: buffer to be created, appended to and returned in case of success - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer, inclusive header - * @dnode: destination node for message. (Not always present in header) - * Returns true if success, otherwise false - */ -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode) -{ - struct sk_buff *_skb; - struct tipc_msg *bmsg; - u32 msz = msg_size(msg); - u32 max = mtu - INT_H_SIZE; - - if (msg_user(msg) == MSG_FRAGMENTER) - return false; - if (msg_user(msg) == TUNNEL_PROTOCOL) - return false; - if (msg_user(msg) == BCAST_PROTOCOL) - return false; - if (msz > (max / 2)) - return false; - - _skb = tipc_buf_acquire(max, GFP_ATOMIC); - if (!_skb) - return false; - - skb_trim(_skb, INT_H_SIZE); - bmsg = buf_msg(_skb); - tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, - INT_H_SIZE, dnode); - msg_set_importance(bmsg, msg_importance(msg)); - msg_set_seqno(bmsg, msg_seqno(msg)); - msg_set_ack(bmsg, msg_ack(msg)); - msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(_skb, msg, mtu); - *skb = _skb; - return true; -} - -/** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0daa6f04ca81..6d466ebdb64f 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -102,16 +102,42 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { - struct sk_buff *tail; - unsigned long nxt_retr; - unsigned long retr_stamp; - u32 bytes_read; - u32 orig_member; - u16 chain_imp; - u16 ackers; - u16 retr_cnt; - bool validated; -}; + union { + struct { + struct sk_buff *tail; + unsigned long nxt_retr; + unsigned long retr_stamp; + u32 bytes_read; + u32 orig_member; + u16 chain_imp; + u16 ackers; + u16 retr_cnt; + } __packed; +#ifdef CONFIG_TIPC_CRYPTO + struct { + struct tipc_crypto *rx; + struct tipc_aead *last; + u8 recurs; + } tx_clone_ctx __packed; +#endif + } __packed; + union { + struct { + u8 validated:1; +#ifdef CONFIG_TIPC_CRYPTO + u8 encrypted:1; + u8 decrypted:1; + u8 probe:1; + u8 tx_clone_deferred:1; +#endif + }; + u8 flags; + }; + u8 reserved; +#ifdef CONFIG_TIPC_CRYPTO + void *crypto_ctx; +#endif +} __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) @@ -290,6 +316,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline int msg_ack_required(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); @@ -1026,6 +1062,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); @@ -1057,14 +1107,15 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode); +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); +int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 836e629e8f4a..5feaf3b67380 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; diff --git a/net/tipc/net.c b/net/tipc/net.c index 85707c185360..2de3cec9929d 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -42,6 +42,7 @@ #include "node.h" #include "bcast.h" #include "netlink.h" +#include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -136,6 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); + tipc_mon_reinit_self(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index d6165ad384c0..e53231bd23b4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -102,7 +102,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_ID] = { .type = NLA_BINARY, + .len = TIPC_NODEID_LEN}, + [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, + .len = TIPC_AEAD_KEY_SIZE_MAX}, }; /* Properties valid for media, bearer and link */ @@ -176,7 +180,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_PUBL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_publ_dump, }, { @@ -239,7 +244,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_MON_PEER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_node_dump_monitor_peer, }, { @@ -250,10 +256,23 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #ifdef CONFIG_TIPC_MEDIA_UDP { .cmd = TIPC_NL_UDP_GET_REMOTEIP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_udp_nl_dump_remoteip, }, #endif +#ifdef CONFIG_TIPC_CRYPTO + { + .cmd = TIPC_NL_KEY_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_set_key, + }, + { + .cmd = TIPC_NL_KEY_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_flush_key, + }, +#endif }; struct genl_family tipc_genl_family __ro_after_init = { @@ -268,18 +287,6 @@ struct genl_family tipc_genl_family __ro_after_init = { .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), }; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) -{ - u32 maxattr = tipc_genl_family.maxattr; - - *attr = genl_family_attrbuf(&tipc_genl_family); - if (!*attr) - return -EOPNOTSUPP; - - return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr, - tipc_nl_policy, NULL); -} - int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 4ba0ad422110..7cf777723e3e 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -38,7 +38,6 @@ #include <net/netlink.h> extern struct genl_family tipc_genl_family; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); struct tipc_nl_msg { struct sk_buff *skb; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e135d4e11231..17a529739f8d 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -181,15 +181,18 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) { + struct genl_dumpit_info info; int len = 0; int err; struct sk_buff *buf; struct nlmsghdr *nlmsg; struct netlink_callback cb; + struct nlattr **attrbuf; memset(&cb, 0, sizeof(cb)); cb.nlh = (struct nlmsghdr *)arg->data; cb.skb = arg; + cb.data = &info; buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!buf) @@ -201,19 +204,35 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) { + err = -ENOMEM; + goto err_out; + } + + info.attrs = attrbuf; + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + do { int rem; len = (*cmd->dumpit)(buf, &cb); nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) { - struct nlattr **attrs; - - err = tipc_nlmsg_parse(nlmsg, &attrs); + err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN, + attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, + NULL); if (err) goto err_out; - err = (*cmd->format)(msg, attrs); + err = (*cmd->format)(msg, attrbuf); if (err) goto err_out; @@ -231,6 +250,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, err = 0; err_out: + kfree(attrbuf); tipc_dump_done(&cb); kfree_skb(buf); diff --git a/net/tipc/node.c b/net/tipc/node.c index c8f6177dd5a2..aaf595613e6e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -44,6 +44,7 @@ #include "discover.h" #include "netlink.h" #include "trace.h" +#include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 @@ -89,6 +90,7 @@ struct tipc_bclink_entry { * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node + * @preliminary: a preliminary node or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -99,6 +101,7 @@ struct tipc_bclink_entry { * @publ_list: list of publications * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; @@ -112,6 +115,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; @@ -120,12 +124,18 @@ struct tipc_node { u32 signature; u32 link_id; u8 peer_id[16]; + char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_crypto *crypto_rx; +#endif }; /* Node FSM states and events: @@ -163,7 +173,6 @@ static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); -static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); @@ -184,7 +193,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +203,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -235,15 +252,51 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } +u32 tipc_node_get_addr(struct tipc_node *node) +{ + return (node) ? node->addr : 0; +} + +char *tipc_node_get_id_str(struct tipc_node *node) +{ + return node->peer_id_string; +} + +#ifdef CONFIG_TIPC_CRYPTO +/** + * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * Note: node ref counter must be held first! + */ +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) +{ + return (__n) ? __n->crypto_rx : NULL; +} + +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) +{ + return container_of(pos, struct tipc_node, list)->crypto_rx; +} +#endif + +void tipc_node_free(struct rcu_head *rp) +{ + struct tipc_node *n = container_of(rp, struct tipc_node, rcu); + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&n->crypto_rx); +#endif + kfree(n); +} + static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); - kfree_rcu(n, rcu); + call_rcu(&n->rcu, tipc_node_free); } -static void tipc_node_put(struct tipc_node *node) +void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } @@ -264,7 +317,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { - if (node->addr != addr) + if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; @@ -360,18 +413,71 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; struct tipc_link *l; + unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); - n = tipc_node_find(net, addr); + n = tipc_node_find(net, addr) ?: + tipc_node_find_by_id(net, peer_id); if (n) { + if (!n->preliminary) + goto update; + if (preliminary) + goto exit; + /* A preliminary node becomes "real" now, refresh its data */ + tipc_node_write_lock(n); + n->preliminary = false; + n->addr = addr; + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, + &tn->node_htable[tipc_hashfn(addr)]); + list_del_rcu(&n->list); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); + tipc_node_write_unlock_fast(n); + +update: + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +495,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -396,9 +503,23 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, pr_warn("Node creation failed, no memory\n"); goto exit; } + tipc_nodeid2string(n->peer_id_string, peer_id); +#ifdef CONFIG_TIPC_CRYPTO + if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { + pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); + kfree(n); + n = NULL; + goto exit; + } +#endif n->addr = addr; + n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -417,22 +538,14 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, - tipc_link_window(tipc_bc_sndlink(net)), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, - tipc_bc_sndlink(net), - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no memory\n"); - kfree(n); - n = NULL; - goto exit; - } + n->bc_entry.link = NULL; tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); - n->keepalive_intv = U32_MAX; + /* Start a slow timer anyway, crypto needs it */ + n->keepalive_intv = 10000; + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) + tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) @@ -617,6 +730,11 @@ static bool tipc_node_cleanup(struct tipc_node *peer) } tipc_node_write_unlock(peer); + if (!deleted) { + spin_unlock_bh(&tn->node_list_lock); + return deleted; + } + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { @@ -645,6 +763,10 @@ static void tipc_node_timeout(struct timer_list *t) return; } +#ifdef CONFIG_TIPC_CRYPTO + /* Take any crypto key related actions first */ + tipc_crypto_timeout(n->crypto_rx); +#endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be @@ -665,7 +787,7 @@ static void tipc_node_timeout(struct timer_list *t) remains--; } tipc_node_read_unlock(n); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } @@ -697,7 +819,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE; + n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); @@ -751,7 +873,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } @@ -906,7 +1028,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } @@ -950,6 +1072,8 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; + bool preliminary; + u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); @@ -965,9 +1089,11 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { - addr = n->addr; + sugg_addr = n->addr; + preliminary = n->preliminary; tipc_node_put(n); - return addr; + if (!preliminary) + return sugg_addr; } /* Even this node may be in conflict */ @@ -979,12 +1105,12 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l; + struct tipc_link *l, *snd_l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -998,11 +1124,27 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, + false); if (!n) return; tipc_node_write_lock(n); + if (unlikely(!n->bc_entry.link)) { + snd_l = tipc_bc_sndlink(net); + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, + tipc_link_window(snd_l), + n->capabilities, + &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, + &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no mem\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + return; + } + } le = &n->links[b->identity]; @@ -1017,6 +1159,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (sign_match && addr_match && link_up) { /* All is fine. Do nothing. */ reset = false; + /* Peer node is not a container/local namespace */ + if (!n->peer_hash_mix) + n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -1342,7 +1487,8 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; - + n->peer_net = NULL; + n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1570,56 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,6 +1635,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; int bearer_id; int rc; @@ -1456,6 +1653,17 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, } tipc_node_read_lock(n); + node_up = node_is_up(n); + if (node_up && n->peer_net && check_net(n->peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(n->peer_net, list); + if (likely(skb_queue_empty(list))) { + tipc_node_read_unlock(n); + tipc_node_put(n); + return 0; + } + } + bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); @@ -1474,7 +1682,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); @@ -1622,7 +1830,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id } if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); @@ -1800,20 +2008,38 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; - struct tipc_node *n; + struct tipc_link_entry *le; struct tipc_msg *hdr; + struct tipc_node *n; int bearer_id = b->identity; - struct tipc_link_entry *le; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_ehdr *ehdr; - __skb_queue_head_init(&xmitq); + /* Check if message must be decrypted first */ + if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) + goto rcv; + + ehdr = (struct tipc_ehdr *)skb->data; + if (likely(ehdr->user != LINK_CONFIG)) { + n = tipc_node_find(net, ntohl(ehdr->addr)); + if (unlikely(!n)) + goto discard; + } else { + n = tipc_node_find_by_id(net, ehdr->id); + } + tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); + if (!skb) + return; +rcv: +#endif /* Ensure message is well-formed before touching the header */ - TIPC_SKB_CB(skb)->validated = false; if (unlikely(!tipc_msg_validate(&skb))) goto discard; + __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); @@ -1884,7 +2110,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); discard: @@ -1915,7 +2141,7 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, tipc_link_set_mtu(e->link, b->mtu); } tipc_node_write_unlock(n); - tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); @@ -1926,7 +2152,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; - struct tipc_node *peer; + struct tipc_node *peer, *temp_node; u32 addr; int err; @@ -1967,6 +2193,11 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) tipc_node_write_unlock(peer); tipc_node_delete(peer); + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } err = 0; err_out: tipc_node_put(peer); @@ -2011,6 +2242,8 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry_rcu(node, &tn->node_list, list) { + if (node->preliminary) + continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; @@ -2150,7 +2383,8 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, + NULL); return res; } @@ -2484,13 +2718,9 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_MON]) return -EINVAL; @@ -2530,11 +2760,141 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, return skb->len; } -u32 tipc_node_get_addr(struct tipc_node *node) +#ifdef CONFIG_TIPC_CRYPTO +static int tipc_nl_retrieve_key(struct nlattr **attrs, + struct tipc_aead_key **key) { - return (node) ? node->addr : 0; + struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; + + if (!attr) + return -ENODATA; + + *key = (struct tipc_aead_key *)nla_data(attr); + if (nla_len(attr) < tipc_aead_key_size(*key)) + return -EINVAL; + + return 0; +} + +static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; + + if (!attr) + return -ENODATA; + + if (nla_len(attr) < TIPC_NODEID_LEN) + return -EINVAL; + + *node_id = (u8 *)nla_data(attr); + return 0; } +int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n = NULL; + struct tipc_aead_key *ukey; + struct tipc_crypto *c; + u8 *id, *own_id; + int rc = 0; + + if (!info->attrs[TIPC_NLA_NODE]) + return -EINVAL; + + rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, + info->attrs[TIPC_NLA_NODE], + tipc_nl_node_policy, info->extack); + if (rc) + goto exit; + + own_id = tipc_own_id(net); + if (!own_id) { + rc = -EPERM; + goto exit; + } + + rc = tipc_nl_retrieve_key(attrs, &ukey); + if (rc) + goto exit; + + rc = tipc_aead_key_validate(ukey); + if (rc) + goto exit; + + rc = tipc_nl_retrieve_nodeid(attrs, &id); + switch (rc) { + case -ENODATA: + /* Cluster key mode */ + rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY); + break; + case 0: + /* Per-node key mode */ + if (!memcmp(id, own_id, NODE_ID_LEN)) { + c = tn->crypto_tx; + } else { + n = tipc_node_find_by_id(net, id) ?: + tipc_node_create(net, 0, id, 0xffffu, 0, true); + if (unlikely(!n)) { + rc = -ENOMEM; + break; + } + c = n->crypto_rx; + } + + rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY); + if (n) + tipc_node_put(n); + break; + default: + break; + } + +exit: + return (rc < 0) ? rc : 0; +} + +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_set_key(skb, info); + rtnl_unlock(); + + return err; +} + +int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + tipc_crypto_key_flush(tn->crypto_tx); + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) + tipc_crypto_key_flush(n->crypto_rx); + rcu_read_unlock(); + + pr_info("All keys are flushed!\n"); + return 0; +} + +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_flush_key(skb, info); + rtnl_unlock(); + + return err; +} +#endif + /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped @@ -2591,3 +2951,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 291d0ecd4101..a6803b449a2c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -54,7 +54,8 @@ enum { TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), - TIPC_TUNNEL_ENHANCED = (1 << 9) + TIPC_TUNNEL_ENHANCED = (1 << 9), + TIPC_NAGLE = (1 << 10) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -66,16 +67,27 @@ enum { TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ - TIPC_TUNNEL_ENHANCED) + TIPC_TUNNEL_ENHANCED | \ + TIPC_NAGLE) + #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_get_addr(struct tipc_node *node); +char *tipc_node_get_id_str(struct tipc_node *node); +void tipc_node_put(struct tipc_node *node); +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary); +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n); +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos); +#endif u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -92,7 +104,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -107,4 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +#ifdef CONFIG_TIPC_CRYPTO +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info); +#endif +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3b9f8cc328f5..5d7859aac78e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -75,6 +75,7 @@ struct sockaddr_pair { * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links @@ -97,6 +98,7 @@ struct tipc_sock { u32 conn_instance; int published; u32 max_pkt; + u32 maxnagle; u32 portid; struct tipc_msg phdr; struct list_head cong_links; @@ -116,6 +118,10 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + u32 oneway; + u16 snd_backlog; + bool expect_ack; + bool nodelay; bool group_is_open; }; @@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +static void tipc_sk_push_backlog(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) return 1; } +/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle + */ +static void tsk_set_nagle(struct tipc_sock *tsk) +{ + struct sock *sk = &tsk->sk; + + tsk->maxnagle = 0; + if (sk->sk_type != SOCK_STREAM) + return; + if (tsk->nodelay) + return; + if (!(tsk->peer_caps & TIPC_NAGLE)) + return; + /* Limit node local buffer size to avoid receive queue overflow */ + if (tsk->max_pkt == MAX_MSG_SIZE) + tsk->maxnagle = 1500; + else + tsk->maxnagle = tsk->max_pkt; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -512,8 +540,12 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Remove any pending SYN message */ - __skb_queue_purge(&sk->sk_write_queue); + /* Push out unsent messages or remove if pending SYN */ + skb = skb_peek(&sk->sk_write_queue); + if (skb && !msg_is_syn(buf_msg(skb))) + tipc_sk_push_backlog(tsk); + else + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -740,7 +772,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, /* fall through */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; break; case TIPC_OPEN: @@ -748,7 +780,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, revents |= EPOLLOUT; if (!tipc_sk_type_connectionless(sk)) break; - if (skb_queue_empty(&sk->sk_receive_queue)) + if (skb_queue_empty_lockless(&sk->sk_receive_queue)) break; revents |= EPOLLIN | EPOLLRDNORM; break; @@ -854,7 +886,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1208,6 +1240,27 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_sk_rcv(net, inputq); } +/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +static void tipc_sk_push_backlog(struct tipc_sock *tsk) +{ + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); + int rc; + + if (skb_queue_empty(txq) || tsk->cong_link_cnt) + return; + + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; + tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +} + /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket @@ -1221,7 +1274,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); - bool conn_cong; + bool was_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) { @@ -1254,11 +1307,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { - conn_cong = tsk_conn_cong(tsk); + was_cong = tsk_conn_cong(tsk); + tsk->expect_ack = false; + tipc_sk_push_backlog(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); - if (conn_cong) + if (was_cong && !tsk_conn_cong(tsk)) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); @@ -1388,7 +1443,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return rc; __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1437,15 +1492,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct sk_buff_head *txq = &sk->sk_write_queue; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; int send, sent = 0; - int rc = 0; - - __skb_queue_head_init(&pkts); + int blocks, rc = 0; if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; @@ -1467,21 +1522,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tipc_sk_connected(sk))); if (unlikely(rc)) break; - send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); - if (unlikely(rc != send)) - break; - - trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + blocks = tsk->snd_backlog; + if (tsk->oneway++ >= 4 && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; + } + tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) + break; + blocks += tsk_inc(tsk, send + MIN_H_SIZE); + } + trace_tipc_sk_sendstream(sk, skb_peek(txq), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + tsk->snt_unacked += blocks; + tsk->snd_backlog = 0; sent += send; } } while (sent < dlen && !rc); @@ -1526,8 +1595,9 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + tsk_set_nagle(tsk); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1848,6 +1918,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; + bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1892,6 +1963,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { + ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -1919,7 +1991,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -1990,6 +2062,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; + tipc_sk_push_backlog(tsk); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2029,6 +2102,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (unlikely(msg_mcast(hdr))) return false; + tsk->oneway = 0; switch (sk->sk_state) { case TIPC_CONNECTING: @@ -2074,6 +2148,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; return false; case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) + tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) return true; @@ -2119,13 +2195,13 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_msg *hdr = buf_msg(skb); if (unlikely(msg_in_group(hdr))) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); if (unlikely(!msg_connected(hdr))) - return sk->sk_rcvbuf << msg_importance(hdr); + return READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr); if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); return FLOWCTL_MSG_LIM; } @@ -2959,6 +3035,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: + case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) @@ -3007,6 +3084,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; + case TIPC_NODELAY: + tsk->nodelay = !!value; + tsk_set_nagle(tsk); + break; default: res = -EINVAL; } @@ -3588,13 +3669,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; @@ -3790,7 +3867,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); - i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len)); if (dqueues & TIPC_DUMP_SK_SNDQ) { i += scnprintf(buf + i, sz - i, "sk_write_queue: "); diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 6159d327db76..58ab3d6dcdce 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -35,6 +35,7 @@ #include "core.h" #include "trace.h" +#include "crypto.h" #include <linux/sysctl.h> @@ -64,6 +65,16 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_TIPC_CRYPTO + { + .procname = "max_tfms", + .data = &sysctl_tipc_max_tfms, + .maxlen = sizeof(sysctl_tipc_max_tfms), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif {} }; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 287df68721df..86aaa4d3e781 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -372,6 +372,7 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto out; if (b && test_bit(0, &b->up)) { + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } @@ -448,15 +449,11 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; - struct nlattr **attrs; char *bname; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index e4328b3b72eb..61ec78521a60 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -26,3 +26,13 @@ config TLS_DEVICE Enable kernel support for HW offload of the TLS protocol. If unsure, say N. + +config TLS_TOE + bool "Transport Layer Security TCP stack bypass" + depends on TLS + default n + help + Enable kernel support for legacy HW offload of the TLS protocol, + which is incompatible with the Linux networking stack semantics. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index ef0dc74ce8f9..f1ffbfe8968d 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -3,8 +3,11 @@ # Makefile for the TLS subsystem. # +CFLAGS_trace.o := -I$(src) + obj-$(CONFIG_TLS) += tls.o -tls-y := tls_main.o tls_sw.o +tls-y := tls_main.o tls_sw.o tls_proc.o trace.o +tls-$(CONFIG_TLS_TOE) += tls_toe.o tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f959487c5cd1..0683788bbef0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -38,6 +38,8 @@ #include <net/tcp.h> #include <net/tls.h> +#include "trace.h" + /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ @@ -202,6 +204,15 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); + WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); +} +EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); + static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { @@ -216,6 +227,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, rcd_sn = tls_ctx->tx.rec_seq; + trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = tls_ctx->netdev; if (netdev) @@ -419,7 +431,7 @@ static int tls_push_data(struct sock *sk, ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) return -ENOTSUPP; - if (sk->sk_err) + if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; @@ -440,9 +452,8 @@ static int tls_push_data(struct sock *sk, max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { - rc = tls_do_allocation(sk, ctx, pfrag, - prot->prepend_size); - if (rc) { + rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); + if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; @@ -523,8 +534,10 @@ last_record: int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; + struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { @@ -538,12 +551,14 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter msg_iter; char *kaddr = kmap(page); struct kvec iov; @@ -552,6 +567,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, if (flags & MSG_SENDPAGE_NOTLAST) flags |= MSG_MORE; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (flags & MSG_OOB) { @@ -568,6 +584,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } @@ -623,9 +640,11 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { - if (!sk->sk_write_pending && tls_is_partially_sent_record(ctx)) { + if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; + WARN_ON_ONCE(sk->sk_write_pending); + sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | @@ -637,15 +656,19 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) return; + + trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); netdev = READ_ONCE(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) @@ -653,8 +676,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; - u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -683,8 +706,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ - if (tcp_inq(sk) > rcd_len) + sock_data = tcp_inq(sk); + if (sock_data > rcd_len) { + trace_tls_device_rx_resync_nh_delay(sk, sock_data, + rcd_len); return; + } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; @@ -728,6 +755,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { + trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -826,9 +854,9 @@ free_buf: return err; } -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { - struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); int is_decrypted = skb->decrypted; int is_encrypted = !is_decrypted; @@ -840,6 +868,10 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) is_encrypted &= !skb_iter->decrypted; } + trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, + tls_ctx->rx.rec_seq, rxm->full_len, + is_encrypted, is_decrypted); + ctx->sw.decrypted |= is_decrypted; /* Return immediately if the record is either entirely plaintext or @@ -1013,6 +1045,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, + tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; @@ -1049,6 +1083,7 @@ free_marker_record: int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { + struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; @@ -1096,6 +1131,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); + info = (void *)&ctx->crypto_recv.info; + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, + tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ac88877dcade..5bb93dd5762b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -41,7 +41,9 @@ #include <linux/inetdevice.h> #include <linux/inet_diag.h> +#include <net/snmp.h> #include <net/tls.h> +#include <net/tls_toe.h> MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); @@ -58,14 +60,12 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto *saved_tcpv4_prot; static DEFINE_MUTEX(tcpv4_prot_mutex); -static LIST_HEAD(device_list); -static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base); -static void update_sk_prot(struct sock *sk, struct tls_context *ctx) +void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; @@ -267,6 +267,7 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx) memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); + mutex_destroy(&ctx->tx_lock); if (sk) kfree_rcu(ctx, rcu); @@ -286,14 +287,19 @@ static void tls_sk_proto_cleanup(struct sock *sk, kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_release_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } - if (ctx->rx_conf == TLS_SW) + if (ctx->rx_conf == TLS_SW) { tls_sw_release_resources_rx(sk); - else if (ctx->rx_conf == TLS_HW) + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } else if (ctx->rx_conf == TLS_HW) { tls_device_offload_cleanup_rx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } } static void tls_sk_proto_close(struct sock *sk, long timeout) @@ -534,19 +540,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, if (tx) { rc = tls_set_device_offload(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 1); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); conf = TLS_SW; } } else { rc = tls_set_device_offload_rx(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); conf = TLS_SW; } tls_sw_strparser_arm(sk, ctx); @@ -603,7 +619,7 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } -static struct tls_context *create_ctx(struct sock *sk) +struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; @@ -612,6 +628,7 @@ static struct tls_context *create_ctx(struct sock *sk) if (!ctx) return NULL; + mutex_init(&ctx->tx_lock); rcu_assign_pointer(icsk->icsk_ulp_data, ctx); ctx->sk_proto = sk->sk_prot; return ctx; @@ -643,90 +660,6 @@ static void tls_build_proto(struct sock *sk) } } -static void tls_hw_sk_destruct(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - ctx->sk_destruct(sk); - /* Free ctx */ - rcu_assign_pointer(icsk->icsk_ulp_data, NULL); - tls_ctx_free(sk, ctx); -} - -static int tls_hw_prot(struct sock *sk) -{ - struct tls_context *ctx; - struct tls_device *dev; - int rc = 0; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->feature && dev->feature(dev)) { - ctx = create_ctx(sk); - if (!ctx) - goto out; - - spin_unlock_bh(&device_spinlock); - tls_build_proto(sk); - ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_hw_sk_destruct; - ctx->rx_conf = TLS_HW_RECORD; - ctx->tx_conf = TLS_HW_RECORD; - update_sk_prot(sk, ctx); - spin_lock_bh(&device_spinlock); - rc = 1; - break; - } - } -out: - spin_unlock_bh(&device_spinlock); - return rc; -} - -static void tls_hw_unhash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->unhash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - dev->unhash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - ctx->sk_proto->unhash(sk); -} - -static int tls_hw_hash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - int err; - - err = ctx->sk_proto->hash(sk); - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->hash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - err |= dev->hash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - - if (err) - tls_hw_unhash(sk); - return err; -} - static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base) { @@ -764,10 +697,11 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif - +#ifdef CONFIG_TLS_TOE prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash; +#endif } static int tls_init(struct sock *sk) @@ -775,8 +709,12 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; - if (tls_hw_prot(sk)) + tls_build_proto(sk); + +#ifdef CONFIG_TLS_TOE + if (tls_toe_bypass(sk)) return 0; +#endif /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. @@ -787,11 +725,9 @@ static int tls_init(struct sock *sk) if (sk->sk_state != TCP_ESTABLISHED) return -ENOTSUPP; - tls_build_proto(sk); - /* allocate tls context */ write_lock_bh(&sk->sk_callback_lock); - ctx = create_ctx(sk); + ctx = tls_ctx_create(sk); if (!ctx) { rc = -ENOMEM; goto out; @@ -877,21 +813,34 @@ static size_t tls_get_info_size(const struct sock *sk) return size; } -void tls_register_device(struct tls_device *device) +static int __net_init tls_init_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_add_tail(&device->dev_list, &device_list); - spin_unlock_bh(&device_spinlock); + int err; + + net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib); + if (!net->mib.tls_statistics) + return -ENOMEM; + + err = tls_proc_init(net); + if (err) + goto err_free_stats; + + return 0; +err_free_stats: + free_percpu(net->mib.tls_statistics); + return err; } -EXPORT_SYMBOL(tls_register_device); -void tls_unregister_device(struct tls_device *device) +static void __net_exit tls_exit_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_del(&device->dev_list); - spin_unlock_bh(&device_spinlock); + tls_proc_fini(net); + free_percpu(net->mib.tls_statistics); } -EXPORT_SYMBOL(tls_unregister_device); + +static struct pernet_operations tls_proc_ops = { + .init = tls_init_net, + .exit = tls_exit_net, +}; static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", @@ -904,6 +853,12 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init tls_register(void) { + int err; + + err = register_pernet_subsys(&tls_proc_ops); + if (err) + return err; + tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; @@ -917,6 +872,7 @@ static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); tls_device_cleanup(); + unregister_pernet_subsys(&tls_proc_ops); } module_init(tls_register); diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c new file mode 100644 index 000000000000..83d9c80a684e --- /dev/null +++ b/net/tls/tls_proc.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/snmp.h> +#include <net/tls.h> + +static const struct snmp_mib tls_mib_list[] = { + SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), + SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), + SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), + SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), + SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), + SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), + SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), + SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), + SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), + SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_SENTINEL +}; + +static int tls_statistics_seq_show(struct seq_file *seq, void *v) +{ + unsigned long buf[LINUX_MIB_TLSMAX] = {}; + struct net *net = seq->private; + int i; + + snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); + for (i = 0; tls_mib_list[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); + + return 0; +} + +int __net_init tls_proc_init(struct net *net) +{ + if (!proc_create_net_single("tls_stat", 0444, net->proc_net, + tls_statistics_seq_show, NULL)) + return -ENOMEM; + return 0; +} + +void __net_exit tls_proc_fini(struct net *net) +{ + remove_proc_entry("tls_stat", net->proc_net); +} diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c2b5e0d2ba1a..141da093ff04 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) /* Propagate if there was an err */ if (err) { + if (err == -EBADMSG) + TLS_INC_STATS(sock_net(skb->sk), + LINUX_MIB_TLSDECRYPTERROR); ctx->async_wait.err = err; tls_err_abort(skb->sk, err); } else { @@ -253,6 +256,8 @@ static int tls_do_decryption(struct sock *sk, return ret; ret = crypto_wait_req(ret, &ctx->async_wait); + } else if (ret == -EBADMSG) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); } if (async) @@ -897,15 +902,9 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto send_end; - } - if (unlikely(msg->msg_controllen)) { ret = tls_proccess_cmsg(sk, msg, &record_type); if (ret) { @@ -1091,6 +1090,7 @@ send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return copied ? copied : ret; } @@ -1114,13 +1114,6 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto sendpage_end; - } - /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { size_t copy, required_size; @@ -1219,15 +1212,18 @@ sendpage_end: int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); int ret; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) return -ENOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); ret = tls_sw_do_sendpage(sk, page, offset, size, flags); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return ret; } @@ -1490,7 +1486,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (!ctx->decrypted) { if (tls_ctx->rx_conf == TLS_HW) { - err = tls_device_decrypted(sk, skb); + err = tls_device_decrypted(sk, tls_ctx, skb, rxm); if (err < 0) return err; } @@ -1518,7 +1514,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - ctx->decrypted = true; + ctx->decrypted = 1; ctx->saved_data_ready(sk); } else { *zc = false; @@ -1928,7 +1924,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, tls_err_abort(sk, EBADMSG); goto splice_read_end; } - ctx->decrypted = true; + ctx->decrypted = 1; } rxm = strp_msg(skb); @@ -2029,7 +2025,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - ctx->decrypted = false; + ctx->decrypted = 0; ctx->recv_pkt = skb; strp_pause(strp); @@ -2170,9 +2166,11 @@ static void tx_work_handler(struct work_struct *work) if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) return; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); tls_tx_records(sk, -1); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); } void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) @@ -2180,12 +2178,9 @@ void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* Schedule the transmission if tx list is ready */ - if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) { - /* Schedule the transmission */ - if (!test_and_set_bit(BIT_TX_SCHEDULED, - &tx_ctx->tx_bitmask)) - schedule_delayed_work(&tx_ctx->tx_work.work, 0); - } + if (is_tx_ready(tx_ctx) && + !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) + schedule_delayed_work(&tx_ctx->tx_work.work, 0); } void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) @@ -2386,10 +2381,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = false; + sw_ctx_rx->async_capable = 0; else sw_ctx_rx->async_capable = - tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + !!(tfm->__crt_alg->cra_flags & + CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c new file mode 100644 index 000000000000..7e1330f19165 --- /dev/null +++ b/net/tls/tls_toe.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <net/inet_connection_sock.h> +#include <net/tls.h> +#include <net/tls_toe.h> + +static LIST_HEAD(device_list); +static DEFINE_SPINLOCK(device_spinlock); + +static void tls_toe_sk_destruct(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx = tls_get_ctx(sk); + + ctx->sk_destruct(sk); + /* Free ctx */ + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tls_ctx_free(sk, ctx); +} + +int tls_toe_bypass(struct sock *sk) +{ + struct tls_toe_device *dev; + struct tls_context *ctx; + int rc = 0; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = tls_ctx_create(sk); + if (!ctx) + goto out; + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_toe_sk_destruct; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + spin_unlock_bh(&device_spinlock); + return rc; +} + +void tls_toe_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + dev->unhash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + ctx->sk_proto->unhash(sk); +} + +int tls_toe_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + int err; + + err = ctx->sk_proto->hash(sk); + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + err |= dev->hash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + + if (err) + tls_toe_unhash(sk); + return err; +} + +void tls_toe_register_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_add_tail(&device->dev_list, &device_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_register_device); + +void tls_toe_unregister_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_del(&device->dev_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_unregister_device); diff --git a/net/tls/trace.c b/net/tls/trace.c new file mode 100644 index 000000000000..e374913cf9c9 --- /dev/null +++ b/net/tls/trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/tls/trace.h b/net/tls/trace.h new file mode 100644 index 000000000000..9ba5f600ea43 --- /dev/null +++ b/net/tls/trace.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tls + +#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _TLS_TRACE_H_ + +#include <asm/unaligned.h> +#include <linux/tracepoint.h> + +struct sock; + +TRACE_EVENT(tls_device_offload_set, + + TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret), + + TP_ARGS(sk, dir, tcp_seq, rec_no, ret), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( int, dir ) + __field( u32, tcp_seq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->dir = dir; + __entry->tcp_seq = tcp_seq; + __entry->ret = ret; + ), + + TP_printk( + "sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d", + __entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no, + __entry->ret + ) +); + +TRACE_EVENT(tls_device_decrypted, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, u32 rec_len, + bool encrypted, bool decrypted), + + TP_ARGS(sk, tcp_seq, rec_no, rec_len, encrypted, decrypted), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( u32, rec_len ) + __field( bool, encrypted ) + __field( bool, decrypted ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->rec_len = rec_len; + __entry->encrypted = encrypted; + __entry->decrypted = decrypted; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu len=%u encrypted=%d decrypted=%d", + __entry->sk, __entry->tcp_seq, + __entry->rec_no, __entry->rec_len, + __entry->encrypted, __entry->decrypted + ) +); + +TRACE_EVENT(tls_device_rx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type), + + TP_ARGS(sk, tcp_seq, rec_no, sync_type), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( int, sync_type ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->sync_type = sync_type; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu sync_type=%d", + __entry->sk, __entry->tcp_seq, __entry->rec_no, + __entry->sync_type + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_schedule, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + ), + + TP_fast_assign( + __entry->sk = sk; + ), + + TP_printk( + "sk=%p", __entry->sk + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_delay, + + TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len), + + TP_ARGS(sk, sock_data, rec_len), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, sock_data ) + __field( u32, rec_len ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->sock_data = sock_data; + __entry->rec_len = rec_len; + ), + + TP_printk( + "sk=%p sock_data=%u rec_len=%u", + __entry->sk, __entry->sock_data, __entry->rec_len + ) +); + +TRACE_EVENT(tls_device_tx_resync_req, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq), + + TP_ARGS(sk, tcp_seq, exp_tcp_seq), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, tcp_seq ) + __field( u32, exp_tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->tcp_seq = tcp_seq; + __entry->exp_tcp_seq = exp_tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u exp_tcp_seq=%u", + __entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq + ) +); + +TRACE_EVENT(tls_device_tx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no), + + TP_ARGS(sk, tcp_seq, rec_no), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu", + __entry->sk, __entry->tcp_seq, __entry->rec_no + ) +); + +#endif /* _TLS_TRACE_H_ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include <trace/define_trace.h> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 67e87db5877f..193cba2d777b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -284,11 +284,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) - goto found; + return s; } - s = NULL; -found: - return s; + return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, @@ -2599,7 +2597,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2628,7 +2626,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); @@ -2638,7 +2636,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2ab43b2bba31..1f4fde4711b6 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -439,7 +439,7 @@ static void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -641,7 +641,6 @@ EXPORT_SYMBOL_GPL(__vsock_create); static void __vsock_release(struct sock *sk, int level) { if (sk) { - struct sk_buff *skb; struct sock *pending; struct vsock_sock *vsk; @@ -662,8 +661,7 @@ static void __vsock_release(struct sock *sk, int level) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; - while ((skb = skb_dequeue(&sk->sk_receive_queue))) - kfree_skb(skb); + skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { @@ -870,7 +868,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, * the queue and write as long as the socket isn't shutdown for * sending. */ - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } @@ -1301,7 +1299,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, err = -listener->sk_err; if (connected) { - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index c443db7af8d4..7fa09c5e4625 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -13,15 +13,16 @@ #include <linux/hyperv.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <asm/hyperv-tlfs.h> /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some - * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer - * hosts don't have this limitation; but, keep the defaults the same for compat. + * stricter requirements on the hv_sock ring buffer size of six 4K pages. + * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this + * limitation; but, keep the defaults the same for compat. */ -#define PAGE_SIZE_4K 4096 -#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) +#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -54,7 +55,8 @@ struct hvs_recv_buf { * ringbuffer APIs that allow us to directly copy data from userspace buffer * to VMBus ringbuffer. */ -#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) +#define HVS_SEND_BUF_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) struct hvs_send_buf { /* The header before the payload data */ @@ -393,10 +395,10 @@ static void hvs_open_connection(struct vmbus_channel *chan) } else { sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); - sndbuf = ALIGN(sndbuf, PAGE_SIZE); + sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); - rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); } ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, @@ -426,7 +428,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (conn_from_host) { new->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); hvs_addr_init(&vnew->local_addr, if_type); hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); @@ -670,7 +672,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, ssize_t ret = 0; ssize_t bytes_written = 0; - BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); + BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); if (!send_buf) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a666ef8fc54e..828edd88488c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -204,10 +204,14 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, return virtio_transport_get_ops()->send_pkt(pkt); } -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, +static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { + if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + return false; + vvs->rx_bytes += pkt->len; + return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, @@ -264,6 +268,55 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, } static ssize_t +virtio_transport_stream_do_peek(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0, off; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + + list_for_each_entry(pkt, &vvs->rx_queue, list) { + off = pkt->off; + + if (total == len) + break; + + while (total < len && off < pkt->len) { + bytes = len - total; + if (bytes > pkt->len - off) + bytes = pkt->len - off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + off += bytes; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; + +out: + if (total) + err = total; + return err; +} + +static ssize_t virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) @@ -335,9 +388,9 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, size_t len, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); + return virtio_transport_stream_do_peek(vsk, msg, len); + else + return virtio_transport_stream_do_dequeue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); @@ -458,6 +511,9 @@ void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) vvs->buf_size_max = val; vvs->buf_size = val; vvs->buf_alloc = val; + + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); } EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); @@ -876,14 +932,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt) { struct virtio_vsock_sock *vvs = vsk->trans; - bool free_pkt = false; + bool can_enqueue, free_pkt = false; pkt->len = le32_to_cpu(pkt->hdr.len); pkt->off = 0; spin_lock_bh(&vvs->rx_lock); - virtio_transport_inc_rx_pkt(vvs, pkt); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + if (!can_enqueue) { + free_pkt = true; + goto out; + } /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small @@ -936,9 +996,11 @@ virtio_transport_recv_connected(struct sock *sk, if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && - vsock_stream_has_data(vsk) <= 0) { - sock_set_flag(sk, SOCK_DONE); - sk->sk_state = TCP_CLOSING; + vsock_stream_has_data(vsk) <= 0 && + !sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, true); } if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); @@ -1006,7 +1068,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return -ENOMEM; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); lock_sock_nested(child, SINGLE_DEPTH_NESTING); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8c9c4ed90fa7..6ba98a1efe2e 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1098,7 +1098,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } vsock_add_pending(sk, pending); - sk->sk_ack_backlog++; + sk_acceptq_added(sk); pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e851cafd8e2f..fcac5c6366e1 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -204,6 +204,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) return false; } + /* channel 14 is only for IEEE 802.11b */ + if (chandef->center_freq1 == 2484 && + chandef->width != NL80211_CHAN_WIDTH_20_NOHT) + return false; + if (cfg80211_chandef_is_edmg(chandef) && !cfg80211_edmg_chandef_valid(chandef)) return false; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 141cdb171665..da5262b2298b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -393,7 +393,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_MESH_ID_LEN }, - [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 }, + [NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT, [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 }, [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED }, @@ -624,6 +624,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), + [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), }; /* policy for the key attributes */ @@ -3940,6 +3941,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) key.type != NL80211_KEYTYPE_GROUP) return -EINVAL; + if (key.type == NL80211_KEYTYPE_GROUP && + info->attrs[NL80211_ATTR_VLAN_ID]) + key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (!rdev->ops->add_key) return -EOPNOTSUPP; @@ -5711,6 +5716,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_STA_AID]) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); @@ -5856,6 +5864,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { params.support_p2p_ps = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); @@ -8265,10 +8276,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, /* leave request id zero for legacy request * or if driver does not support multi-scheduled scan */ - if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { - while (!sched_scan_req->reqid) - sched_scan_req->reqid = cfg80211_assign_cookie(rdev); - } + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) + sched_scan_req->reqid = cfg80211_assign_cookie(rdev); err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) @@ -13682,7 +13691,7 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_GET_FTM_RESPONDER_STATS); if (!hdr) - return -ENOBUFS; + goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 420c4207ab59..446c76d44e65 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3883,6 +3883,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy) return pre_cac_allowed; } +EXPORT_SYMBOL(regulatory_pre_cac_allowed); void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 504133d76de4..f9e83031a40a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, u8 country_ie_len); /** - * regulatory_hint_disconnect - informs all devices have been disconneted + * regulatory_hint_disconnect - informs all devices have been disconnected * * Regulotory rules can be enhanced further upon scanning and upon * connection to an AP. These rules become stale if we disconnect @@ -156,14 +156,6 @@ bool regulatory_indoor_allowed(void); #define REG_PRE_CAC_EXPIRY_GRACE_MS 2000 /** - * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain - * @wiphy: wiphy for which pre-CAC capability is checked. - - * Pre-CAC is allowed only in ETSI domain. - */ -bool regulatory_pre_cac_allowed(struct wiphy *wiphy); - -/** * regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys * @wiphy - wiphy on which radar is detected and the event will be propagated * to other available wiphys having the same DFS domain diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ff1016607f0b..aef240fdf8df 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1703,8 +1703,7 @@ cfg80211_parse_mbssid_frame_data(struct wiphy *wiphy, static void cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, struct cfg80211_bss *nontrans_bss, - struct ieee80211_mgmt *mgmt, size_t len, - gfp_t gfp) + struct ieee80211_mgmt *mgmt, size_t len) { u8 *ie, *new_ie, *pos; const u8 *nontrans_ssid, *trans_ssid, *mbssid; @@ -1715,6 +1714,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, const struct cfg80211_bss_ies *old; u8 cpy_len; + lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); + ie = mgmt->u.probe_resp.variable; new_ie_len = ielen; @@ -1731,23 +1732,22 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; - rcu_read_lock(); + nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); - if (!nontrans_ssid) { - rcu_read_unlock(); + if (!nontrans_ssid) return; - } + new_ie_len += nontrans_ssid[1]; - rcu_read_unlock(); /* generate new ie for nontrans BSS * 1. replace SSID with nontrans BSS' SSID * 2. skip MBSSID IE */ - new_ie = kzalloc(new_ie_len, gfp); + new_ie = kzalloc(new_ie_len, GFP_ATOMIC); if (!new_ie) return; - new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, gfp); + + new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, GFP_ATOMIC); if (!new_ies) goto out_free; @@ -1901,6 +1901,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, cfg80211_parse_mbssid_frame_data(wiphy, data, mgmt, len, &non_tx_data, gfp); + spin_lock_bh(&wiphy_to_rdev(wiphy)->bss_lock); + /* check if the res has other nontransmitting bss which is not * in MBSSID IE */ @@ -1915,8 +1917,9 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, ies2 = rcu_access_pointer(tmp_bss->ies); if (ies2->tsf < ies1->tsf) cfg80211_update_notlisted_nontrans(wiphy, tmp_bss, - mgmt, len, gfp); + mgmt, len); } + spin_unlock_bh(&wiphy_to_rdev(wiphy)->bss_lock); return res; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 419eb12c1e93..5b4ed5bbc542 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1559,7 +1559,8 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, } if (freq == 2484) { - if (chandef->width > NL80211_CHAN_WIDTH_40) + /* channel 14 is only for IEEE 802.11b */ + if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index c67d7a82ab13..73fd0eae08ca 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -202,6 +202,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) @@ -219,7 +220,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, if (ie) { data->flags = 1; data->length = ie[1]; - memcpy(ssid, ie + 2, data->length); + if (data->length > IW_ESSID_MAX_SIZE) + ret = -EINVAL; + else + memcpy(ssid, ie + 2, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { @@ -229,7 +233,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, } wdev_unlock(wdev); - return 0; + return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6aee9f5e8e71..c34f7d077604 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: @@ -1062,7 +1062,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); x25_insert_socket(make); diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 5c111bc3c8ea..00e782335cb0 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -55,7 +55,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { - queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf); + queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 16d5f353163a..3049af269fbf 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -27,6 +27,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_add_rcu(&xs->list, &umem->xsk_list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); @@ -36,6 +39,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_del_rcu(&xs->list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index fa8fbb8fa3c8..6040bc2b0088 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -196,7 +196,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len; @@ -212,7 +212,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } -void xsk_flush(struct xdp_sock *xs) +static void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); xs->sk.sk_data_ready(&xs->sk); @@ -264,6 +264,35 @@ out_unlock: return err; } +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del_clearprev(&xs->flush_node); + } +} + void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { xskq_produce_flush_addr_n(umem->cq, nb_entries); @@ -305,9 +334,8 @@ out: } EXPORT_SYMBOL(xsk_umem_consume_tx); -static int xsk_zc_xmit(struct sock *sk) +static int xsk_zc_xmit(struct xdp_sock *xs) { - struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, @@ -327,11 +355,10 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, - size_t total_len) +static int xsk_generic_xmit(struct sock *sk) { - u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); + u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; @@ -394,6 +421,18 @@ out: return err; } +static int __xsk_sendmsg(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->tx)) + return -ENOBUFS; + + return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); @@ -402,21 +441,18 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (unlikely(!xsk_is_bound(xs))) return -ENXIO; - if (unlikely(!(xs->dev->flags & IFF_UP))) - return -ENETDOWN; - if (unlikely(!xs->tx)) - return -ENOBUFS; - if (need_wait) + if (unlikely(need_wait)) return -EOPNOTSUPP; - return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); + return __xsk_sendmsg(sk); } static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { unsigned int mask = datagram_poll(file, sock, wait); - struct xdp_sock *xs = xdp_sk(sock->sk); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; struct xdp_umem *umem; @@ -426,9 +462,14 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, dev = xs->dev; umem = xs->umem; - if (umem->need_wakeup) - dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, - umem->need_wakeup); + if (umem->need_wakeup) { + if (dev->netdev_ops->ndo_xsk_wakeup) + dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, + umem->need_wakeup); + else + /* Poll needs to drive Tx also in copy mode */ + __xsk_sendmsg(sk); + } if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 0f5131bc3342..7ac1542feaf8 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -732,30 +732,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) -{ - struct xfrm_if *xi; - LIST_HEAD(list); - - xi = rtnl_dereference(xfrmn->xfrmi[0]); - if (!xi) - return; - - unregister_netdevice_queue(xi->dev, &list); - unregister_netdevice_many(&list); -} - -static void __net_exit xfrmi_exit_net(struct net *net) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - rtnl_lock(); - xfrmi_destroy_interfaces(xfrmn); - rtnl_unlock(); -} - static struct pernet_operations xfrmi_net_ops = { - .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; |