diff options
Diffstat (limited to 'net')
234 files changed, 8667 insertions, 2614 deletions
diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 6c11cdf4dd4c..fbd0c5e7b299 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) dev_kfree_skb(skb); goto as_indicate_complete; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk)); sk->sk_state_change(sk); diff --git a/net/atm/svc.c b/net/atm/svc.c index 908cbb8654f5..ba144d035e3d 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -381,7 +381,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index bb222b882b67..324306d6fde0 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1384,7 +1384,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; out: diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index dcdbaeeb2358..cd6afe895db9 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -356,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, make->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); bh_unlock_sock(sk); } else { if (!mine) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 64054edc2e3c..4ff6cf1ecae7 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1085,7 +1085,6 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); - spin_lock_init(&hard_iface->bat_v.aggr_list_lock); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8033f24f506c..714ce56cfcc8 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -152,7 +152,7 @@ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ @@ -163,7 +163,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } @@ -174,17 +174,13 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * * Empties the OGMv2 aggregation queue and frees all the skbs it contained. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { - struct sk_buff *skb; - - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); - - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) - kfree_skb(skb); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } @@ -197,7 +193,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) * * The aggregation queue is empty after this call. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { @@ -206,7 +202,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) unsigned int ogm_len; struct sk_buff *skb; - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; @@ -220,7 +216,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) { + while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); @@ -247,13 +243,13 @@ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, return; } - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); - skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** @@ -392,9 +388,9 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } @@ -425,9 +421,9 @@ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 6967f2e4c3f4..c7b340ddd0e7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.4" +#define BATADV_SOURCE_VERSION "2019.5" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d5bdf3a4b65..f9ec8e7507b6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1421,7 +1421,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (*orig) return BATADV_FORW_SINGLE; - /* fall through */ + fallthrough; case 0: return BATADV_FORW_NONE; default: diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5ee8e9a100f9..832e156c519e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> @@ -230,7 +229,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, break; } - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -455,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface, if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4d7f1baee7b7..47718a82eaf2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -130,9 +130,6 @@ struct batadv_hard_iface_bat_v { /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */ unsigned int aggr_len; - /** @aggr_list_lock: protects aggr_list */ - spinlock_t aggr_list_lock; - /** * @throughput_override: throughput override to disable link * auto-detection diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 5f508c50649d..3fd124927d4d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -173,7 +173,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else release_sock(sk); - parent->sk_ack_backlog++; + sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -185,7 +185,7 @@ void bt_accept_unlink(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); - bt_sk(sk)->parent->sk_ack_backlog--; + sk_acceptq_removed(bt_sk(sk)->parent); bt_sk(sk)->parent = NULL; sock_put(sk); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ad5b0ac1f9ce..7ff92dd4c53c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -934,6 +934,14 @@ static void hci_req_directed_advertising(struct hci_request *req, return; memset(&cp, 0, sizeof(cp)); + + /* Some controllers might reject command if intervals are not + * within range for undirected advertising. + * BCM20702A0 is known to be affected by this. + */ + cp.min_interval = cpu_to_le16(0x0020); + cp.max_interval = cpu_to_le16(0x0020); + cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 04bc79359a17..0cc9ce917222 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -842,8 +842,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { struct hci_cp_le_write_def_data_len cp; - cp.tx_len = hdev->le_max_tx_len; - cp.tx_time = hdev->le_max_tx_time; + cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); + cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); } @@ -4440,7 +4440,14 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + /* If the device has been opened in HCI_USER_CHANNEL, + * the userspace has exclusive access to device. + * When device is HCI_INIT, we still need to process + * the data packets to the driver in order + * to complete its setup(). + */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 26e8cfad22b8..6b42be4b5861 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -502,15 +502,12 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; u8 hash[3]; int err; if (!chan || !chan->data) return false; - smp = chan->data; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); @@ -523,14 +520,11 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; - smp = chan->data; - get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1153bbcdff72..0be4497cb832 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -218,10 +218,18 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + FIELD_SIZEOF(struct __sk_buff, cb), + offsetof(struct __sk_buff, tstamp))) + return -EINVAL; + + /* tstamp is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + + FIELD_SIZEOF(struct __sk_buff, tstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->priority = __skb->priority; + skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); return 0; @@ -235,6 +243,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) return; __skb->priority = skb->priority; + __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b1d3248c0252..4877a0db16c6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -75,8 +75,9 @@ static inline unsigned long hold_time(const struct net_bridge *br) static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - return !fdb->is_static && !fdb->added_by_external_learn && - time_before_eq(fdb->updated + hold_time(br), jiffies); + return !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && + time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) @@ -197,7 +198,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, { trace_fdb_delete(br, f); - if (f->is_static) + if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); @@ -224,7 +225,7 @@ static void fdb_delete_local(struct net_bridge *br, if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } @@ -235,7 +236,7 @@ static void fdb_delete_local(struct net_bridge *br, if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -250,7 +251,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); - if (f && f->is_local && !f->added_by_user && f->dst == p) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } @@ -265,7 +267,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { - if (f->dst == p && f->is_local && !f->added_by_user) { + if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); @@ -306,7 +309,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -321,7 +325,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -346,7 +351,8 @@ void br_fdb_cleanup(struct work_struct *work) hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer; - if (f->is_static || f->added_by_external_learn) + if (test_bit(BR_FDB_STATIC, &f->flags) || + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) continue; this_timer = f->updated + delay; if (time_after(this_timer, now)) { @@ -373,7 +379,7 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); @@ -397,10 +403,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, continue; if (!do_all) - if (f->is_static || (vid && f->key.vlan_id != vid)) + if (test_bit(BR_FDB_STATIC, &f->flags) || + (vid && f->key.vlan_id != vid)) continue; - if (f->is_local) + if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); @@ -469,8 +476,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) + fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); + if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; @@ -484,8 +491,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, - unsigned char is_local, - unsigned char is_static) + unsigned long flags) { struct net_bridge_fdb_entry *fdb; @@ -494,12 +500,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; fdb->key.vlan_id = vid; - fdb->is_local = is_local; - fdb->is_static = is_static; - fdb->added_by_user = 0; - fdb->added_by_external_learn = 0; - fdb->offloaded = 0; - fdb->is_sticky = 0; + fdb->flags = flags; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, @@ -526,14 +527,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, /* it is okay to have multiple ports with same * address, just use the first one. */ - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } - fdb = fdb_create(br, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, + BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; @@ -555,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user) + const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -564,15 +566,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, if (hold_time(br) == 0) return; - /* ignore packets unless we are using this port */ - if (!(source->state == BR_STATE_LEARNING || - source->state == BR_STATE_FORWARDING)) - return; - fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ - if (unlikely(fdb->is_local)) { + if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); @@ -580,30 +577,30 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && !fdb->is_sticky)) { + if (unlikely(source != fdb->dst && + !test_bit(BR_FDB_STICKY, &fdb->flags))) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ - if (unlikely(fdb->added_by_external_learn)) - fdb->added_by_external_learn = 0; + if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags))) + clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags); } if (now != fdb->updated) fdb->updated = now; - if (unlikely(added_by_user)) - fdb->added_by_user = 1; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { - trace_br_fdb_update(br, source, addr, vid, added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, - added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts @@ -616,9 +613,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; - else if (fdb->is_static) + else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; @@ -648,11 +645,11 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); - if (fdb->offloaded) + if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; - if (fdb->added_by_external_learn) + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (fdb->is_sticky) + if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) @@ -799,7 +796,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, u8 ndm_flags) { - u8 is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -823,7 +820,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; @@ -840,34 +837,28 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { - fdb->is_local = 1; - if (!fdb->is_static) { - fdb->is_static = 1; + set_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else if (state & NUD_NOARP) { - fdb->is_local = 0; - if (!fdb->is_static) { - fdb->is_static = 1; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else { - fdb->is_local = 0; - if (fdb->is_static) { - fdb->is_static = 0; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); - } } modified = true; } - if (is_sticky != fdb->is_sticky) { - fdb->is_sticky = is_sticky; + if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { + change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { @@ -890,9 +881,12 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, br->dev->name); return -EINVAL; } + if (!nbp_state_should_learn(p)) + return 0; + local_bh_disable(); rcu_read_lock(); - br_fdb_update(br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { @@ -1064,7 +1058,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) @@ -1078,7 +1072,7 @@ done: rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!tmp->is_static) + if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; @@ -1097,7 +1091,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); @@ -1119,14 +1113,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(br, p, addr, vid, 0, 0); + unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); + + if (swdev_notify) + flags |= BIT(BR_FDB_ADDED_BY_USER); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } - if (swdev_notify) - fdb->added_by_user = 1; - fdb->added_by_external_learn = 1; fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; @@ -1136,17 +1131,17 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (fdb->added_by_external_learn) { + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!fdb->added_by_user) { + } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ - fdb->added_by_external_learn = 1; + set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (swdev_notify) - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); @@ -1168,7 +1163,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb && fdb->added_by_external_learn) + if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; @@ -1186,8 +1181,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb) - fdb->offloaded = offloaded; + if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) + change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } @@ -1206,7 +1201,7 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid) spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) - f->offloaded = 0; + clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 09b1dd8cd853..8944ceb47fe9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -88,7 +88,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -151,7 +151,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { unsigned long now = jiffies; - if (dst->is_local) + if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) @@ -182,9 +182,10 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && + nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ce2ab14ee605..36b0367ca1e0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -172,6 +172,16 @@ struct net_bridge_vlan_group { u16 pvid; }; +/* bridge fdb flags */ +enum { + BR_FDB_LOCAL, + BR_FDB_STATIC, + BR_FDB_STICKY, + BR_FDB_ADDED_BY_USER, + BR_FDB_ADDED_BY_EXT_LEARN, + BR_FDB_OFFLOADED, +}; + struct net_bridge_fdb_key { mac_addr addr; u16 vlan_id; @@ -183,12 +193,7 @@ struct net_bridge_fdb_entry { struct net_bridge_fdb_key key; struct hlist_node fdb_node; - unsigned char is_local:1, - is_static:1, - is_sticky:1, - added_by_user:1, - added_by_external_learn:1, - offloaded:1; + unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; @@ -495,6 +500,11 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) return true; } +static inline bool nbp_state_should_learn(const struct net_bridge_port *p) +{ + return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { @@ -566,7 +576,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user); + const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 921310d3cbae..015209bf44aa 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -129,15 +129,19 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; } } diff --git a/net/caif/Kconfig b/net/caif/Kconfig index eb83051c8330..b7532a79ca7a 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -13,11 +13,11 @@ menuconfig CAIF with its modems. It is accessed from user space as sockets (PF_CAIF). Say Y (or M) here if you build for a phone product (e.g. Android or - MeeGo ) that uses CAIF as transport, if unsure say N. + MeeGo) that uses CAIF as transport. If unsure say N. If you select to build it as module then CAIF_NETDEV also needs to be - built as modules. You will also need to say yes to any CAIF physical - devices that your platform requires. + built as a module. You will also need to say Y (or M) to any CAIF + physical devices that your platform requires. See Documentation/networking/caif for a further explanation on how to use and configure CAIF. @@ -37,7 +37,7 @@ config CAIF_NETDEV default CAIF ---help--- Say Y if you will be using a CAIF based GPRS network device. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say Y. @@ -48,7 +48,7 @@ config CAIF_USB default n ---help--- Say Y if you are using CAIF over USB CDC NCM. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say N. diff --git a/net/core/dev.c b/net/core/dev.c index 99ac84ff398f..1c799d486623 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -229,6 +229,122 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -238,7 +354,7 @@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -256,7 +372,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -652,14 +768,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -677,14 +789,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -1060,8 +1168,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1077,7 +1185,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device @@ -1151,13 +1258,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1536,6 +1643,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1554,8 +1717,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1568,17 +1729,9 @@ int register_netdevice_notifier(struct notifier_block *nb) if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1587,22 +1740,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } - -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1653,6 +1793,80 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto chain_unregister; + +unlock: + rtnl_unlock(); + return err; + +chain_unregister: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + goto unlock; + + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +/** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data @@ -1664,7 +1878,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -2690,7 +2915,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); @@ -2858,12 +3083,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -2898,12 +3120,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); @@ -3386,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && qdisc_run_begin(q)) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { @@ -5582,6 +5803,26 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -5589,12 +5830,13 @@ static void napi_skb_free_stolen_head(struct sk_buff *skb) kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5626,7 +5868,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5672,26 +5914,6 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) @@ -8532,6 +8754,9 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -9011,6 +9236,11 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -9132,6 +9362,8 @@ out: return ret; err_uninit: + if (dev->name_node) + netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) @@ -9946,6 +10178,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: diff --git a/net/core/devlink.c b/net/core/devlink.c index f80151eeaf51..2e027c9436e0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -95,16 +95,25 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -static struct net *devlink_net(const struct devlink *devlink) +struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } +EXPORT_SYMBOL_GPL(devlink_net); -static void devlink_net_set(struct devlink *devlink, struct net *net) +static void __devlink_net_set(struct devlink *devlink, struct net *net) { write_pnet(&devlink->_net, net); } +void devlink_net_set(struct devlink *devlink, struct net *net) +{ + if (WARN_ON(devlink->registered)) + return; + __devlink_net_set(devlink, net); +} +EXPORT_SYMBOL_GPL(devlink_net_set); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -434,8 +443,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - devlink = devlink_get_from_info(info); - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + /* When devlink changes netns, it would not be found + * by devlink_get_from_info(). So try if it is stored first. + */ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + devlink = info->user_ptr[0]; + } else { + devlink = devlink_get_from_info(info); + WARN_ON(IS_ERR(devlink)); + } + if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -1035,7 +1052,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1058,6 +1075,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1233,7 +1253,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1256,6 +1276,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1460,7 +1483,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1485,6 +1508,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -2674,6 +2700,72 @@ devlink_resources_validate(struct devlink *devlink, return err; } +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd); + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *dest_net) +{ + struct devlink_param_item *param_item; + + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); + + __devlink_net_set(devlink, dest_net); + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); +} + static bool devlink_reload_supported(struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; @@ -2694,9 +2786,30 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static int devlink_reload(struct devlink *devlink, struct net *dest_net, + struct netlink_ext_ack *extack) +{ + int err; + + if (!devlink->reload_enabled) + return -EOPNOTSUPP; + + err = devlink->ops->reload_down(devlink, !!dest_net, extack); + if (err) + return err; + + if (dest_net && !net_eq(dest_net, devlink_net(devlink))) + devlink_reload_netns_change(devlink, dest_net); + + err = devlink->ops->reload_up(devlink, extack); + devlink_reload_failed_set(devlink, !!err); + return err; +} + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct net *dest_net = NULL; int err; if (!devlink_reload_supported(devlink)) @@ -2707,11 +2820,20 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - err = devlink->ops->reload_down(devlink, info->extack); - if (err) - return err; - err = devlink->ops->reload_up(devlink, info->extack); - devlink_reload_failed_set(devlink, !!err); + + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, info->extack); + + if (dest_net) + put_net(dest_net); + return err; } @@ -3155,7 +3277,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3183,6 +3305,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3411,7 +3536,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3444,6 +3569,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3818,29 +3946,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = 0; + struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto out_free; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) { @@ -3917,7 +4035,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); - kfree(attrs); return skb->len; @@ -3927,8 +4044,6 @@ out_unlock: mutex_unlock(&devlink->lock); out_dev: mutex_unlock(&devlink_mutex); -out_free: - kfree(attrs); return err; } @@ -4066,7 +4181,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -4094,6 +4209,9 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -4732,14 +4850,17 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, struct netlink_ext_ack *extack) { int err; + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + if (!reporter->ops->recover) return -EOPNOTSUPP; - err = reporter->ops->recover(reporter, priv_ctx); + err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; @@ -4760,7 +4881,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter) } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { int err; @@ -4781,7 +4903,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx); + priv_ctx, extack); if (err) goto dump_err; @@ -4828,11 +4950,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); + devlink_health_do_dump(reporter, priv_ctx, NULL); mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, + priv_ctx, NULL); return 0; } @@ -4867,21 +4990,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; struct devlink *devlink; - struct nlattr **attrs; - int err; - - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return NULL; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto free; mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); @@ -4890,12 +5002,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); mutex_unlock(&devlink_mutex); - kfree(attrs); return reporter; unlock: mutex_unlock(&devlink_mutex); -free: - kfree(attrs); return NULL; } @@ -5084,7 +5193,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL, info->extack); devlink_health_reporter_put(reporter); return err; @@ -5117,7 +5226,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (err) goto out; - err = reporter->ops->diagnose(reporter, fmsg); + err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; @@ -5152,7 +5261,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, } mutex_lock(&reporter->dump_lock); if (!start) { - err = devlink_health_do_dump(reporter, NULL); + err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; cb->args[1] = reporter->dump_ts; @@ -5793,6 +5902,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6023,7 +6135,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, @@ -6071,7 +6184,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | @@ -6155,7 +6269,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; - devlink_net_set(devlink, &init_net); + __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); @@ -6181,6 +6295,7 @@ int devlink_register(struct devlink *devlink, struct device *dev) { mutex_lock(&devlink_mutex); devlink->dev = dev; + devlink->registered = true; list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -6196,6 +6311,8 @@ EXPORT_SYMBOL_GPL(devlink_register); void devlink_unregister(struct devlink *devlink) { mutex_lock(&devlink_mutex); + WARN_ON(devlink_reload_supported(devlink) && + devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); list_del(&devlink->list); mutex_unlock(&devlink_mutex); @@ -6203,6 +6320,41 @@ void devlink_unregister(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_unregister); /** + * devlink_reload_enable - Enable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at end of device initialization + * process when reload operation is supported. + */ +void devlink_reload_enable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->reload_enabled = true; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_enable); + +/** + * devlink_reload_disable - Disable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at the beginning of device cleanup + * process when reload operation is supported. + */ +void devlink_reload_disable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + /* Mutex is taken which ensures that no reload operation is in + * progress while setting up forbidded flag. + */ + devlink->reload_enabled = false; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_disable); + +/** * devlink_free - Free devlink instance resources * * @devlink: devlink @@ -7490,6 +7642,21 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8060,9 +8227,43 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (net_eq(devlink_net(devlink), net)) { + if (WARN_ON(!devlink_reload_supported(devlink))) + continue; + err = devlink_reload(devlink, &init_net, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } + } + mutex_unlock(&devlink_mutex); +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + static int __init devlink_init(void) { - return genl_register_family(&devlink_nl_family); + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; } subsys_initcall(devlink_init); diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 470a606d5e8d..fc96259807b6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -12,17 +12,15 @@ static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; }; -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; - info->net = net; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } @@ -31,106 +29,100 @@ EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; - info->net = net; - err = atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -static unsigned int fib_seq_sum(void) +static unsigned int fib_seq_sum(struct net *net) { - struct fib_notifier_net *fn_net; + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - struct net *net; rtnl_lock(); - down_read(&net_rwsem); - for_each_net(net) { - fn_net = net_generic(net, fib_notifier_net_id); - rcu_read_lock(); - list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - if (!try_module_get(ops->owner)) - continue; - fib_seq += ops->fib_seq_read(net); - module_put(ops->owner); - } - rcu_read_unlock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); } - up_read(&net_rwsem); + rcu_read_unlock(); rtnl_unlock(); return fib_seq; } -static int fib_net_dump(struct net *net, struct notifier_block *nb) +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; + int err = 0; + rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - int err; - if (!try_module_get(ops->owner)) continue; - err = ops->fib_dump(net, nb); + err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) - return err; + goto unlock; } - return 0; +unlock: + rcu_read_unlock(); + + return err; } -static bool fib_dump_is_consistent(struct notifier_block *nb, +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) return true; - atomic_notifier_chain_unregister(&fib_chain, nb); + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) { int retries = 0; int err; do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - rcu_read_lock(); - for_each_net_rcu(net) { - err = fib_net_dump(net, nb); - if (err) - goto err_fib_net_dump; - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; - -err_fib_net_dump: - rcu_read_unlock(); - return err; } EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); @@ -181,6 +173,7 @@ static int __net_init fib_notifier_net_init(struct net *net) struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index dd220ce7ca7a..3e7e15278c46 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -321,16 +321,18 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib_rule *rule, int family) + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, + .info.extack = extack, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -350,20 +352,25 @@ static int call_fib_rule_notifiers(struct net *net, } /* Called with rcu_read_lock() */ -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; + int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, - family); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } rules_ops_put(ops); - return 0; + return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); diff --git a/net/core/filter.c b/net/core/filter.c index 3fed5755494b..fc303abec8fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2245,7 +2245,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg.copy[i] && bytes_sg_total <= len) + if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2450,7 +2450,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - msg->sg.copy[new] = false; + __clear_bit(new, &msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static u32 bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 68eda10d0680..ca871657a4c4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,19 +114,50 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, { struct bpf_prog *attached; struct net *net; + int ret = 0; net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); + if (attached) + bpf_prog_put(attached); +out: mutex_unlock(&flow_dissector_mutex); - return 0; + return ret; } int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) @@ -147,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -203,6 +213,72 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @toff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -853,7 +929,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -910,7 +985,10 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(init_net.flow_dissector_prog); + + if (!attached) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { struct bpf_flow_keys flow_keys; @@ -1295,6 +1373,12 @@ ip_proto_again: data, nhoff, hlen); break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + default: break; } @@ -1308,14 +1392,6 @@ ip_proto_again: data, hlen); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } - /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: @@ -1365,8 +1441,8 @@ static const void *flow_keys_hash_start(const struct flow_keys *flow) static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); - BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != - sizeof(*flow) - sizeof(flow->addrs)); + + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: @@ -1412,6 +1488,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index bfe7bdd4c340..80dbf2f4016e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -48,7 +48,7 @@ struct net_rate_estimator { u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; - u32 last_packets; + u64 last_packets; u64 last_bytes; u64 avpps; @@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t) brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); + rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 36888f5e09eb..1d653fbfcf52 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - u64 bytes; - u32 packets; + u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); @@ -176,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running, if (d->tail) { struct gnet_stats_basic sb; + int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, type, &sb, sizeof(sb), - TCA_STATS_PAD); + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats.packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, + sizeof(bstats.packets), TCA_STATS_PAD); } return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5480edff0c86..652da6369037 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1197,7 +1197,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1476,7 +1476,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { @@ -2052,8 +2052,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 36347933ec3a..6bbd06f7dc7d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff struct hlist_head *h; unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { if (++count == offset) return dev; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 48b1e429857c..294bfcf0ce0e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3404,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) HARD_TX_LOCK(odev, txq, smp_processor_id()); if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { - ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c81cd80114d9..000eddb1207d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void) return xdp_size; } +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1027,6 +1040,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + 0; } @@ -1584,6 +1598,42 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1697,6 +1747,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure_rcu; rcu_read_unlock(); + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1750,6 +1803,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2723,6 +2779,26 @@ errout: return err; } +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *ifname_attr, + struct nlattr *altifname_attr, + char *ifname) +{ + char buffer[ALTIFNAMSIZ]; + + if (!ifname) { + ifname = buffer; + if (ifname_attr) + nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else if (altifname_attr) + nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + else + return NULL; + } + + return __dev_get_by_name(net, ifname); +} + static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2751,8 +2827,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else goto errout; @@ -2825,7 +2901,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; @@ -2839,9 +2914,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); @@ -2853,8 +2925,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -3025,12 +3098,10 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - } + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); + else + dev = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); @@ -3292,6 +3363,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, switch (i) { case IFLA_IFNAME: + case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; @@ -3310,7 +3382,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -3333,9 +3404,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -3343,8 +3411,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else goto out; @@ -3374,6 +3443,100 @@ out: return err; } +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + alt_ifname = nla_data(attr); + if (cmd == RTM_NEWLINKPROP) { + alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + err = netdev_name_node_alt_create(dev, alt_ifname); + if (err) { + kfree(alt_ifname); + return err; + } + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + if (err) + return err; + } else { + WARN_ON(1); + return 0; + } + + *changed = true; + return 0; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -5332,6 +5495,9 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); diff --git a/net/core/sock.c b/net/core/sock.c index ac78a570e43a..71787f7c4f8c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; - int size; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; @@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; - size = sizeof(old_tv); - } else { - *(struct __kernel_sock_timeval *)optval = tv; - size = sizeof(tv); + return sizeof(old_tv); } - return size; + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); } static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) @@ -687,7 +684,8 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -3015,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt);; + sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } @@ -3042,7 +3040,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, } EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; diff --git a/net/core/xdp.c b/net/core/xdp.c index d7bf62ffbb5e..20781ad5f9c3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame - * while still leveraging this protection. The @napi_direct boolian + * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5bad08dc4316..a52e8ba1ced0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3349ea81f901..e19a92a62e14 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1091,7 +1091,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } cb = DN_SKB_CB(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); if (newsk == NULL) { release_sock(sk); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index e4161e0c86aa..c68503a18025 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -328,7 +328,7 @@ static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) return; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); } diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 43120a3fb06f..17281fec710c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return dsa_is_user_port(ds, p) && ds->ports[p].slave; + const struct dsa_port *dp = dsa_to_port(ds, p); + + return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].slave); + ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].slave); + ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -329,6 +331,91 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_dsa_notifiers); +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_get(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_get); + +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_set(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_set); + +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + return devlink_params_register(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_register); + +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + devlink_params_unregister(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); + +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + return devlink_resource_register(ds->devlink, resource_name, + resource_size, resource_id, + parent_resource_id, + size_params); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds) +{ + devlink_resources_unregister(ds->devlink, NULL); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + return devlink_resource_occ_get_register(ds->devlink, resource_id, + occ_get, occ_get_priv); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); + +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id) +{ + devlink_resource_occ_get_unregister(ds->devlink, resource_id); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 716d265ba8ca..9ef2caa13f27 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -45,6 +45,10 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) dst->index = index; + INIT_LIST_HEAD(&dst->rtable); + + INIT_LIST_HEAD(&dst->ports); + INIT_LIST_HEAD(&dst->list); list_add_tail(&dst->list, &dsa_tree_list); @@ -111,24 +115,38 @@ static bool dsa_port_is_user(struct dsa_port *dp) static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + list_for_each_entry(dp, &dst->ports, list) + if (dp->dn == dn) + return dp; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + return NULL; +} - if (dp->dn == dn) - return dp; - } - } +struct dsa_link *dsa_link_touch(struct dsa_port *dp, struct dsa_port *link_dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst; + struct dsa_link *dl; - return NULL; + dst = ds->dst; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp == dp && dl->link_dp == link_dp) + return dl; + + dl = kzalloc(sizeof(*dl), GFP_KERNEL); + if (!dl) + return NULL; + + dl->dp = dp; + dl->link_dp = link_dp; + + INIT_LIST_HEAD(&dl->list); + list_add_tail(&dl->list, &dst->rtable); + + return dl; } static bool dsa_port_setup_routing_table(struct dsa_port *dp) @@ -138,6 +156,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) struct device_node *dn = dp->dn; struct of_phandle_iterator it; struct dsa_port *link_dp; + struct dsa_link *dl; int err; of_for_each_phandle(&it, err, dn, "link", NULL, 0) { @@ -147,24 +166,22 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) return false; } - ds->rtable[link_dp->ds->index] = dp->index; + dl = dsa_link_touch(dp, link_dp); + if (!dl) { + of_node_put(it.node); + return false; + } } return true; } -static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { bool complete = true; struct dsa_port *dp; - int i; - - for (i = 0; i < DSA_MAX_SWITCHES; i++) - ds->rtable[i] = DSA_RTABLE_NONE; - - for (i = 0; i < ds->num_ports; i++) { - dp = &ds->ports[i]; + list_for_each_entry(dp, &dst->ports, list) { if (dsa_port_is_dsa(dp)) { complete = dsa_port_setup_routing_table(dp); if (!complete) @@ -175,81 +192,42 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) return complete; } -static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - bool complete = true; - int device; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - complete = dsa_switch_setup_routing_table(ds); - if (!complete) - break; - } - - return complete; -} - static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_cpu(dp)) - return dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + return dp; return NULL; } static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - struct dsa_port *dp; - int device, port; + struct dsa_port *cpu_dp, *dp; - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = dsa_tree_find_first_cpu(dst); - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); + cpu_dp = dsa_tree_find_first_cpu(dst); + if (!cpu_dp) { + pr_err("DSA: tree %d has no CPU port\n", dst->index); return -EINVAL; } /* Assign the default CPU port to all ports of the fabric */ - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = dst->cpu_dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; return 0; } static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) { - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = NULL; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = NULL; } static int dsa_port_setup(struct dsa_port *dp) @@ -265,6 +243,9 @@ static int dsa_port_setup(struct dsa_port *dp) bool dsa_port_enabled = false; int err = 0; + if (dp->setup) + return 0; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -333,14 +314,21 @@ static int dsa_port_setup(struct dsa_port *dp) dsa_port_link_unregister_of(dp); if (err && devlink_port_registered) devlink_port_unregister(dlp); + if (err) + return err; - return err; + dp->setup = true; + + return 0; } static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) + return; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; @@ -363,11 +351,17 @@ static void dsa_port_teardown(struct dsa_port *dp) } break; } + + dp->setup = false; } static int dsa_switch_setup(struct dsa_switch *ds) { - int err = 0; + struct dsa_devlink_priv *dl_priv; + int err; + + if (ds->setup) + return 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -379,9 +373,11 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, 0); + ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); if (!ds->devlink) return -ENOMEM; + dl_priv = devlink_priv(ds->devlink); + dl_priv->ds = ds; err = devlink_register(ds->devlink, ds->dev); if (err) @@ -395,6 +391,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + devlink_params_publish(ds->devlink); + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { @@ -409,6 +407,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto unregister_notifier; } + ds->setup = true; + return 0; unregister_notifier: @@ -424,6 +424,9 @@ free_devlink: static void dsa_switch_teardown(struct dsa_switch *ds) { + if (!ds->setup) + return; + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); @@ -438,95 +441,72 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->devlink = NULL; } + ds->setup = false; } static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port, i; - int err = 0; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + int err; - err = dsa_switch_setup(ds); + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_switch_setup(dp->ds); if (err) - goto switch_teardown; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + goto teardown; + } - err = dsa_port_setup(dp); - if (err) - goto ports_teardown; - } + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_port_setup(dp); + if (err) + goto teardown; } return 0; -ports_teardown: - for (i = 0; i < port; i++) - dsa_port_teardown(&ds->ports[i]); +teardown: + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_switch_teardown(ds); - -switch_teardown: - for (i = 0; i < device; i++) { - ds = dst->ds[i]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); } static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; + int err; - /* DSA currently supports a single pair of CPU port and master device */ - return dsa_master_setup(master, cpu_dp); + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_cpu(dp)) { + err = dsa_master_setup(dp->master, dp); + if (err) + return err; + } + } + + return 0; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; - return dsa_master_teardown(master); + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + dsa_master_teardown(dp->master); } static int dsa_tree_setup(struct dsa_switch_tree *dst) @@ -572,6 +552,8 @@ teardown_default_cpu: static void dsa_tree_teardown(struct dsa_switch_tree *dst) { + struct dsa_link *dl, *next; + if (!dst->setup) return; @@ -581,39 +563,36 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } + pr_info("DSA: tree %d torn down\n", dst->index); dst->setup = false; } -static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, - unsigned int index) +static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { - dsa_tree_teardown(dst); + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp; - dst->ds[index] = NULL; - dsa_tree_put(dst); -} + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == index) + return dp; -static int dsa_tree_add_switch(struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - unsigned int index = ds->index; - int err; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return NULL; - if (dst->ds[index]) - return -EBUSY; + dp->ds = ds; + dp->index = index; - dsa_tree_get(dst); - dst->ds[index] = ds; + INIT_LIST_HEAD(&dp->list); + list_add_tail(&dp->list, &dst->ports); - err = dsa_tree_setup(dst); - if (err) { - dst->ds[index] = NULL; - dsa_tree_put(dst); - } - - return err; + return dp; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -708,7 +687,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, goto out_put_node; } - dp = &ds->ports[reg]; + dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); if (err) @@ -732,8 +711,6 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return sz; ds->index = m[1]; - if (ds->index >= DSA_MAX_SWITCHES) - return -EINVAL; ds->dst = dsa_tree_touch(m[0]); if (!ds->dst) @@ -742,6 +719,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return 0; } +static int dsa_switch_touch_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int port; + + for (port = 0; port < ds->num_ports; port++) { + dp = dsa_port_touch(ds, port); + if (!dp) + return -ENOMEM; + } + + return 0; +} + static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) { int err; @@ -750,6 +741,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) if (err) return err; + err = dsa_switch_touch_ports(ds); + if (err) + return err; + return dsa_switch_parse_ports_of(ds, dn); } @@ -787,7 +782,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, for (i = 0; i < DSA_MAX_PORTS; i++) { name = cd->port_names[i]; dev = cd->netdev[i]; - dp = &ds->ports[i]; + dp = dsa_to_port(ds, i); if (!name) continue; @@ -807,6 +802,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) { + int err; + ds->cd = cd; /* We don't support interconnected switches nor multiple trees via @@ -817,22 +814,29 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) if (!ds->dst) return -ENOMEM; - return dsa_switch_parse_ports(ds, cd); -} - -static int dsa_switch_add(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst = ds->dst; + err = dsa_switch_touch_ports(ds); + if (err) + return err; - return dsa_tree_add_switch(dst, ds); + return dsa_switch_parse_ports(ds, cd); } static int dsa_switch_probe(struct dsa_switch *ds) { - struct dsa_chip_data *pdata = ds->dev->platform_data; - struct device_node *np = ds->dev->of_node; + struct dsa_switch_tree *dst; + struct dsa_chip_data *pdata; + struct device_node *np; int err; + if (!ds->dev) + return -ENODEV; + + pdata = ds->dev->platform_data; + np = ds->dev->of_node; + + if (!ds->num_ports) + return -EINVAL; + if (np) err = dsa_switch_parse_of(ds, np); else if (pdata) @@ -843,29 +847,14 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (err) return err; - return dsa_switch_add(ds); -} - -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) -{ - struct dsa_switch *ds; - int i; - - ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL); - if (!ds) - return NULL; - - ds->dev = dev; - ds->num_ports = n; - - for (i = 0; i < ds->num_ports; ++i) { - ds->ports[i].index = i; - ds->ports[i].ds = ds; - } + dst = ds->dst; + dsa_tree_get(dst); + err = dsa_tree_setup(dst); + if (err) + dsa_tree_put(dst); - return ds; + return err; } -EXPORT_SYMBOL_GPL(dsa_switch_alloc); int dsa_register_switch(struct dsa_switch *ds) { @@ -883,9 +872,16 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - unsigned int index = ds->index; + struct dsa_port *dp, *next; - dsa_tree_remove_switch(dst, index); + dsa_tree_teardown(dst); + + list_for_each_entry_safe(dp, next, &dst->ports, list) { + list_del(&dp->list); + kfree(dp); + } + + dsa_tree_put(dst); } void dsa_unregister_switch(struct dsa_switch *ds) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12f8c7ee4dd8..53e7577896b6 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -104,25 +104,14 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; - struct dsa_switch *ds; - struct dsa_port *slave_port; + struct dsa_port *dp; - if (device < 0 || device >= DSA_MAX_SWITCHES) - return NULL; + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->index == device && dp->index == port && + dp->type == DSA_PORT_TYPE_USER) + return dp->slave; - ds = dst->ds[device]; - if (!ds) - return NULL; - - if (port < 0 || port >= ds->num_ports) - return NULL; - - slave_port = &ds->ports[port]; - - if (unlikely(slave_port->type != DSA_PORT_TYPE_USER)) - return NULL; - - return slave_port->slave; + return NULL; } /* port.c */ diff --git a/net/dsa/port.c b/net/dsa/port.c index 9b54e5a76297..6e93c36bf0c0 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -561,7 +561,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) struct dsa_switch *ds = dp->ds; struct phy_device *phydev; int port = dp->index; - int mode; + phy_interface_t mode; int err; err = of_phy_register_fixed_link(dn); @@ -574,8 +574,8 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) phydev = of_phy_find_device(dn); - mode = of_get_phy_mode(dn); - if (mode < 0) + err = of_get_phy_mode(dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; phydev->interface = mode; @@ -593,10 +593,11 @@ static int dsa_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; struct device_node *port_dn = dp->dn; - int mode, err; + phy_interface_t mode; + int err; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + err = of_get_phy_mode(port_dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = ds->dev; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 028e65f4b5ba..78ffc87dc25e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -789,6 +789,22 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, return phylink_ethtool_ksettings_set(dp->pl, cmd); } +static void dsa_slave_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + phylink_ethtool_get_pauseparam(dp->pl, pause); +} + +static int dsa_slave_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_set_pauseparam(dp->pl, pause); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) @@ -1192,6 +1208,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_pauseparam = dsa_slave_get_pauseparam, + .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, @@ -1295,11 +1313,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; + phy_interface_t mode; u32 phy_flags = 0; - int mode, ret; + int ret; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + ret = of_get_phy_mode(port_dn, &mode); + if (ret) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = &slave_dev->dev; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 6a9607518823..df4abe897ed6 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, int i; for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; + struct dsa_port *dp = dsa_to_port(ds, i); if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; @@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (unset_vlan_filtering) { struct switchdev_trans trans = {0}; - err = dsa_port_vlan_filtering(&ds->ports[info->port], + err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), false, &trans); if (err && err != EOPNOTSUPP) return err; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 9c1cc2482b68..bc5cb91bf052 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -31,15 +31,14 @@ * Must be transmitted as zero and ignored on receive. * * SWITCH_ID - VID[8:6]: - * Index of switch within DSA tree. Must be between 0 and - * DSA_MAX_SWITCHES - 1. + * Index of switch within DSA tree. Must be between 0 and 7. * * RSV - VID[5:4]: * To be used for further expansion of PORT or for other purposes. * Must be transmitted as zero and ignored on receive. * * PORT - VID[3:0]: - * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. + * Index of switch port. Must be between 0 and 15. */ #define DSA_8021Q_DIR_SHIFT 10 @@ -103,7 +102,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) if (!dsa_is_user_port(ds, port)) return 0; - slave = ds->ports[port].slave; + slave = dsa_to_port(ds, port)->slave; err = br_vlan_get_pvid(slave, &pvid); if (err < 0) @@ -118,7 +117,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) return err; } - return dsa_port_vid_add(&ds->ports[port], pvid, vinfo.flags); + return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); } /* If @enabled is true, installs @vid with @flags into the switch port's HW @@ -130,7 +129,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { - struct dsa_port *dp = &ds->ports[port]; + struct dsa_port *dp = dsa_to_port(ds, port); struct bridge_vlan_info vinfo; int err; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 17374afee28f..9040fe55e0f5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index ffcfcef76291..7c5a1aa5adb4 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -236,21 +236,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, - nl802154_policy, NULL); - if (err) - goto out_unlock; - *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - genl_family_attrbuf(&nl802154_fam)); + info->attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -557,17 +550,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { - struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - tb, nl802154_fam.maxattr, - nl802154_policy, NULL); - - /* TODO check if we can handle error here, - * we have no backward compatibility - */ - if (ret) - return 0; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr **tb = info->attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); @@ -2203,7 +2187,8 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, @@ -2343,7 +2328,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_KEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, @@ -2369,7 +2355,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, @@ -2395,7 +2382,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, @@ -2420,7 +2408,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70f92aaca411..53de8e00990e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include <net/netns/ipv4.h> #include <net/ip_fib.h> -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, @@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ab2fb6bb37d..b9df9c09b84e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,11 +74,13 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -2015,10 +2017,12 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,39 +2036,53 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, + KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4298aae74e0e..18068ed42f25 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -249,10 +249,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -262,14 +263,14 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { credit--; rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -682,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 480d0b22db1a..3b9c7a2725a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) } } -static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index eb30fc1770de..e4c6e8b40490 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -716,7 +716,7 @@ static void reqsk_timer_handler(struct timer_list *t) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7dc79b973e6e..af154977904c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; @@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_timer.expires - jiffies; - if (tmo < 0) - tmo = 0; - inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index be778599bfed..ff327a62c9ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base, base->total / inet_peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - delta = (__u32)jiffies - p->dtime; + + /* The READ_ONCE() pairs with the WRITE_ONCE() + * in inet_putpeer() + */ + delta = (__u32)jiffies - READ_ONCE(p->dtime); + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } @@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - p->dtime = (__u32)jiffies; + /* The WRITE_ONCE() pairs with itself (we run lockless) + * and the READ_ONCE() in inet_peer_gc() + */ + WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c59a78a267c3..24a95126e698 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -611,5 +611,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1452a97914a0..d4f84bf9289a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -34,6 +34,9 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; @@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) - return NULL; - res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); + src = &md->u.tun_info; + res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; - src = &md->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, @@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, dst->key.u.ipv4.dst = src->key.u.ipv4.src; dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; + ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), + src->options_len, 0); return res; } @@ -217,24 +221,199 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { + [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { + [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static const struct nla_policy +vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static int ip_tun_parse_opts_geneve(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + int data_len, err; + + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, + attr, geneve_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || + !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || + !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) + return -EINVAL; + + attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + if (info) { + struct geneve_opt *opt = ip_tunnel_info_opts(info); + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(attr); + attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(attr); + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int ip_tun_parse_opts_vxlan(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, + attr, vxlan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) + return -EINVAL; + + if (info) { + struct vxlan_metadata *md = ip_tunnel_info_opts(info); + + attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; + md->gbp = nla_get_u32(attr); + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + } + + return sizeof(struct vxlan_metadata); +} + +static int ip_tun_parse_opts_erspan(struct nlattr *attr, + struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + int err; + + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, + attr, erspan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) + return -EINVAL; + + if (info) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_VER]; + md->version = nla_get_u8(attr); + + if (md->version == 1 && tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(attr); + } else if (md->version == 2 && tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] && + tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(attr); + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(attr)); + } else { + return -EINVAL; + } + + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + } + + return sizeof(struct erspan_metadata); +} + +static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPTS_MAX + 1]; + int err; + + if (!attr) + return 0; + + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPTS_MAX, attr, + ip_opts_policy, extack); + if (err) + return err; + + if (tb[LWTUNNEL_IP_OPTS_GENEVE]) + err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE], + info, extack); + else if (tb[LWTUNNEL_IP_OPTS_VXLAN]) + err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN], + info, extack); + else if (tb[LWTUNNEL_IP_OPTS_ERSPAN]) + err = ip_tun_parse_opts_erspan(tb[LWTUNNEL_IP_OPTS_ERSPAN], + info, extack); + else + err = -EINVAL; + + return err; +} + +static int ip_tun_get_optlen(struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, NULL, extack); +} + +static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, info, extack); +} + static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -242,6 +421,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { @@ -266,10 +451,10 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags |= nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -285,6 +470,110 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) #endif } +static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct geneve_opt *opt; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); + if (!nest) + return -ENOMEM; + + opt = ip_tunnel_info_opts(tun_info); + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, nest); + return 0; +err: + nla_nest_cancel(skb, nest); + return -ENOMEM; +} + +static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nest; + int err = 0; + + if (!(tun_info->key.tun_flags & + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT))) + return 0; + + nest = nla_nest_start_noflag(skb, type); + if (!nest) + return -ENOMEM; + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + err = ip_tun_fill_encap_opts_geneve(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + err = ip_tun_fill_encap_opts_erspan(skb, tun_info); + + if (err) { + nla_nest_cancel(skb, nest); + return err; + } + + nla_nest_end(skb, nest); + return 0; +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -296,12 +585,42 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } +static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) +{ + int opt_len; + + if (!(info->key.tun_flags & + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT))) + return 0; + + opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_GENEVE */ + + nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + + nla_total_size(4); /* OPT_VXLAN_GBP */ + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + + nla_total_size(1) /* OPT_ERSPAN_VER */ + + nla_total_size(4); /* OPT_ERSPAN_INDEX/DIR/HWID */ + } + + return opt_len; +} + static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ @@ -309,13 +628,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { - return memcmp(lwt_tun_info(a), lwt_tun_info(b), - sizeof(struct ip_tunnel_info)); + struct ip_tunnel_info *info_a = lwt_tun_info(a); + struct ip_tunnel_info *info_b = lwt_tun_info(b); + + return memcmp(info_a, info_b, sizeof(info_a->key)) || + info_a->mode != info_b->mode || + info_a->options_len != info_b->options_len || + memcmp(ip_tunnel_info_opts(info_a), + ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { @@ -341,17 +668,21 @@ static int ip6_tun_build_state(struct nlattr *attr, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -359,6 +690,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); @@ -375,10 +712,10 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags |= nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -396,7 +733,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; @@ -409,7 +747,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9bcca08efec9..32e20b758b68 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1483,10 +1483,10 @@ static int __init ip_auto_config(void) * missing values. */ if (ic_myaddr == NONE || -#ifdef CONFIG_ROOT_NFS +#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT) (root_server_addr == NONE && ic_servaddr == NONE && - ROOT_DEV == Root_NFS) || + (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1513,6 +1513,12 @@ static int __init ip_auto_config(void) goto try_try_again; } #endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + pr_err("IP-Config: Retrying forever (CIFS root)...\n"); + goto try_try_again; + } +#endif if (--retries) { pr_err("IP-Config: Reopening network devices...\n"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 716d5472c022..440294bdb752 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -3040,10 +3042,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id, extack); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id, extack); + if (err) + return err; + } } return 0; diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..c94445b44d8c 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, if (icmph == NULL) return 1; - switch (icmph->type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - break; - default: + if (!icmp_is_err(icmph->type)) return 1; - } inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 621f83434b24..dcc4fa10138d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1894,10 +1894,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 535b69326f66..345b2b0ff618 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + u32 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; ireq = inet_rsk(req); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d8876f0e9672..9b48aec29aca 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1741,8 +1741,8 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; + u32 length = 0, seq, offset, zap_len; const skb_frag_t *frags = NULL; - u32 length = 0, seq, offset; struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; @@ -1769,12 +1769,12 @@ static int tcp_zerocopy_receive(struct sock *sk, seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); - zc->length &= ~(PAGE_SIZE - 1); - if (zc->length) { - zap_page_range(vma, address, zc->length); + zap_len = zc->length & ~(PAGE_SIZE - 1); + if (zap_len) { + zap_page_range(vma, address, zap_len); zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = inq; + zc->recv_skip_hint = zc->length; } ret = 0; while (length + PAGE_SIZE <= zc->length) { @@ -1958,8 +1958,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping_internal tss; - bool has_tss = false; - bool has_cmsg; + int cmsg_flags; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1974,7 +1973,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; - has_cmsg = tp->recvmsg_inq; + cmsg_flags = tp->recvmsg_inq ? 1 : 0; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2047,7 +2046,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -2157,8 +2156,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); - has_tss = true; - has_cmsg = true; + cmsg_flags |= 2; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -2183,10 +2181,10 @@ found_fin_ok: release_sock(sk); - if (has_cmsg) { - if (has_tss) + if (cmsg_flags) { + if (cmsg_flags & 2) tcp_recv_timestamp(msg, sk, &tss); - if (tp->recvmsg_inq) { + if (cmsg_flags & 1) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -2666,6 +2664,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -3224,8 +3223,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } @@ -3305,6 +3304,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 549506162dde..0d08f9e2d8d0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,8 +21,8 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index a915ade0c818..19ad9586c720 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return true; } - return cookie->len > 0; + if (cookie->len > 0) + return true; + tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; + return false; } /* This function checks if we want to defer sending SYN until the first diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a2e52ad7cdab..88b987ca9ebb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5814,6 +5814,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ + if (tp->total_retrans) + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; + else + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67b2dc7a1727..92282f98dc82 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && - (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && - (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif @@ -2453,7 +2451,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) state = inet_sk_state_load(sk); if (state == TCP_LISTEN) - rx_queue = sk->sk_ack_backlog; + rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0488607c5cd3..be6d22b8190f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3290,7 +3290,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 34ccef18b40e..98d82305d6de 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5552,14 +5552,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) goto nla_put_failure; - - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) - goto nla_put_failure; - read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 05f82baaa99e..f87ae33e1d01 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -7,12 +7,12 @@ #include <net/netns/ipv6.h> #include <net/ip6_fib.h> -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, @@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net) return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } -static int fib6_dump(struct net *net, struct notifier_block *nb) +static int fib6_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib6_rules_dump(net, nb); + err = fib6_rules_dump(net, nb, extack); if (err) return err; - return fib6_tables_dump(net, nb); + return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f9e8fe3ff0c5..fafe556d21e0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); -int fib6_rules_dump(struct net *net, struct notifier_block *nb) +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET6); + return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 62c997201970..ef408a5090a2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -516,13 +516,29 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, mip6_addr_swap(skb); + sk = icmpv6_xmit_lock(net); + if (!sk) + goto out_bh_enable; + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; - if (saddr) + if (saddr) { fl6.saddr = *saddr; + } else { + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + + in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, + &fl6.saddr); + dev_put(in_netdev); + } + } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; @@ -531,10 +547,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - sk = icmpv6_xmit_lock(net); - if (!sk) - goto out_bh_enable; - sk->sk_mark = mark; np = inet6_sk(sk); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e2af411cd9c..f66bc2af4e9d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -357,15 +357,17 @@ unsigned int fib6_tables_seq_read(struct net *net) return fib_seq; } -static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib6_info *rt) + struct fib6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; - return call_fib6_notifier(nb, net, event_type, &info.info); + return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, @@ -401,40 +403,51 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_dump_arg { struct net *net; struct notifier_block *nb; + struct netlink_ext_ack *extack; }; -static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) - return; - call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); + return 0; + return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, + rt, arg->extack); } static int fib6_node_dump(struct fib6_walker *w) { struct fib6_info *rt; + int err = 0; - for_each_fib6_walker_rt(w) - fib6_rt_dump(rt, w->args); + for_each_fib6_walker_rt(w) { + err = fib6_rt_dump(rt, w->args); + if (err) + break; + } w->leaf = NULL; - return 0; + return err; } -static void fib6_table_dump(struct net *net, struct fib6_table *tb, - struct fib6_walker *w) +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) { + int err; + w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); - fib6_walk(net, w); + err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); + return err; } /* Called with rcu_read_lock() */ -int fib6_tables_dump(struct net *net, struct notifier_block *nb) +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; + int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) @@ -443,19 +456,24 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) w->func = fib6_node_dump; arg.net = net; arg.nb = nb; + arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib6_table_dump(net, tb, w); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err < 0) + goto out; + } } +out: kfree(w); - return 0; + return err; } static int fib6_dump_node(struct fib6_walker *w) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 3d71c7d6102c..ef7f707d9ae3 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -325,7 +325,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip6_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 857a89ad4d6c..bfa49ff70531 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) @@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net) return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } -static int ip6mr_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock); + ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 34d51cd426b0..6bac68fb27a3 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -150,4 +150,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); -MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); +MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support"); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e60bf8e7dd1a..edcb52543518 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1479,11 +1479,11 @@ static u32 rt6_exception_hash(const struct in6_addr *dst, u32 val; net_get_random_once(&seed, sizeof(seed)); - val = jhash(dst, sizeof(*dst), seed); + val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash(src, sizeof(*src), val); + val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val); #endif return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } @@ -2295,10 +2295,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && - icmph->icmp6_type != ICMPV6_PKT_TOOBIG && - icmph->icmp6_type != ICMPV6_TIME_EXCEED && - icmph->icmp6_type != ICMPV6_PARAMPROB) + if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4804b6dc5e65..81f51335e326 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1891,7 +1891,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) state = inet_sk_state_load(sp); if (state == TCP_LISTEN) - rx_queue = sp->sk_ack_backlog; + rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c74f44dfaa22..2922d4150d88 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -705,7 +705,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: @@ -780,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b11883d26875..33da6f738c99 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret) { + if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + /* + * We didn't send the request yet, so don't need to check + * here if we already got a response, just mark as driver + * ready immediately. + */ + set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state); + } else if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 70739e746c13..4fb7f1f12109 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3428,7 +3428,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); + 1, 0x40, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 0a6ff01c68a9..d40744903fa9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -538,7 +538,6 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; - int err, changed = 0; sdata_assert_lock(sdata); @@ -560,13 +559,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) ifibss->chandef = sdata->csa_chandef; /* generate the beacon */ - err = ieee80211_ibss_csa_beacon(sdata, NULL); - if (err < 0) - return err; - - changed |= err; - - return changed; + return ieee80211_ibss_csa_beacon(sdata, NULL); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 54dd8849d1cc..5fa13176036f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3186,15 +3186,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, - struct ieee80211_mgmt *mgmt, size_t len) + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee802_11_elems *elems) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u8 *pos; u16 capab_info, aid; - struct ieee802_11_elems elems; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; @@ -3222,19 +3221,15 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ifmgd->broken_ap = true; } - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); - - if (!elems.supp_rates) { + if (!elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); return false; } ifmgd->aid = aid; ifmgd->tdls_chan_switch_prohibited = - elems.ext_capab && elems.ext_capab_len >= 5 && - (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); + elems->ext_capab && elems->ext_capab_len >= 5 && + (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their @@ -3243,11 +3238,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems.wmm_param) || + if ((assoc_data->wmm && !elems->wmm_param) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.ht_cap_elem || !elems.ht_operation)) || + (!elems->ht_cap_elem || !elems->ht_operation)) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation))) { + (!elems->vht_cap_elem || !elems->vht_operation))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3265,8 +3260,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, mgmt->bssid, assoc_data->bss->bssid); if (assoc_data->wmm && - !elems.wmm_param && bss_elems.wmm_param) { - elems.wmm_param = bss_elems.wmm_param; + !elems->wmm_param && bss_elems.wmm_param) { + elems->wmm_param = bss_elems.wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } @@ -3275,27 +3270,27 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ - if (!elems.ht_cap_elem && bss_elems.ht_cap_elem && + if (!elems->ht_cap_elem && bss_elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_cap_elem = bss_elems.ht_cap_elem; + elems->ht_cap_elem = bss_elems.ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } - if (!elems.ht_operation && bss_elems.ht_operation && + if (!elems->ht_operation && bss_elems.ht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_operation = bss_elems.ht_operation; + elems->ht_operation = bss_elems.ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems.vht_cap_elem && bss_elems.vht_cap_elem && + if (!elems->vht_cap_elem && bss_elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_cap_elem = bss_elems.vht_cap_elem; + elems->vht_cap_elem = bss_elems.vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } - if (!elems.vht_operation && bss_elems.vht_operation && + if (!elems->vht_operation && bss_elems.vht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_operation = bss_elems.vht_operation; + elems->vht_operation = bss_elems.vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } @@ -3306,7 +3301,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * they should be present here. This is just a safety net. */ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) { + (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; @@ -3314,7 +3309,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation)) { + (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; @@ -3341,7 +3336,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - (!elems.he_cap || !elems.he_operation)) { + (!elems->he_cap || !elems->he_operation)) { mutex_unlock(&sdata->local->sta_mtx); sdata_info(sdata, "HE AP is missing HE capability/operation\n"); @@ -3350,23 +3345,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } /* Set up internal HT/VHT capabilities */ - if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems.ht_cap_elem, sta); + elems->ht_cap_elem, sta); - if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems.vht_cap_elem, sta); + elems->vht_cap_elem, sta); - if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - elems.he_cap) { + if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, - elems.he_cap, - elems.he_cap_len, + elems->he_cap, + elems->he_cap_len, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3374,14 +3369,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (bss_conf->he_support) { bss_conf->bss_color = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->htc_trig_based_pkt_ext = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->multi_sta_back_32bit = @@ -3392,12 +3387,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_ACK_EN; - bss_conf->uora_exists = !!elems.uora_element; - if (elems.uora_element) - bss_conf->uora_ocw_range = elems.uora_element[0]; + bss_conf->uora_exists = !!elems->uora_element; + if (elems->uora_element) + bss_conf->uora_ocw_range = elems->uora_element[0]; - ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems.he_operation); - ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems.he_spr); + ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); + ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } @@ -3421,11 +3416,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ - if (elems.opmode_notif && - !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { + if (elems->opmode_notif && + !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; - nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; sta->sta.rx_nss = nss; @@ -3440,7 +3435,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.mfp = false; } - sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; + sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -3468,9 +3463,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); - } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len, - elems.mu_edca_param_set)) { + } else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3484,11 +3479,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; - if (elems.max_idle_period_ie) { + if (elems->max_idle_period_ie) { bss_conf->max_idle_period = - le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = - !!(elems.max_idle_period_ie->idle_options & + !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); changed |= BSS_CHANGED_KEEP_ALIVE; } else { @@ -3598,7 +3593,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { + if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, bss); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ee86c3333999..86bc469a28bc 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -70,7 +70,7 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) } /* return current EMWA throughput */ -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg) { int usecs; @@ -79,13 +79,13 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) usecs = 1000000; /* reset thr. below 10% success */ - if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100)) + if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100)) return 0; - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); else - return MINSTREL_TRUNC(100000 * (prob_ewma / usecs)); + return MINSTREL_TRUNC(100000 * (prob_avg / usecs)); } /* find & sort topmost throughput rates */ @@ -98,8 +98,8 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) for (j = MAX_THR_RATES; j > 0; --j) { tmp_mrs = &mi->r[tp_list[j - 1]].stats; - if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) <= - minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma)) + if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <= + minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg)) break; } @@ -157,20 +157,24 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * Recalculate statistics and counters of a given rate */ void -minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) +minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); - if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = cur_prob; + if (mp->new_avg) { + minstrel_filter_avg_add(&mrs->prob_avg, + &mrs->prob_avg_1, cur_prob); + } else if (unlikely(!mrs->att_hist)) { + mrs->prob_avg = cur_prob; } else { /*update exponential weighted moving avarage */ - mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - cur_prob, - EWMA_LEVEL); + mrs->prob_avg = minstrel_ewma(mrs->prob_avg, + cur_prob, + EWMA_LEVEL); } mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; @@ -200,12 +204,12 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; /* Update statistics of success probability per rate */ - minstrel_calc_rate_stats(mrs); + minstrel_calc_rate_stats(mp, mrs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || - mrs->prob_ewma < MINSTREL_FRAC(10, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || + mrs->prob_avg < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -225,14 +229,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is chosen as max_prob_rate */ - if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) { - tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma); + if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) { + tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg); tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], - tmp_mrs->prob_ewma); + tmp_mrs->prob_avg); if (tmp_cur_tp >= tmp_prob_tp) tmp_prob_rate = i; } else { - if (mrs->prob_ewma >= tmp_mrs->prob_ewma) + if (mrs->prob_avg >= tmp_mrs->prob_avg) tmp_prob_rate = i; } } @@ -290,7 +294,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->sample_deferred--; if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval * HZ) / 1000)) + mp->update_interval / (mp->new_avg ? 2 : 1))) minstrel_update_stats(mp, mi); } @@ -422,7 +426,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -573,7 +577,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 51d8b2c846e7..dbb43bcd3c45 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -19,6 +19,21 @@ #define MAX_THR_RATES 4 /* + * Coefficients for moving average with noise filter (period=16), + * scaled by 10 bits + * + * a1 = exp(-pi * sqrt(2) / period) + * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) + * coeff3 = -sqr(a1) + * coeff1 = 1 - coeff2 - coeff3 + */ +#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ + MINSTREL_AVG_COEFF2 - \ + MINSTREL_AVG_COEFF3) +#define MINSTREL_AVG_COEFF2 0x00001499 +#define MINSTREL_AVG_COEFF3 -0x0000092e + +/* * Perform EWMA (Exponentially Weighted Moving Average) calculation */ static inline int @@ -32,6 +47,37 @@ minstrel_ewma(int old, int new, int weight) return old + incr; } +static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) +{ + s32 out_1 = *prev_1; + s32 out_2 = *prev_2; + s32 val; + + if (!in) + in += 1; + + if (!out_1) { + val = out_1 = in; + goto out; + } + + val = MINSTREL_AVG_COEFF1 * in; + val += MINSTREL_AVG_COEFF2 * out_1; + val += MINSTREL_AVG_COEFF3 * out_2; + val >>= MINSTREL_SCALE; + + if (val > 1 << MINSTREL_SCALE) + val = 1 << MINSTREL_SCALE; + if (val < 0) + val = 1; + +out: + *prev_2 = out_1; + *prev_1 = val; + + return val; +} + struct minstrel_rate_stats { /* current / last sampling period attempts/success counters */ u16 attempts, last_attempts; @@ -40,8 +86,9 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u32 att_hist, succ_hist; - /* prob_ewma - exponential weighted moving average of prob */ - u16 prob_ewma; + /* prob_avg - moving average of prob */ + u16 prob_avg; + u16 prob_avg_1; /* maximum retry counts */ u8 retry_count; @@ -95,6 +142,7 @@ struct minstrel_sta_info { struct minstrel_priv { struct ieee80211_hw *hw; bool has_mrr; + bool new_avg; u32 sample_switch; unsigned int cw_min; unsigned int cw_max; @@ -126,8 +174,9 @@ extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); /* Recalculate success probabilities and counters for a given rate using EWMA */ -void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); +void minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs); +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg); /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index c8afd85b51a0..9b8e0daeb7bb 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -90,8 +90,8 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%6u ", mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -147,8 +147,8 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) p += sprintf(p, "%u,",mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 0ef2633349b5..694a31978a04 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -346,12 +346,12 @@ minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma) + int prob_avg) { unsigned int nsecs = 0; /* do not account throughput if sucess prob is below 10% */ - if (prob_ewma < MINSTREL_FRAC(10, 100)) + if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (group != MINSTREL_CCK_GROUP) @@ -365,11 +365,11 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) / nsecs)); else - return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs)); + return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs)); } /* @@ -389,13 +389,13 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, cur_group = index / MCS_GROUP_RATES; cur_idx = index % MCS_GROUP_RATES; - cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma; + cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || @@ -432,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from @@ -444,11 +444,11 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; - if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, - mrs->prob_ewma); + mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; @@ -458,9 +458,9 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { - if (mrs->prob_ewma > tmp_prob) + if (mrs->prob_avg > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > max_gpr_prob) + if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -482,12 +482,12 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { @@ -518,7 +518,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { @@ -623,7 +623,7 @@ minstrel_ht_rate_sample_switch(struct minstrel_priv *mp, * If that fails, look again for a rate that is at least as fast */ mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); - faster_rate = mrs->prob_ewma > MINSTREL_FRAC(75, 100); + faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100); minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate); if (!n_rates && faster_rate) minstrel_ht_find_probe_rates(mi, rates, &n_rates, false); @@ -737,8 +737,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mrs = &mg->rates[i]; mrs->retry_updated = false; - minstrel_calc_rate_stats(mrs); - cur_prob = mrs->prob_ewma; + minstrel_calc_rate_stats(mp, mrs); + cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; @@ -773,6 +773,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, /* try to sample all available rates during each interval */ mi->sample_count *= 8; + if (mp->new_avg) + mi->sample_count /= 2; if (sample) minstrel_ht_rate_sample_switch(mp, mi); @@ -889,6 +891,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL; struct minstrel_priv *mp = priv; + u32 update_interval = mp->update_interval / 2; bool last, update = false; bool sample_status = false; int i; @@ -943,6 +946,10 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, switch (mi->sample_mode) { case MINSTREL_SAMPLE_IDLE: + if (mp->new_avg && + (mp->hw->max_rates > 1 || + mi->total_packets_cur < SAMPLE_SWITCH_THR)) + update_interval /= 2; break; case MINSTREL_SAMPLE_ACTIVE: @@ -970,23 +977,20 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, */ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && - MINSTREL_FRAC(rate->success, rate->attempts) < - MINSTREL_FRAC(20, 100)) { + rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && - MINSTREL_FRAC(rate2->success, rate2->attempts) < - MINSTREL_FRAC(20, 100)) { + rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } - if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval / 2 * HZ) / 1000)) { + if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi, true); } @@ -1008,7 +1012,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) { + if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; @@ -1065,7 +1069,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { + if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; @@ -1099,11 +1103,11 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, } static inline int -minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate) +minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = rate / MCS_GROUP_RATES; rate %= MCS_GROUP_RATES; - return mi->groups[group].rates[rate].prob_ewma; + return mi->groups[group].rates[rate].prob_avg; } static int @@ -1115,7 +1119,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ - if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) + if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; @@ -1138,7 +1142,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || - (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < + (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; @@ -1243,7 +1247,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * rate, to avoid wasting airtime. */ sample_dur = minstrel_get_duration(sample_idx); - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) return -1; @@ -1666,7 +1670,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) mp->has_mrr = true; mp->hw = hw; - mp->update_interval = 100; + mp->update_interval = HZ / 10; + mp->new_avg = true; #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; @@ -1674,6 +1679,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) &mp->fixed_rate_idx); debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir, &mp->sample_switch); + debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir, + &mp->new_avg); #endif minstrel_ht_init_cck_rates(mp); @@ -1698,7 +1705,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) i = mi->max_tp_rate[0] / MCS_GROUP_RATES; j = mi->max_tp_rate[0] % MCS_GROUP_RATES; - prob = mi->groups[i].rates[j].prob_ewma; + prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index f938701e7ab7..53ea3c29debf 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -119,6 +119,6 @@ struct minstrel_ht_sta_priv { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma); + int prob_avg); #endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5a6e9f3edc04..bebb71917742 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -98,8 +98,8 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%6u ", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -243,8 +243,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%u,", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1fa422782905..db38be1b75fa 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1617,7 +1617,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { @@ -1679,7 +1679,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; - control.sta = sta; + control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); @@ -1698,7 +1698,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; - struct ieee80211_sta *pubsta; struct sk_buff *skb; bool result = true; __le16 fc; @@ -1713,11 +1712,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, if (sta && !sta->uploaded) sta = NULL; - if (sta) - pubsta = &sta->sta; - else - pubsta = NULL; - switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { @@ -1744,8 +1738,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } - result = ieee80211_tx_frags(local, vif, pubsta, skbs, - txpending); + result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); ieee80211_tpt_led_trig_tx(local, fc, led_len); @@ -2424,6 +2417,33 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_store_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, + u32 *info_flags) +{ + struct sk_buff *ack_skb = skb_clone_sk(skb); + u16 info_id = 0; + + if (ack_skb) { + unsigned long flags; + int id; + + spin_lock_irqsave(&local->ack_status_lock, flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x40, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (id >= 0) { + info_id = id; + *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + } else { + kfree_skb(ack_skb); + } + } + + return info_id; +} + /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for @@ -2717,26 +2737,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *ack_skb = skb_clone_sk(skb); - - if (ack_skb) { - unsigned long flags; - int id; - - spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (id >= 0) { - info_id = id; - info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else { - kfree_skb(ack_skb); - } - } - } + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags); /* * If the skb is shared we need to obtain our own copy. @@ -3529,7 +3531,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); - ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return true; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5d5bdf450091..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + INIT_LIST_HEAD(&sublist); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + ret = nf_hook_slow(skb, state, e, 0); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + /* Put passed packets back on main list */ + list_splice(&sublist, head); +} +EXPORT_SYMBOL(nf_hook_slow_list); + /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 063df74b4647..1abd6f0dc227 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } #ifndef IP_SET_BITMAP_STORED_TIMEOUT -static inline bool +static bool mtype_is_filled(const struct mtype_elem *x) { return true; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 11ff9d4a7006..abe8f77d7d23 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem { u16 id; }; -static inline u32 +static u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; @@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, size_t dsize) { @@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, htonl(map->first_ip + id * map->hosts)); } -static inline int +static int bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, return true; } +static u32 +range_to_mask(u32 from, u32 to, u8 *bits) +{ + u32 mask = 0xFFFFFFFE; + + *bits = 32; + while (--(*bits) > 0 && mask && (to & mask) != from) + mask <<= 1; + + return mask; +} + static int bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1d4e63326e68..b618713297da 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -65,7 +65,7 @@ struct bitmap_ipmac_elem { unsigned char filled; } __aligned(__alignof__(u64)); -static inline u32 +static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; @@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { @@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return -EAGAIN; } -static inline int +static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; @@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, @@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, return 0; } -static inline int +static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { @@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, return IPSET_ADD_STORE_PLAIN_TIMEOUT; } -static inline int +static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { @@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } -static inline int +static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 704a0dda1609..23d6095cb196 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -46,7 +46,7 @@ struct bitmap_port_adt_elem { u16 id; }; -static inline u16 +static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; @@ -54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port) /* Common functions */ -static inline int +static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { @@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, htons(map->first_port + id)); } -static inline int +static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } +static bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} + static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d73d1828216a..169e0a04f814 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,7 +35,7 @@ struct ip_set_net { static unsigned int ip_set_net_id __read_mostly; -static inline struct ip_set_net *ip_set_pernet(struct net *net) +static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } @@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); * serialized by ip_set_type_mutex. */ -static inline void +static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } -static inline void +static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); @@ -277,7 +277,7 @@ ip_set_free(void *members) } EXPORT_SYMBOL_GPL(ip_set_free); -static inline bool +static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; @@ -327,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +static u32 +ip_set_timeout_get(const unsigned long *timeout) +{ + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; +} + +static char * +ip_set_comment_uget(struct nlattr *tb) +{ + return nla_data(tb); +} + +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. + */ +void +ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext) +{ + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); + size_t len = ext->comment ? strlen(ext->comment) : 0; + + if (unlikely(c)) { + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); + } + if (!len) + return; + if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) + len = IPSET_MAX_COMMENT_SIZE; + c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) + return; + strlcpy(c->str, ext->comment, len + 1); + set->ext_size += sizeof(*c) + strlen(c->str) + 1; + rcu_assign_pointer(comment->c, c); +} +EXPORT_SYMBOL_GPL(ip_set_init_comment); + +/* Used only when dumping a set, protected by rcu_read_lock() */ +static int +ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) +{ + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); + + if (!c) + return 0; + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); +} + +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. + */ +static void +ip_set_comment_free(struct ip_set *set, void *ptr) +{ + struct ip_set_comment *comment = ptr; + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) + return; + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); +} + typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ @@ -353,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), - .destroy = (destroyer) ip_set_comment_free, + .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); -static inline bool +static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? @@ -448,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +static u64 +ip_set_get_bytes(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->bytes); +} + +static u64 +ip_set_get_packets(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->packets); +} + +static bool +ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) +{ + return nla_put_net64(skb, IPSET_ATTR_BYTES, + cpu_to_be64(ip_set_get_bytes(counter)), + IPSET_ATTR_PAD) || + nla_put_net64(skb, IPSET_ATTR_PACKETS, + cpu_to_be64(ip_set_get_packets(counter)), + IPSET_ATTR_PAD); +} + +static bool +ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) +{ + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask), + IPSET_ATTR_PAD)) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); +} + int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) @@ -473,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +static bool +ip_set_match_counter(u64 counter, u64 match, u8 op) +{ + switch (op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == match; + case IPSET_COUNTER_NE: + return counter != match; + case IPSET_COUNTER_LT: + return counter < match; + case IPSET_COUNTER_GT: + return counter > match; + } + return false; +} + +static void +ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) +{ + atomic64_add((long long)bytes, &(counter)->bytes); +} + +static void +ip_set_add_packets(u64 packets, struct ip_set_counter *counter) +{ + atomic64_add((long long)packets, &(counter)->packets); +} + +static void +ip_set_update_counter(struct ip_set_counter *counter, + const struct ip_set_ext *ext, u32 flags) +{ + if (ext->packets != ULLONG_MAX && + !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { + ip_set_add_bytes(ext->bytes, counter); + ip_set_add_packets(ext->packets, counter); + } +} + +static void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbinfo = *skbinfo; +} + bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) @@ -508,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions); * The set behind an index may change by swapping only, from userspace. */ -static inline void +static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -516,7 +682,7 @@ __ip_set_get(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -static inline void +static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -528,7 +694,7 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -543,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set) * so it can't be destroyed (or changed) under our foot. */ -static inline struct ip_set * +static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; @@ -672,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * */ -static inline void +static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; @@ -1255,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) +int +ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) +{ + u32 cadt_flags = 0; + + if (SET_WITH_TIMEOUT(set)) + if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(set->timeout)))) + return -EMSGSIZE; + if (SET_WITH_COUNTER(set)) + cadt_flags |= IPSET_FLAG_WITH_COUNTERS; + if (SET_WITH_COMMENT(set)) + cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; + + if (!cadt_flags) + return 0; + return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); +} +EXPORT_SYMBOL_GPL(ip_set_put_flags); + static int ip_set_dump_done(struct netlink_callback *cb) { diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2b8f959574b4..36615eb3eae1 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); #endif - -bool -ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) -{ - bool ret; - u8 proto; - - switch (pf) { - case NFPROTO_IPV4: - ret = ip_set_get_ip4_port(skb, src, port, &proto); - break; - case NFPROTO_IPV6: - ret = ip_set_get_ip6_port(skb, src, port, &proto); - break; - default: - return false; - } - if (!ret) - return ret; - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - return true; - default: - return false; - } -} -EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index d098d87bc331..7480ce55b5c8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -39,7 +39,7 @@ #ifdef IP_SET_HASH_WITH_MULTI #define AHASH_MAX(h) ((h)->ahash_max) -static inline u8 +static u8 tune_ahash_max(u8 curr, u32 multi) { u32 n; @@ -909,7 +909,7 @@ out: return ret; } -static inline int +static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f4432d9fcad0..5d6d68eaf6a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -44,7 +44,7 @@ struct hash_ip4_elem { /* Common functions */ -static inline bool +static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) @@ -63,7 +63,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; @@ -171,7 +171,7 @@ struct hash_ip6_elem { /* Common functions */ -static inline bool +static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) @@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline void +static void hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip6_netmask(ip, prefix); @@ -196,7 +196,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 4ce563eb927d..eceb7bc4a93a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -47,7 +47,7 @@ struct hash_ipmac4_elem { /* Common functions */ -static inline bool +static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) @@ -67,7 +67,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { @@ -154,7 +154,7 @@ struct hash_ipmac6_elem { /* Common functions */ -static inline bool +static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) @@ -175,7 +175,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7a1734aad0c5..aba1df617d6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -42,7 +42,7 @@ struct hash_ipmark4_elem { /* Common functions */ -static inline bool +static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) @@ -64,7 +64,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { @@ -165,7 +165,7 @@ struct hash_ipmark6_elem { /* Common functions */ -static inline bool +static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) @@ -187,7 +187,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 32e240658334..1ff228717e29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -47,7 +47,7 @@ struct hash_ipport4_elem { /* Common functions */ -static inline bool +static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) @@ -71,7 +71,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { @@ -202,7 +202,7 @@ struct hash_ipport6_elem { /* Common functions */ -static inline bool +static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) @@ -226,7 +226,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 15d419353179..fa88afd812fa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -46,7 +46,7 @@ struct hash_ipportip4_elem { u8 padding; }; -static inline bool +static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) @@ -72,7 +72,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { @@ -210,7 +210,7 @@ struct hash_ipportip6_elem { /* Common functions */ -static inline bool +static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) @@ -236,7 +236,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7a4d7afd4121..eef6ecfcb409 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -59,7 +59,7 @@ struct hash_ipportnet4_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) @@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); @@ -116,7 +116,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { @@ -308,7 +308,7 @@ struct hash_ipportnet6_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) @@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); @@ -365,7 +365,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index d94c585d33c5..0b61593165ef 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -37,7 +37,7 @@ struct hash_mac4_elem { /* Common functions */ -static inline bool +static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) @@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, return ether_addr_equal(e1->ether, e2->ether); } -static inline bool +static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) @@ -56,7 +56,7 @@ nla_put_failure: return true; } -static inline void +static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 3d932de0ad29..136cf0781d3a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -47,7 +47,7 @@ struct hash_net4_elem { /* Common functions */ -static inline bool +static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) @@ -56,25 +56,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -97,7 +97,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { @@ -212,7 +212,7 @@ struct hash_net6_elem { /* Common functions */ -static inline bool +static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) @@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -262,7 +262,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 87b29f971226..1a04e0929738 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -62,7 +62,7 @@ struct hash_netiface4_elem { /* Common functions */ -static inline bool +static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) @@ -74,25 +74,25 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, strcmp(ip1->iface, ip2->iface) == 0; } -static inline int +static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -119,7 +119,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { @@ -285,7 +285,7 @@ struct hash_netiface6_elem { /* Common functions */ -static inline bool +static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) @@ -297,25 +297,25 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, strcmp(ip1->iface, ip2->iface) == 0; } -static inline int +static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -342,7 +342,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 4398322fad59..da4ef910b12d 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -52,7 +52,7 @@ struct hash_netnet4_elem { /* Common functions */ -static inline bool +static bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) @@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -117,7 +117,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { @@ -282,7 +282,7 @@ struct hash_netnet6_elem { /* Common functions */ -static inline bool +static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) @@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -348,7 +348,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 799f2272cc65..34448df80fb9 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -57,7 +57,7 @@ struct hash_netport4_elem { /* Common functions */ -static inline bool +static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) @@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -112,7 +112,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { @@ -270,7 +270,7 @@ struct hash_netport6_elem { /* Common functions */ -static inline bool +static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) @@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -325,7 +325,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..934c1712cba8 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -56,7 +56,7 @@ struct hash_netportnet4_elem { /* Common functions */ -static inline bool +static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) @@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { @@ -126,7 +126,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { @@ -331,7 +331,7 @@ struct hash_netportnet6_elem { /* Common functions */ -static inline bool +static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) @@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { @@ -402,7 +402,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 67ac50104e6f..cd747c0962fd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu) kfree(e); } -static inline void +static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; @@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e) call_rcu(&e->rcu, __list_set_del_rcu); } -static inline void +static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 8b80ab794a92..512259f579d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2402,18 +2402,22 @@ estimator_fail: return -ENOMEM; } -static void __net_exit __ip_vs_cleanup(struct net *net) +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(ipvs); - ip_vs_app_net_cleanup(ipvs); - ip_vs_protocol_net_cleanup(ipvs); - ip_vs_control_net_cleanup(ipvs); - ip_vs_estimator_net_cleanup(ipvs); - IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); - net->ipvs = NULL; + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } } static int __net_init __ip_vs_dev_init(struct net *net) @@ -2429,27 +2433,32 @@ hook_fail: return ret; } -static void __net_exit __ip_vs_dev_cleanup(struct net *net) +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ - smp_wmb(); - ip_vs_sync_net_cleanup(ipvs); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, - .exit = __ip_vs_cleanup, + .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .init = __ip_vs_dev_init, - .exit = __ip_vs_dev_cleanup, + .exit_batch = __ip_vs_dev_cleanup_batch, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3cccc88ef817..3be7398901e0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) /* * Delete service by {netns} in the service table. - * Called by __ip_vs_cleanup() + * Called by __ip_vs_batch_cleanup() */ -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) +void ip_vs_service_nets_cleanup(struct list_head *net_list) { + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(ipvs, true); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index 78b074cd5464..c03066fdd5ca 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -5,7 +5,7 @@ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active - * connections , will keep all conections to the node with the highest weight + * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. * Note that this scheduler might not be suitable for UDP because it only uses diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 888d3068a492..b1e300f8881b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5cd610b547e0..0af1898af2b8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -573,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -1417,7 +1416,6 @@ void nf_conntrack_free(struct nf_conn *ct) WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); - nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6fba74b5aaf7..7956c9f19899 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -30,6 +30,7 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) +#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) enum retry_state { STATE_CONGESTED, @@ -39,11 +40,11 @@ enum retry_state { static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) { - struct nf_conn *refs[16]; + struct nf_conn *refs[ECACHE_STACK_ALLOC]; + enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int evicted = 0; - enum retry_state ret = STATE_DONE; spin_lock(&pcpu->lock); @@ -54,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) if (!nf_ct_is_confirmed(ct)) continue; + /* This ecache access is safe because the ct is on the + * pcpu dying list and we hold the spinlock -- the entry + * cannot be free'd until after the lock is released. + * + * This is true even if ct has a refcount of 0: the + * cpu that is about to free the entry must remove it + * from the dying list and needs the lock to do so. + */ e = nf_ct_ecache_find(ct); if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; + /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means + * the worker owns this entry: the ct will remain valid + * until the worker puts its ct reference. + */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; @@ -189,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) if (notify == NULL) goto out_unlock; + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) + goto out_unlock; + e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; - /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index d4ed1e197921..c24e5b64b00c 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -34,21 +34,24 @@ void nf_ct_ext_destroy(struct nf_conn *ct) t->destroy(ct); rcu_read_unlock(); } + + kfree(ct->ext); } EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext *old, *new; struct nf_ct_ext_type *t; + struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. */ WARN_ON(nf_ct_is_confirmed(ct)); - old = ct->ext; - if (old) { + if (ct->ext) { + const struct nf_ct_ext *old = ct->ext; + if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; @@ -68,22 +71,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); - kmemleak_not_leak(old); - new = __krealloc(old, alloc, gfp); + new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!old) { + if (!ct->ext) memset(new->offset, 0, sizeof(new->offset)); - ct->ext = new; - } else if (new != old) { - kfree_rcu(old, rcu); - rcu_assign_pointer(ct->ext, new); - } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); + + ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e2d13cd18875..d8d33ef52ce0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -506,9 +506,45 @@ nla_put_failure: return -1; } +/* all these functions access ct->ext. Caller must either hold a reference + * on ct or prevent its deletion by holding either the bucket spinlock or + * pcpu dying list lock. + */ +static int ctnetlink_dump_extinfo(struct sk_buff *skb, + struct nf_conn *ct, u32 type) +{ + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + return -1; + + return 0; +} + +static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) +{ + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0) + return -1; + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && + (ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0)) + return -1; + + return 0; +} + static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct) + struct nf_conn *ct, bool extinfo) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; @@ -552,23 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secctx(skb, ct) < 0 || - ctnetlink_dump_labels(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0 || - ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || - ctnetlink_dump_ct_synproxy(skb, ct) < 0) + if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; - - if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -953,13 +975,11 @@ restart: if (!ctnetlink_filter_match(ct, cb->data)) continue; - rcu_read_lock(); res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, true); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1364,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - rcu_read_lock(); err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct); - rcu_read_unlock(); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); nf_ct_put(ct); if (err <= 0) goto free; @@ -1429,12 +1447,18 @@ restart: continue; cb->args[1] = 0; } - rcu_read_lock(); + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, dying ? true : false); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 097deba7441a..c2e3dff773bc 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 712a428509ad..0d2243945f1d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,11 +151,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static int nft_netdev_register_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err, j; + + j = 0; + list_for_each_entry(hook, hook_list, list) { + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_register; + + j++; + } + return 0; + +err_register: + list_for_each_entry(hook, hook_list, list) { + if (j-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + } + return err; +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} + +static int nft_register_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); +} + +static void nft_unregister_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + nft_netdev_unregister_hooks(net, &basechain->hook_list); + else + nf_unregister_net_hook(net, &basechain->ops); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -168,14 +221,14 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nf_register_net_hook(net, ops); + return nft_register_basechain_hooks(net, table->family, basechain); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -187,7 +240,7 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, ops); + nft_unregister_basechain_hooks(net, table->family, basechain); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -742,7 +795,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nft_unregister_basechain_hooks(net, table->family, + nft_base_chain(chain)); } } @@ -757,14 +811,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nft_register_basechain_hooks(net, table->family, + nft_base_chain(chain)); if (err < 0) - goto err; + goto err_register_hooks; i++; } return 0; -err: + +err_register_hooks: if (i) nft_table_disable(net, table, i); return err; @@ -1225,6 +1281,46 @@ nla_put_failure: return -ENOSPC; } +static int nft_dump_basechain_hook(struct sk_buff *skb, int family, + const struct nft_base_chain *basechain) +{ + const struct nf_hook_ops *ops = &basechain->ops; + struct nft_hook *hook, *first = NULL; + struct nlattr *nest, *nest_devs; + int n = 0; + + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + + if (family == NFPROTO_NETDEV) { + nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (!first) + first = hook; + + if (nla_put_string(skb, NFTA_DEVICE_NAME, + hook->ops.dev->name)) + goto nla_put_failure; + n++; + } + nla_nest_end(skb, nest_devs); + + if (n == 1 && + nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + + return 0; +nla_put_failure: + return -1; +} + static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, @@ -1253,21 +1349,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; struct nft_stats __percpu *stats; - struct nlattr *nest; - nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); - if (nest == NULL) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + if (nft_dump_basechain_hook(skb, family, basechain)) goto nla_put_failure; - if (basechain->dev_name[0] && - nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) - goto nla_put_failure; - nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) @@ -1485,6 +1570,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; + struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; @@ -1495,6 +1581,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (ctx->family == NFPROTO_NETDEV) { + list_for_each_entry_safe(hook, next, + &basechain->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); @@ -1508,13 +1601,125 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) } } +static struct nft_hook *nft_netdev_hook_alloc(struct net *net, + const struct nlattr *attr) +{ + struct net_device *dev; + char ifname[IFNAMSIZ]; + struct nft_hook *hook; + int err; + + hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); + if (!hook) { + err = -ENOMEM; + goto err_hook_alloc; + } + + nla_strlcpy(ifname, attr, IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + if (!dev) { + err = -ENOENT; + goto err_hook_dev; + } + hook->ops.dev = dev; + + return hook; + +err_hook_dev: + kfree(hook); +err_hook_alloc: + return ERR_PTR(err); +} + +static bool nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) { + if (this->ops.dev == hook->ops.dev) + return true; + } + + return false; +} + +static int nf_tables_parse_netdev_hooks(struct net *net, + const struct nlattr *attr, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + const struct nlattr *tmp; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err_hook; + } + + hook = nft_netdev_hook_alloc(net, tmp); + if (IS_ERR(hook)) { + err = PTR_ERR(hook); + goto err_hook; + } + if (nft_hook_list_find(hook_list, hook)) { + err = -EEXIST; + goto err_hook; + } + list_add_tail(&hook->list, hook_list); + n++; + + if (n == NFT_NETDEVICE_MAX) { + err = -EFBIG; + goto err_hook; + } + } + if (!n) + return -EINVAL; + + return 0; + +err_hook: + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del(&hook->list); + kfree(hook); + } + return err; +} + struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; - struct net_device *dev; + struct list_head list; }; +static int nft_chain_parse_netdev(struct net *net, + struct nlattr *tb[], + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err; + + if (tb[NFTA_HOOK_DEV]) { + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + list_add_tail(&hook->list, hook_list); + } else if (tb[NFTA_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], + hook_list); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + return 0; +} + static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, @@ -1522,7 +1727,6 @@ static int nft_chain_parse_hook(struct net *net, { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; - struct net_device *dev; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1560,23 +1764,14 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; - hook->dev = NULL; + INIT_LIST_HEAD(&hook->list); if (family == NFPROTO_NETDEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { + err = nft_chain_parse_netdev(net, ha, &hook->list); + if (err < 0) { module_put(type->owner); - return -ENOENT; + return err; } - hook->dev = dev; - } else if (ha[NFTA_HOOK_DEV]) { + } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } @@ -1586,6 +1781,12 @@ static int nft_chain_parse_hook(struct net *net, static void nft_chain_release_hook(struct nft_chain_hook *hook) { + struct nft_hook *h, *next; + + list_for_each_entry_safe(h, next, &hook->list, list) { + list_del(&h->list); + kfree(h); + } module_put(hook->type->owner); } @@ -1610,6 +1811,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha return kvmalloc(alloc, GFP_KERNEL); } +static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, + const struct nft_chain_hook *hook, + struct nft_chain *chain) +{ + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; +} + +static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + struct nft_chain_hook *hook, u32 flags) +{ + struct nft_chain *chain; + struct nft_hook *h; + + basechain->type = hook->type; + INIT_LIST_HEAD(&basechain->hook_list); + chain = &basechain->chain; + + if (family == NFPROTO_NETDEV) { + list_splice_init(&hook->list, &basechain->hook_list); + list_for_each_entry(h, &basechain->hook_list, list) + nft_basechain_hook_init(&h->ops, family, hook, chain); + + basechain->ops.hooknum = hook->num; + basechain->ops.priority = hook->priority; + } else { + nft_basechain_hook_init(&basechain->ops, family, hook, chain); + } + + chain->flags |= NFT_BASE_CHAIN | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + + flow_block_init(&basechain->flow_block); + + return 0; +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1628,7 +1872,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; - struct nf_hook_ops *ops; err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) @@ -1639,9 +1882,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_chain_release_hook(&hook); return -ENOMEM; } - - if (hook.dev != NULL) - strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); + chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1654,24 +1895,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - basechain->type = hook.type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = hook.type->hooks[ops->hooknum]; - ops->dev = hook.dev; - - chain->flags |= NFT_BASE_CHAIN | flags; - basechain->policy = NF_ACCEPT; - if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) - return -EOPNOTSUPP; - - flow_block_init(&basechain->flow_block); + err = nft_basechain_init(basechain, family, &hook, flags); + if (err < 0) { + nft_chain_release_hook(&hook); + kfree(basechain); + return err; + } } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1731,6 +1960,25 @@ err1: return err; } +static bool nft_hook_list_equal(struct list_head *hook_list1, + struct list_head *hook_list2) +{ + struct nft_hook *hook; + int n = 0, m = 0; + + n = 0; + list_for_each_entry(hook, hook_list2, list) { + if (!nft_hook_list_find(hook_list1, hook)) + return false; + + n++; + } + list_for_each_entry(hook, hook_list1, list) + m++; + + return n == m; +} + static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags) { @@ -1762,12 +2010,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - ops = &basechain->ops; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (ctx->family == NFPROTO_NETDEV) { + if (!nft_hook_list_equal(&basechain->hook_list, + &hook.list)) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } else { + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority) { + nft_chain_release_hook(&hook); + return -EBUSY; + } } nft_chain_release_hook(&hook); } @@ -5626,43 +5881,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static int nf_tables_parse_devices(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct net_device *dev_array[], int *len) -{ - const struct nlattr *tmp; - struct net_device *dev; - char ifname[IFNAMSIZ]; - int rem, n = 0, err; - - nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { - err = -EINVAL; - goto err1; - } - - nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = __dev_get_by_name(ctx->net, ifname); - if (!dev) { - err = -ENOENT; - goto err1; - } - - dev_array[n++] = dev; - if (n == NFT_FLOWTABLE_DEVICE_MAX) { - err = -EFBIG; - goto err1; - } - } - if (!len) - return -EINVAL; - - err = 0; -err1: - *len = n; - return err; -} - static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, @@ -5673,11 +5891,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable *flowtable) { - struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; - struct nf_hook_ops *ops; + struct nft_hook *hook; int hooknum, priority; - int err, n = 0, i; + int err; err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); @@ -5695,27 +5912,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], - dev_array, &n); + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable->hook_list); if (err < 0) return err; - ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - flowtable->hooknum = hooknum; - flowtable->priority = priority; - flowtable->ops = ops; - flowtable->ops_len = n; + flowtable->hooknum = hooknum; + flowtable->data.priority = priority; - for (i = 0; i < n; i++) { - flowtable->ops[i].pf = NFPROTO_NETDEV; - flowtable->ops[i].hooknum = hooknum; - flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data; - flowtable->ops[i].hook = flowtable->data.type->hook; - flowtable->ops[i].dev = dev_array[i]; + list_for_each_entry(hook, &flowtable->hook_list, list) { + hook->ops.pf = NFPROTO_NETDEV; + hook->ops.hooknum = hooknum; + hook->ops.priority = priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -5755,14 +5966,51 @@ nft_flowtable_type_get(struct net *net, u8 family) static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; + list_for_each_entry(hook, &flowtable->hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} - nf_unregister_net_hook(net, &flowtable->ops[i]); +static int nft_register_flowtable_net_hooks(struct net *net, + struct nft_table *table, + struct nft_flowtable *flowtable) +{ + struct nft_hook *hook, *hook2, *next; + struct nft_flowtable *ft; + int err, i = 0; + + list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(ft, &table->flowtables, list) { + list_for_each_entry(hook2, &ft->hook_list, list) { + if (hook->ops.dev == hook2->ops.dev && + hook->ops.pf == hook2->ops.pf) { + err = -EBUSY; + goto err_unregister_net_hooks; + } + } + } + + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_unregister_net_hooks; + + i++; } + + return 0; + +err_unregister_net_hooks: + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (i-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; } static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, @@ -5773,12 +6021,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; - struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_hook *hook, *next; struct nft_table *table; struct nft_ctx ctx; - int err, i, k; + int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || @@ -5817,6 +6066,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { @@ -5840,43 +6090,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err4; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; - - list_for_each_entry(ft, &table->flowtables, list) { - for (k = 0; k < ft->ops_len; k++) { - if (!ft->ops[k].dev) - continue; - - if (flowtable->ops[i].dev == ft->ops[k].dev && - flowtable->ops[i].pf == ft->ops[k].pf) { - err = -EBUSY; - goto err5; - } - } - } - - err = nf_register_net_hook(net, &flowtable->ops[i]); - if (err < 0) - goto err5; - } + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + if (err < 0) + goto err4; err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err6; + goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err6: - i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[k]); - - kfree(flowtable->ops); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + nf_unregister_net_hook(net, &hook->ops); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } err4: flowtable->data.type->free(&flowtable->data); err3: @@ -5943,8 +6174,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; + struct nft_hook *hook; struct nlmsghdr *nlh; - int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); @@ -5967,18 +6198,15 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || - nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; - for (i = 0; i < flowtable->ops_len; i++) { - const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); - - if (dev && - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6169,7 +6397,12 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - kfree(flowtable->ops); + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_del_rcu(&hook->list); + kfree(hook); + } kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); module_put(flowtable->data.type->owner); @@ -6209,14 +6442,15 @@ nla_put_failure: static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev != dev) + list_for_each_entry(hook, &flowtable->hook_list, list) { + if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->ops[i].dev = NULL; + nf_unregister_net_hook(dev_net(dev), &hook->ops); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); break; } } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index e25dab8128db..cdea3010c7a0 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -132,13 +132,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, common->extack = extack; } -static int nft_setup_cb_call(struct nft_base_chain *basechain, - enum tc_setup_type type, void *type_data) +static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, + struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { + list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -155,32 +155,44 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } +static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, + const struct nft_base_chain *basechain, + const struct nft_rule *rule, + const struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + memset(cls_flow, 0, sizeof(*cls_flow)); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow->common, proto, + basechain->ops.priority, &extack); + cls_flow->command = command; + cls_flow->cookie = (unsigned long) rule; + if (flow) + cls_flow->rule = flow->rule; +} + static int nft_flow_offload_rule(struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { - struct flow_cls_offload cls_flow = {}; + struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, command); - if (flow) - proto = flow->proto; - - nft_flow_offload_common_init(&cls_flow.common, proto, - basechain->ops.priority, &extack); - cls_flow.command = command; - cls_flow.cookie = (unsigned long) rule; - if (flow) - cls_flow.rule = flow->rule; - - return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + &basechain->flow_block.cb_list); } static int nft_flow_offload_bind(struct flow_block_offload *bo, @@ -194,6 +206,16 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; + struct flow_cls_offload cls_flow; + struct nft_chain *chain; + struct nft_rule *rule; + + chain = &basechain->chain; + list_for_each_entry(rule, &chain->rules, list) { + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, + FLOW_CLS_DESTROY); + nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); + } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); @@ -224,20 +246,30 @@ static int nft_block_setup(struct nft_base_chain *basechain, return err; } +static void nft_flow_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + memset(bo, 0, sizeof(*bo)); + bo->net = net; + bo->block = &basechain->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; int err; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) @@ -253,17 +285,12 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; if (!chain) return; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); @@ -274,15 +301,10 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { - struct flow_block_offload bo = {}; struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); flow_indr_block_call(dev, &bo, cmd); @@ -294,32 +316,73 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK -static int nft_flow_offload_chain(struct nft_chain *chain, - u8 *ppolicy, +static int nft_chain_offload_cmd(struct nft_base_chain *basechain, + struct net_device *dev, + enum flow_block_command cmd) +{ + int err; + + if (dev->netdev_ops->ndo_setup_tc) + err = nft_block_offload_cmd(basechain, dev, cmd); + else + err = nft_indr_block_offload_cmd(basechain, dev, cmd); + + return err; +} + +static int nft_flow_block_chain(struct nft_base_chain *basechain, + const struct net_device *this_dev, + enum flow_block_command cmd) +{ + struct net_device *dev; + struct nft_hook *hook; + int err, i = 0; + + list_for_each_entry(hook, &basechain->hook_list, list) { + dev = hook->ops.dev; + if (this_dev && this_dev != dev) + continue; + + err = nft_chain_offload_cmd(basechain, dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; + + return err; + } + i++; + } + + return 0; + +err_flow_block: + list_for_each_entry(hook, &basechain->hook_list, list) { + if (i-- <= 0) + break; + + dev = hook->ops.dev; + nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + } + return err; +} + +static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; - struct net_device *dev; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - dev = basechain->ops.dev; - if (!dev) - return -EOPNOTSUPP; - policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; - if (dev->netdev_ops->ndo_setup_tc) - return nft_block_offload_cmd(basechain, dev, cmd); - else - return nft_indr_block_offload_cmd(basechain, dev, cmd); + return nft_flow_block_chain(basechain, NULL, cmd); } int nft_flow_rule_offload_commit(struct net *net) @@ -386,6 +449,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) { struct nft_base_chain *basechain; struct net *net = dev_net(dev); + struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; @@ -398,8 +462,16 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; + found = NULL; basechain = nft_base_chain(chain); - if (strncmp(basechain->dev_name, dev->name, IFNAMSIZ)) + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = hook; + break; + } + if (!found) continue; return chain; @@ -427,18 +499,6 @@ static void nft_indr_block_cb(struct net_device *dev, mutex_unlock(&net->nft.commit_mutex); } -static void nft_offload_chain_clean(struct nft_chain *chain) -{ - struct nft_rule *rule; - - list_for_each_entry(rule, &chain->rules, list) { - nft_flow_offload_rule(chain, rule, - NULL, FLOW_CLS_DESTROY); - } - - nft_flow_offload_chain(chain, NULL, FLOW_BLOCK_UNBIND); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -449,7 +509,9 @@ static int nft_offload_netdev_event(struct notifier_block *this, mutex_lock(&net->nft.commit_mutex); chain = __nft_offload_get_chain(dev); if (chain) - nft_offload_chain_clean(chain); + nft_flow_block_chain(nft_base_chain(chain), dev, + FLOW_BLOCK_UNBIND); + mutex_unlock(&net->nft.commit_mutex); return NOTIFY_DONE; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..c78d01bc02e9 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + struct nft_hook *hook, *found = NULL; + int n = 0; - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - /* UNREGISTER events are also happpening on netns exit. - * - * Altough nf_tables core releases all tables/chains, only - * this event handler provides guarantee that - * basechain.ops->dev is still accessible, so we cannot - * skip exiting net namespaces. - */ - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; + if (event != NETDEV_UNREGISTER) + return; - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev == dev) + found = hook; + + n++; } + if (!found) + return; + + if (n > 1) { + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); + kfree_rcu(found, rcu); + return; + } + + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ + __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index be7798a50546..713fb38541df 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) return 0; /* Error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index efccd1ac9a66..0522b2b1fd95 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -458,10 +458,63 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static struct genl_dumpit_info *genl_dumpit_info_alloc(void) +{ + return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); +} + +static void genl_dumpit_info_free(const struct genl_dumpit_info *info) +{ + kfree(info); +} + +static struct nlattr ** +genl_family_rcv_msg_attrs_parse(const struct genl_family *family, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, + enum genl_validate_flags no_strict_flag, + bool parallel) +{ + enum netlink_validation validate = ops->validate & no_strict_flag ? + NL_VALIDATE_LIBERAL : + NL_VALIDATE_STRICT; + struct nlattr **attrbuf; + int err; + + if (!family->maxattr) + return NULL; + + if (parallel) { + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); + } else { + attrbuf = family->attrbuf; + } + + err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, + family->policy, validate, extack); + if (err && parallel) { + kfree(attrbuf); + return ERR_PTR(err); + } + return attrbuf; +} + +static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, + struct nlattr **attrbuf, + bool parallel) +{ + if (parallel) + kfree(attrbuf); +} + static int genl_lock_start(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc = 0; if (ops->start) { @@ -474,8 +527,7 @@ static int genl_lock_start(struct netlink_callback *cb) static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc; genl_lock(); @@ -486,8 +538,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; int rc = 0; if (ops->done) { @@ -495,120 +547,111 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); return rc; } -static int genl_family_rcv_msg(const struct genl_family *family, - struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int genl_parallel_done(struct netlink_callback *cb) { - const struct genl_ops *ops; - struct net *net = sock_net(skb->sk); - struct genl_info info; - struct genlmsghdr *hdr = nlmsg_data(nlh); - struct nlattr **attrbuf; - int hdrlen, err; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; + int rc = 0; - /* this family doesn't exist in this netns */ - if (!family->netnsok && !net_eq(net, &init_net)) - return -ENOENT; + if (ops->done) + rc = ops->done(cb); + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); + return rc; +} - hdrlen = GENL_HDRLEN + family->hdrsize; - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; +static int genl_family_rcv_msg_dumpit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; + int err; - ops = genl_get_cmd(hdr->cmd, family); - if (ops == NULL) + if (!ops->dumpit) return -EOPNOTSUPP; - if ((ops->flags & GENL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if ((ops->flags & GENL_UNS_ADMIN_PERM) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - int rc; - - if (ops->dumpit == NULL) - return -EOPNOTSUPP; - - if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) { - int hdrlen = GENL_HDRLEN + family->hdrsize; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; - if (family->maxattr) { - unsigned int validate = NL_VALIDATE_STRICT; - - if (ops->validate & - GENL_DONT_VALIDATE_DUMP_STRICT) - validate = NL_VALIDATE_LIBERAL; - rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), - family->maxattr, - family->policy, - validate, extack); - if (rc) - return rc; - } - } + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - /* we have const, but the netlink API doesn't */ - .data = (void *)ops, - .start = genl_lock_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - }; + attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT, + true); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + /* Allocate dumpit info. It is going to be freed by done() callback. */ + info = genl_dumpit_info_alloc(); + if (!info) { + genl_family_rcv_msg_attrs_free(family, attrs, true); + return -ENOMEM; + } - genl_unlock(); - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); + info->family = family; + info->ops = ops; + info->attrs = attrs; - } else { - struct netlink_dump_control c = { - .module = family->module, - .start = ops->start, - .dump = ops->dumpit, - .done = ops->done, - }; + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = genl_lock_start, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_unlock(); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); - return rc; + } else { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = ops->start, + .dump = ops->dumpit, + .done = genl_parallel_done, + }; + + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } - if (ops->doit == NULL) - return -EOPNOTSUPP; - - if (family->maxattr && family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (attrbuf == NULL) - return -ENOMEM; - } else - attrbuf = family->attrbuf; + return err; +} - if (attrbuf) { - enum netlink_validation validate = NL_VALIDATE_STRICT; +static int genl_family_rcv_msg_doit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct nlattr **attrbuf; + struct genl_info info; + int err; - if (ops->validate & GENL_DONT_VALIDATE_STRICT) - validate = NL_VALIDATE_LIBERAL; + if (!ops->doit) + return -EOPNOTSUPP; - err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - family->policy, validate, extack); - if (err < 0) - goto out; - } + attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_STRICT, + family->parallel_ops); + if (IS_ERR(attrbuf)) + return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; @@ -632,12 +675,49 @@ static int genl_family_rcv_msg(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - if (family->parallel_ops) - kfree(attrbuf); + genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); return err; } +static int genl_family_rcv_msg(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + const struct genl_ops *ops; + struct net *net = sock_net(skb->sk); + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen; + + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, + ops, hdrlen, net); + else + return genl_family_rcv_msg_doit(family, skb, nlh, extack, + ops, hdrlen, net); +} + static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1088,25 +1168,6 @@ problem: subsys_initcall(genl_init); -/** - * genl_family_attrbuf - return family's attrbuf - * @family: the family - * - * Return the family's attrbuf, while validating that it's - * actually valid to access it. - * - * You cannot use this function with a family that has parallel_ops - * and you can only use it within (pre/post) doit/dumpit callbacks. - */ -struct nlattr **genl_family_attrbuf(const struct genl_family *family) -{ - if (!WARN_ON(family->parallel_ops)) - lockdep_assert_held(&genl_mutex); - - return family->attrbuf; -} -EXPORT_SYMBOL(genl_family_attrbuf); - static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index afde0d763039..eee0dddb7749 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -102,22 +102,14 @@ nla_put_failure: static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { - struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; - int rc; u32 idx; - rc = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, - nfc_genl_policy, NULL); - if (rc < 0) - return ERR_PTR(rc); - - if (!attrbuf[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) @@ -1695,7 +1687,8 @@ static const struct genl_ops nfc_genl_ops[] = { }, { .cmd = NFC_CMD_GET_TARGET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1c77f520f474..12936c151cc0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -200,7 +200,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (err) return err; - flow_key->mpls.top_lse = lse; + flow_key->mpls.lse[0] = lse; return 0; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 05249eb45082..df9c80bf621d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -971,6 +971,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ct = nf_ct_get(skb, &ctinfo); if (ct) { + bool add_helper = false; + /* Packets starting a new connection must be NATted before the * helper, so that the helper knows about the NAT. We enforce * this by delaying both NAT and helper calls for unconfirmed @@ -988,16 +990,17 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Userspace may decide to perform a ct lookup without a helper - * specified followed by a (recirculate and) commit with one. - * Therefore, for unconfirmed connections which we will commit, - * we need to attach the helper here. + * specified followed by a (recirculate and) commit with one, + * or attach a helper in a later commit. Therefore, for + * connections which we will commit, we may need to attach + * the helper here. */ - if (!nf_ct_is_confirmed(ct) && info->commit && - info->helper && !nfct_help(ct)) { + if (info->commit && info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) return err; + add_helper = true; /* helper installed, add seqadj if NAT is required */ if (info->nat && !nfct_seqadj(ct)) { @@ -1007,11 +1010,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") for a - * confirmed connection, or + * - nf_conntrack_in() was executed above ("!cached") or a + * helper was just attached ("add_helper") for a confirmed + * connection, or * - When committing an unconfirmed connection. */ - if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : + info->commit) && ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d8c364d637b1..2088619c03f0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -227,7 +227,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -1575,6 +1576,31 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) return 0; } +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) +{ + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1583,7 +1609,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) @@ -1596,35 +1622,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_free_reply; + goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) - goto err_free_dp; + goto err_destroy_dp; - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; err = ovs_meters_init(dp); if (err) - goto err_destroy_ports_array; + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1656,6 +1673,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } + ovs_unlock(); goto err_destroy_meters; } @@ -1672,17 +1690,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) return 0; err_destroy_meters: - ovs_unlock(); ovs_meters_exit(dp); -err_destroy_ports_array: +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); -err_free_dp: +err_destroy_dp: kfree(dp); -err_free_reply: +err_destroy_reply: kfree_skb(reply); err: return err; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 38147e6a20f5..9d375e74b607 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -637,27 +637,35 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - size_t stack_len = MPLS_HLEN; + u8 label_count = 1; + memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; - error = check_header(skb, skb->mac_len + stack_len); + error = check_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (unlikely(error)) return 0; memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); - if (stack_len == MPLS_HLEN) - memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + if (label_count <= MPLS_LABEL_DEPTH) + memcpy(&key->mpls.lse[label_count - 1], &lse, + MPLS_HLEN); - skb_set_inner_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (lse & htonl(MPLS_LS_S_MASK)) break; - stack_len += MPLS_HLEN; + label_count++; } + if (label_count > MPLS_LABEL_DEPTH) + label_count = MPLS_LABEL_DEPTH; + + key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index b830d5ff7af4..fd8ed766bdd1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -30,6 +30,7 @@ enum sw_flow_mac_proto { MAC_PROTO_ETHERNET, }; #define SW_FLOW_KEY_INVALID 0x80 +#define MPLS_LABEL_DEPTH 3 /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length @@ -85,9 +86,6 @@ struct sw_flow_key { */ union { struct { - __be32 top_lse; /* top label stack entry */ - } mpls; - struct { u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ u8 tos; /* IP ToS. */ u8 ttl; /* IP TTL/hop limit. */ @@ -135,6 +133,11 @@ struct sw_flow_key { } nd; }; } ipv6; + struct { + u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ + __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ + } mpls; + struct ovs_key_nsh nsh; /* network service header */ }; struct { @@ -166,7 +169,6 @@ struct sw_flow_key_range { struct sw_flow_mask { int ref_count; struct rcu_head rcu; - struct list_head list; struct sw_flow_key_range range; struct sw_flow_key key; }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d7559c64795d..65c2e3458ff5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -424,7 +424,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, - [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, @@ -1628,10 +1628,25 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; + u32 hdr_len; + u32 label_count, label_count_mask, i; mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); - SW_FLOW_KEY_PUT(match, mpls.top_lse, - mpls_key->mpls_lse, is_mask); + hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); + label_count = hdr_len / sizeof(struct ovs_key_mpls); + + if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || + hdr_len % sizeof(struct ovs_key_mpls)) + return -EINVAL; + + label_count_mask = GENMASK(label_count - 1, 0); + + for (i = 0 ; i < label_count; i++) + SW_FLOW_KEY_PUT(match, mpls.lse[i], + mpls_key[i].mpls_lse, is_mask); + + SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, + label_count_mask, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_MPLS); } @@ -2114,13 +2129,18 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); } else if (eth_p_mpls(swkey->eth.type)) { + u8 i, num_labels; struct ovs_key_mpls *mpls_key; - nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + num_labels = hweight_long(output->mpls.num_labels_mask); + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, + num_labels * sizeof(*mpls_key)); if (!nla) goto nla_put_failure; + mpls_key = nla_data(nla); - mpls_key->mpls_lse = output->mpls.top_lse; + for (i = 0; i < num_labels; i++) + mpls_key[i].mpls_lse = output->mpls.lse[i]; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -2406,13 +2426,14 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log); + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -2463,7 +2484,7 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return err; err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2478,7 +2499,7 @@ static int validate_and_copy_clone(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { int start, err; u32 exec; @@ -2498,7 +2519,7 @@ static int validate_and_copy_clone(struct net *net, return err; err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2864,6 +2885,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last) { const struct nlattr *acts_if_greater, *acts_if_lesser_eq; @@ -2912,7 +2934,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2925,7 +2947,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2952,7 +2974,8 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; @@ -3065,25 +3088,36 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, !eth_p_mpls(eth_type))) return -EINVAL; eth_type = mpls->mpls_ethertype; + mpls_label_count++; break; } - case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_MPLS: { + __be16 proto; if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; - /* Disallow subsequent L2.5+ set and mpls_pop actions - * as there is no check here to ensure that the new - * eth_type is valid and thus set actions could - * write off the end of the packet or otherwise - * corrupt it. + /* Disallow subsequent L2.5+ set actions and mpls_pop + * actions once the last MPLS label in the packet is + * is popped as there is no check here to ensure that + * the new eth type is valid and thus set actions could + * write off the end of the packet or otherwise corrupt + * it. * * Support for these actions is planned using packet * recirculation. */ - eth_type = htons(0); + proto = nla_get_be16(a); + mpls_label_count--; + + if (!eth_p_mpls(proto) || !mpls_label_count) + eth_type = htons(0); + else + eth_type = proto; + break; + } case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, @@ -3106,6 +3140,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_sample(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3176,6 +3211,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_clone(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3188,8 +3224,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_check_pkt_len(net, a, key, sfa, eth_type, - vlan_tci, log, - last); + vlan_tci, + mpls_label_count, + log, last); if (err) return err; skip_copy = true; @@ -3219,14 +3256,18 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { int err; + u32 mpls_label_count = 0; *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); + if (eth_p_mpls(key->eth.type)) + mpls_label_count = hweight_long(key->mpls.num_labels_mask); + (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, log); + key->eth.vlan.tci, mpls_label_count, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf3582c5ed70..5904e93e5765 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -34,8 +34,13 @@ #include <net/ndisc.h> #define TBL_MIN_BUCKETS 1024 +#define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_HASH_SHIFT 8 +#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) +#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) + static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; @@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static struct mask_array *tbl_mask_array_alloc(int size) +{ + struct mask_array *new; + + size = max(MASK_ARRAY_SIZE_MIN, size); + new = kzalloc(sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + if (!new) + return NULL; + + new->count = 0; + new->max = size; + + return new; +} + +static int tbl_mask_array_realloc(struct flow_table *tbl, int size) +{ + struct mask_array *old; + struct mask_array *new; + + new = tbl_mask_array_alloc(size); + if (!new) + return -ENOMEM; + + old = ovsl_dereference(tbl->mask_array); + if (old) { + int i; + + for (i = 0; i < old->max; i++) { + if (ovsl_dereference(old->masks[i])) + new->masks[new->count++] = old->masks[i]; + } + } + + rcu_assign_pointer(tbl->mask_array, new); + kfree_rcu(old, rcu); + + return 0; +} + +static int tbl_mask_array_add_mask(struct flow_table *tbl, + struct sw_flow_mask *new) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int err, ma_count = READ_ONCE(ma->count); + + if (ma_count >= ma->max) { + err = tbl_mask_array_realloc(tbl, ma->max + + MASK_ARRAY_SIZE_MIN); + if (err) + return err; + + ma = ovsl_dereference(tbl->mask_array); + } + + BUG_ON(ovsl_dereference(ma->masks[ma_count])); + + rcu_assign_pointer(ma->masks[ma_count], new); + WRITE_ONCE(ma->count, ma_count +1); + + return 0; +} + +static void tbl_mask_array_del_mask(struct flow_table *tbl, + struct sw_flow_mask *mask) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i, ma_count = READ_ONCE(ma->count); + + /* Remove the deleted mask pointers from the array */ + for (i = 0; i < ma_count; i++) { + if (mask == ovsl_dereference(ma->masks[i])) + goto found; + } + + BUG(); + return; + +found: + WRITE_ONCE(ma->count, ma_count -1); + + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); + RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + + kfree_rcu(mask, rcu); + + /* Shrink the mask array if necessary. */ + if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && + ma_count <= (ma->max / 3)) + tbl_mask_array_realloc(tbl, ma->max / 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) + tbl_mask_array_del_mask(tbl, mask); + } +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_array *ma; - ti = table_instance_alloc(TBL_MIN_BUCKETS); + table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * + MC_HASH_ENTRIES, + __alignof__(struct mask_cache_entry)); + if (!table->mask_cache) + return -ENOMEM; + ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); + if (!ma) + goto free_mask_cache; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) - return -ENOMEM; + goto free_mask_array; ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ufid_ti) @@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); - INIT_LIST_HEAD(&table->mask_list); + rcu_assign_pointer(table->mask_array, ma); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); +free_mask_array: + kfree(ma); +free_mask_cache: + free_percpu(table->mask_cache); return -ENOMEM; } @@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) __table_instance_destroy(ti); } -static void table_instance_destroy(struct table_instance *ti, +static void table_instance_flow_free(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow, + bool count) +{ + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + if (count) + table->count--; + + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + + if (count) + table->ufid_count--; + } + + flow_mask_remove(table, flow->mask); +} + +static void table_instance_destroy(struct flow_table *table, + struct table_instance *ti, struct table_instance *ufid_ti, bool deferred) { @@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti, struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; - int ver = ti->node_ver; - int ufid_ver = ufid_ti->node_ver; - hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { - hlist_del_rcu(&flow->flow_table.node[ver]); - if (ovs_identifier_is_ufid(&flow->id)) - hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + hlist_for_each_entry_safe(flow, n, head, + flow_table.node[ti->node_ver]) { + + table_instance_flow_free(table, ti, ufid_ti, + flow, false); ovs_flow_free(flow, deferred); } } @@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table) struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - table_instance_destroy(ti, ufid_ti, false); + free_percpu(table->mask_cache); + kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + table_instance_destroy(table, ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(old_ti, old_ufid_ti, true); + table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); return 0; err_free_ti: @@ -370,13 +520,10 @@ err_free_ti: static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { - int key_start = range->start; - int key_end = range->end; - const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; + const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } @@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; @@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); + (*n_mask_hit)++; + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) @@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, return NULL; } -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, - const struct sw_flow_key *key, - u32 *n_mask_hit) +/* Flow lookup does full lookup on flow table. It starts with + * mask from index passed in *index. + */ +static struct sw_flow *flow_lookup(struct flow_table *tbl, + struct table_instance *ti, + struct mask_array *ma, + const struct sw_flow_key *key, + u32 *n_mask_hit, + u32 *index) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow *flow; struct sw_flow_mask *mask; + int i; + + if (likely(*index < ma->max)) { + mask = rcu_dereference_ovsl(ma->masks[*index]); + if (mask) { + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) + return flow; + } + } + + for (i = 0; i < ma->max; i++) { + + if (i == *index) + continue; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { /* Found */ + *index = i; + return flow; + } + } + + return NULL; +} + +/* + * mask_cache maps flow to probable mask. This cache is not tightly + * coupled cache, It means updates to mask list can result in inconsistent + * cache entry in mask cache. + * This is per cpu cache and is divided in MC_HASH_SEGS segments. + * In case of a hash collision the entry is hashed in next segment. + * */ +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 skb_hash, + u32 *n_mask_hit) +{ + struct mask_array *ma = rcu_dereference(tbl->mask_array); + struct table_instance *ti = rcu_dereference(tbl->ti); + struct mask_cache_entry *entries, *ce; struct sw_flow *flow; + u32 hash; + int seg; *n_mask_hit = 0; - list_for_each_entry_rcu(mask, &tbl->mask_list, list) { - (*n_mask_hit)++; - flow = masked_flow_lookup(ti, key, mask); - if (flow) /* Found */ + if (unlikely(!skb_hash)) { + u32 mask_index = 0; + + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + } + + /* Pre and post recirulation flows usually have the same skb_hash + * value. To avoid hash collisions, rehash the 'skb_hash' with + * 'recirc_id'. */ + if (key->recirc_id) + skb_hash = jhash_1word(skb_hash, key->recirc_id); + + ce = NULL; + hash = skb_hash; + entries = this_cpu_ptr(tbl->mask_cache); + + /* Find the cache entry 'ce' to operate on. */ + for (seg = 0; seg < MC_HASH_SEGS; seg++) { + int index = hash & (MC_HASH_ENTRIES - 1); + struct mask_cache_entry *e; + + e = &entries[index]; + if (e->skb_hash == skb_hash) { + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, + &e->mask_index); + if (!flow) + e->skb_hash = 0; return flow; + } + + if (!ce || e->skb_hash < ce->skb_hash) + ce = e; /* A better replacement cache candidate. */ + + hash >>= MC_HASH_SHIFT; } - return NULL; + + /* Cache miss, do full lookup. */ + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + if (flow) + ce->skb_hash = skb_hash; + + return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 index = 0; - return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); - struct sw_flow_mask *mask; - struct sw_flow *flow; + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i; /* Always called under ovs-mutex. */ - list_for_each_entry(mask, &tbl->mask_list, list) { - flow = masked_flow_lookup(ti, match->key, mask); + for (i = 0; i < ma->max; i++) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 __always_unused n_mask_hit; + struct sw_flow_mask *mask; + struct sw_flow *flow; + + mask = ovsl_dereference(ma->masks[i]); + if (!mask) + continue; + + flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && - ovs_flow_cmp_unmasked_key(flow, match)) + ovs_flow_cmp_unmasked_key(flow, match)) { return flow; + } } + return NULL; } @@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, int ovs_flow_tbl_num_masks(const struct flow_table *table) { - struct sw_flow_mask *mask; - int num = 0; - - list_for_each_entry(mask, &table->mask_list, list) - num++; - - return num; + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + return READ_ONCE(ma->count); } static struct table_instance *table_instance_expand(struct table_instance *ti, @@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti, return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } -/* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - if (mask) { - /* ovs-lock is required to protect mask-refcount and - * mask list. - */ - ASSERT_OVSL(); - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - kfree_rcu(mask, rcu); - } - } -} - /* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { @@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - table->count--; - if (ovs_identifier_is_ufid(&flow->id)) { - hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - table->ufid_count--; - } - - /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be - * accessible as long as the RCU read lock is held. - */ - flow_mask_remove(table, flow->mask); + table_instance_flow_free(table, ti, ufid_ti, flow, true); } static struct sw_flow_mask *mask_alloc(void) @@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a, static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { - struct list_head *ml; + struct mask_array *ma; + int i; + + ma = ovsl_dereference(tbl->mask_array); + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *t; + t = ovsl_dereference(ma->masks[i]); - list_for_each(ml, &tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (mask_equal(mask, m)) - return m; + if (t && mask_equal(mask, t)) + return t; } return NULL; @@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exsits. */ @@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, return -ENOMEM; mask->key = new->key; mask->range = new->range; - list_add_rcu(&mask->list, &tbl->mask_list); + + /* Add mask to mask-list. */ + if (tbl_mask_array_add_mask(tbl, mask)) { + kfree(mask); + return -ENOMEM; + } } else { BUG_ON(!mask->ref_count); mask->ref_count++; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index bc52045b63ff..8a5cea6ae111 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -22,6 +22,17 @@ #include "flow.h" +struct mask_cache_entry { + u32 skb_hash; + u32 mask_index; +}; + +struct mask_array { + struct rcu_head rcu; + int count, max; + struct sw_flow_mask __rcu *masks[]; +}; + struct table_instance { struct hlist_head *buckets; unsigned int n_buckets; @@ -34,7 +45,8 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct list_head mask_list; + struct mask_cache_entry __percpu *mask_cache; + struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; unsigned int ufid_count; @@ -60,8 +72,9 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, - const struct sw_flow_key *, - u32 *n_mask_hit); + const struct sw_flow_key *, + u32 skb_hash, + u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3fc38d16c456..5da9392b03d6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -403,8 +403,9 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) ids = rcu_dereference(vport->upcall_portids); - if (ids->n_ids == 1 && ids->ids[0] == 0) - return 0; + /* If there is only one portid, select it in the fast-path. */ + if (ids->n_ids == 1) + return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 82a50e850245..53c1d41fb1c9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1295,15 +1295,21 @@ static void packet_sock_destruct(struct sock *sk) static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - u32 rxhash; + u32 *history = po->rollover->history; + u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) - if (po->rollover->history[i] == rxhash) + if (READ_ONCE(history[i]) == rxhash) count++; - po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + victim = prandom_u32() % ROLLOVER_HLEN; + + /* Avoid dirtying the cache line if possible */ + if (READ_ONCE(history[victim]) != rxhash) + WRITE_ONCE(history[victim], rxhash); + return count > (ROLLOVER_HLEN >> 1); } diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index e35869e81766..15ce9b642b25 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -111,15 +111,11 @@ static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait) static int qrtr_tun_release(struct inode *inode, struct file *filp) { struct qrtr_tun *tun = filp->private_data; - struct sk_buff *skb; qrtr_endpoint_unregister(&tun->ep); /* Discard all SKBs */ - while (!skb_queue_empty(&tun->queue)) { - skb = skb_dequeue(&tun->queue); - kfree_skb(skb); - } + skb_queue_purge(&tun->queue); kfree(tun); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9de2ae22d583..3fd5f40189bd 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> @@ -107,6 +108,7 @@ static void rds_ib_dev_free(struct work_struct *work) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); + dma_pool_destroy(rds_ibdev->rid_hdrs_pool); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); @@ -182,6 +184,12 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->pd = NULL; goto put_dev; } + rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name, + device->dma_device, + sizeof(struct rds_header), + L1_CACHE_BYTES, 0); + if (!rds_ibdev->rid_hdrs_pool) + goto put_dev; rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); diff --git a/net/rds/ib.h b/net/rds/ib.h index f2b558e8b5ea..6e6f24753998 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -165,8 +165,8 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; struct rm_data_op *i_data_op; - struct rds_header *i_send_hdrs; - dma_addr_t i_send_hdrs_dma; + struct rds_header **i_send_hdrs; + dma_addr_t *i_send_hdrs_dma; struct rds_ib_send_work *i_sends; atomic_t i_signaled_sends; @@ -175,8 +175,8 @@ struct rds_ib_connection { struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - dma_addr_t i_recv_hdrs_dma; + struct rds_header **i_recv_hdrs; + dma_addr_t *i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; @@ -246,6 +246,7 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; + struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ bool use_fastreg; unsigned int max_mrs; @@ -381,7 +382,11 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); - +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs); +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs); #define rds_ib_conn_error(conn, fmt...) \ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 233f1368162b..6b345c858dba 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/slab.h> @@ -439,6 +440,68 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) rds_ibdev->vector_load[index]--; } +/* Allocate DMA coherent memory to be used to store struct rds_header for + * sending/receiving packets. The pointers to the DMA memory and the + * associated DMA addresses are stored in two arrays. + * + * @ibdev: the IB device + * @pool: the DMA memory pool + * @dma_addrs: pointer to the array for storing DMA addresses + * @num_hdrs: number of headers to allocate + * + * It returns the pointer to the array storing the DMA memory pointers. On + * error, NULL pointer is returned. + */ +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs) +{ + struct rds_header **hdrs; + dma_addr_t *hdr_daddrs; + u32 i; + + hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdrs) + return NULL; + + hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdr_daddrs) { + kvfree(hdrs); + return NULL; + } + + for (i = 0; i < num_hdrs; i++) { + hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]); + if (!hdrs[i]) { + rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i); + return NULL; + } + } + + *dma_addrs = hdr_daddrs; + return hdrs; +} + +/* Free the DMA memory used to store struct rds_header. + * + * @pool: the DMA memory pool + * @hdrs: pointer to the array storing DMA memory pointers + * @dma_addrs: pointer to the array storing DMA addresses + * @num_hdars: number of headers to free. + */ +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs) +{ + u32 i; + + for (i = 0; i < num_hdrs; i++) + dma_pool_free(pool, hdrs[i], dma_addrs[i]); + kvfree(hdrs); + kvfree(dma_addrs); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -451,6 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; int ret, fr_queue_space; + struct dma_pool *pool; /* * It's normal to see a null device if an incoming connection races @@ -541,31 +605,28 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_cq_out; } - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); + pool = rds_ibdev->rid_hdrs_pool; + ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); if (!ic->i_send_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); + rdsdebug("DMA send hdrs alloc failed\n"); goto qp_out; } - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); + ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); if (!ic->i_recv_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); + rdsdebug("DMA recv hdrs alloc failed\n"); goto send_hdrs_dma_out; } - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); + ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL, + &ic->i_ack_dma); if (!ic->i_ack) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); + rdsdebug("DMA ack header alloc failed\n"); goto recv_hdrs_dma_out; } @@ -596,17 +657,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn) sends_out: vfree(ic->i_sends); + ack_dma_out: - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + recv_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, ic->i_recv_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + send_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, ic->i_send_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + qp_out: rdma_destroy_qp(ic->i_cm_id); recv_cq_out: @@ -984,8 +1051,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_cm_id ? ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); err = rdma_disconnect(ic->i_cm_id); if (err) { @@ -1035,24 +1100,39 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ib_destroy_cq(ic->i_recv_cq); } - /* then free the resources that ib callbacks use */ - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + if (ic->rds_ibdev) { + struct dma_pool *pool; + + pool = ic->rds_ibdev->rid_hdrs_pool; + + /* then free the resources that ib callbacks use */ + if (ic->i_send_hdrs) { + rds_dma_hdrs_free(pool, ic->i_send_hdrs, + ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + } + + if (ic->i_recv_hdrs) { + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, + ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + } + + if (ic->i_ack) { + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + } + } else { + WARN_ON(ic->i_send_hdrs); + WARN_ON(ic->i_send_hdrs_dma); + WARN_ON(ic->i_recv_hdrs); + WARN_ON(ic->i_recv_hdrs_dma); + WARN_ON(ic->i_ack); + } if (ic->i_sends) rds_ib_send_clear_ring(ic); @@ -1071,9 +1151,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_pd = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; } BUG_ON(ic->rds_ibdev); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a0f99bbf362c..694d411dc72f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -61,7 +61,7 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.num_sge = RDS_IB_RECV_SGE; sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -343,7 +343,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, WARN_ON(ret != 1); sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs]; sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; @@ -861,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -993,10 +993,11 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), + wc->vendor_err); } /* rds_ib_process_recv() doesn't always consume the frag, and diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dfe6237dafe2..d1cc1d7778d8 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -201,7 +201,8 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_send_hdrs_dma[i]; + sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -300,10 +301,10 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), wc->vendor_err); } } @@ -631,11 +632,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_queued = jiffies; send->s_op = NULL; - send->s_sge[0].addr = ic->i_send_hdrs_dma - + (pos * sizeof(struct rds_header)); + send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; + send->s_sge[0].length = sizeof(struct rds_header); - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, + sizeof(struct rds_header)); + /* Set up the data, if present */ if (i < work_alloc @@ -674,7 +677,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); if (ic->i_flowctl && adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; + struct rds_header *hdr = ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ hdr->h_credit = adv_credits; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 6a0df7c8a939..46b8ff24020d 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -906,7 +906,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); out_release: release_sock(sk); @@ -1011,7 +1011,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); rose_insert_socket(make); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 64830d8c1fdb..452163eadb98 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -209,6 +209,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { + const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); @@ -230,6 +231,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } _leave(" = %p", peer); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 69d4676a402f..bda1ba25c59e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_PKT64 */ + + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ @@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats) + int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; + p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -451,6 +454,17 @@ err1: } EXPORT_SYMBOL(tcf_idr_create); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags) +{ + /* Set cpustats according to actions flags. */ + return tcf_idr_create(tn, index, est, a, ops, bind, + !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); +} +EXPORT_SYMBOL(tcf_idr_create_from_flags); + void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + if (a->tcfa_flags) { + struct nla_bitfield32 flags = { a->tcfa_flags, + a->tcfa_flags, }; + + if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) + goto nla_put_failure; + } + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -831,12 +853,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, + [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_flags_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -845,6 +870,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { + struct nla_bitfield32 flags = { 0, 0 }; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -876,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); @@ -914,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, extack); + rtnl_held, tp, flags.value, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, extack); + tp, flags.value, extack); if (err < 0) goto err_mod; @@ -989,6 +1017,29 @@ err: return err; } +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw) +{ + if (a->cpu_bstats) { + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (drop) + this_cpu_ptr(a->cpu_qstats)->drops += packets; + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + return; + } + + _bstats_update(&a->tcfa_bstats, bytes, packets); + if (drop) + a->tcfa_qstats.drops += packets; + if (hw) + _bstats_update(&a->tcfa_bstats_hw, bytes, packets); +} +EXPORT_SYMBOL(tcf_action_update_stats); + int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 04b7bd4ec751..46f47e58b3be 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, - &act_bpf_ops, bind, true); + &act_bpf_ops, bind, true, 0); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2b43cacf82af..43a243081e7d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false); + &act_connmark_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d3cfad88dc3a..16e67e1c1db1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_new; @@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_csum_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_csum_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); - bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); + tcf_action_update_bstats(&p->common, skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) @@ -624,7 +624,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&p->common); action = TC_ACT_SHOT; goto out; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index fcc46025e790..68d6af56b243 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -465,11 +465,11 @@ out_push: skb_push_rcsum(skb, nh_ofs); out: - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + tcf_action_update_bstats(&c->common, skb); return retval; drop: - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + tcf_action_inc_drop_qstats(&c->common); return TC_ACT_SHOT; } @@ -656,7 +656,7 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); @@ -688,8 +688,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return err; if (!err) { - err = tcf_idr_create(tn, index, est, a, - &act_ct_ops, bind, true); + err = tcf_idr_create_from_flags(tn, index, est, a, + &act_ct_ops, bind, flags); if (err) { tcf_idr_cleanup(tn, index); return err; @@ -905,11 +905,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, { struct tcf_ct *c = to_ct(a); - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 0dbcfd1dca7b..b1e601007242 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); @@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false); + &act_ctinfo_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 324f1d1f6d47..416065772719 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gact_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gact_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, action = gact_rand[ptype](gact); } #endif - bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); + tcf_action_update_bstats(&gact->common, skb); if (action == TC_ACT_SHOT) - qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&gact->common); tcf_lastuse_update(&gact->tcf_tm); @@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, - packets); - if (action == TC_ACT_SHOT) - this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - - if (hw) - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), - bytes, packets); - + tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3a31e241c647..d562c88cccbe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, - bind, true); + bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 214a03d405cf..400a2cfe8452 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -95,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind, - struct tcf_proto *tp) + struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -144,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, - false); + false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -205,19 +205,19 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool unlocked, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 08923b21e566..b6e1b5bbb4da 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } - ret = tcf_idr_create(tn, index, est, a, - &act_mirred_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -231,7 +231,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, } tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + tcf_action_update_bstats(&m->common, skb); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); @@ -289,8 +289,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - res->qstats = this_cpu_ptr(m->common.cpu_qstats); - skb_tc_reinsert(skb, res); + if (skb_tc_reinsert(skb, res)) + tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } @@ -303,7 +303,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (err) { out: - qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); + tcf_action_inc_overlimit_qstats(&m->common); if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } @@ -318,10 +318,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 4cf6c553bb0b..4d8c822b6aca 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -131,7 +131,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); struct nlattr *tb[TCA_MPLS_MAX + 1]; @@ -224,7 +225,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_mpls_ops, bind, true); + &act_mpls_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index ea4c5359e7df..855a6fa16a62 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false); + &act_nat_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, icmph = (void *)(skb_network_header(skb) + ihl); - if ((icmph->type != ICMP_DEST_UNREACH) && - (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) + if (!icmp_is_err(icmph->type)) break; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index cdfaa79382a2..d5eff6ac17a9 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -137,7 +137,8 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; @@ -190,7 +191,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false); + &act_pedit_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); goto out_free; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 89c04c52af3d..d96271590268 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -47,7 +47,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; @@ -87,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, - &act_police_ops, bind, true); + &act_police_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -294,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a, struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -345,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(police->tcf_tm.expires); + tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 514456a0b9a8..29b23bfaf10d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_sample_ops, bind, true); + &act_sample_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6120e56117ca..9813ca4006dd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ - pr_info("simple: %s_%d\n", + pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_simp_ops, bind, false); + &act_simp_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6a8d3337c577..5f7ca7f89ca2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbedit_ops, bind, true); + &act_skbedit_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 888437f97ba6..39e6d94cfafb 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); @@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbmod_ops, bind, true); + &act_skbmod_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2f83a79f76aa..cb34e5d57aaa 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -31,7 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); - bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { @@ -208,7 +208,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); @@ -347,8 +347,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_tunnel_key_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_tunnel_key_ops, bind, + act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 08aaf719a70f..b6939abc61eb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, u16 tci; tcf_lastuse_update(&v->tcf_tm); - bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); + tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -88,7 +88,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } @@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_vlan_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -307,10 +308,7 @@ static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3177dcb17316..d99966a55c84 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl) *err = -1; return; } - dst->value = sk->sk_ack_backlog; + dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) @@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl) *err = -1; return; } - dst->value = sk->sk_max_ack_backlog; + dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 98dd87ce1510..b1c7e726ce5d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -530,8 +530,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } - if (time_next_packet && - (s64)(now - time_next_packet - q->ce_threshold) > 0) { + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index c261c0a18868..968519ff36e9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -14,7 +14,6 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8769b4b8807d..5ab696efca95 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -382,13 +382,8 @@ void __qdisc_run(struct Qdisc *q) int packets; while (qdisc_restart(q, &packets)) { - /* - * Ordered by possible occurrence: Postpone processing if - * 1. we've exceeded packet quota - * 2. another process needs the CPU; - */ quota -= packets; - if (quota <= 0 || need_resched()) { + if (quota <= 0) { __netif_schedule(q); break; } @@ -657,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else { - qdisc->empty = true; + WRITE_ONCE(qdisc->empty, true); } return skb; @@ -1214,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head) /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { - while (some_qdisc_is_busy(dev)) - yield(); + while (some_qdisc_is_busy(dev)) { + /* wait_event() would avoid this sleep-loop but would + * require expensive checks in the fast paths of packet + * processing which isn't worth it. + */ + schedule_timeout_uninterruptible(1); + } /* The new qdisc is assigned at this point so we can safely * unwind stale skb lists and qdisc statistics */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index d2ffc9a0ba3a..8f8d18abd013 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -86,6 +86,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; asoc->pf_retrans = sp->pf_retrans; + asoc->ps_retrans = sp->ps_retrans; + asoc->pf_expose = sp->pf_expose; asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); @@ -324,7 +326,7 @@ void sctp_association_free(struct sctp_association *asoc) * socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); } /* Mark as dead, so other users can know this structure is @@ -429,6 +431,8 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; + sctp_ulpevent_nofity_peer_addr_change(transport, + SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ memcpy(&asoc->peer.primary_addr, &transport->ipaddr, @@ -569,6 +573,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -624,6 +629,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, /* And the partial failure retrans threshold */ peer->pf_retrans = asoc->pf_retrans; + /* And the primary path switchover retrans threshold */ + peer->ps_retrans = asoc->ps_retrans; /* Initialize the peer's SACK delay timeout based on the * association configured value. @@ -707,6 +714,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { sctp_assoc_set_primary(asoc, peer); @@ -781,9 +790,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error) { - struct sctp_ulpevent *event; - struct sockaddr_storage addr; - int spc_state = 0; + int spc_state = SCTP_ADDR_AVAILABLE; bool ulp_notify = true; /* Record the transition on the transport. */ @@ -793,19 +800,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to heartbeat success, report the SCTP_ADDR_CONFIRMED * state to the user, otherwise report SCTP_ADDR_AVAILABLE. */ - if (SCTP_UNCONFIRMED == transport->state && - SCTP_HEARTBEAT_SUCCESS == error) - spc_state = SCTP_ADDR_CONFIRMED; - else - spc_state = SCTP_ADDR_AVAILABLE; - /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1 MTU, see SCTP - * Quick failover draft section 5.1, point 5 - */ - if (transport->state == SCTP_PF) { + if (transport->state == SCTP_PF && + asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; - transport->cwnd = asoc->pathmtu; - } + else if (transport->state == SCTP_UNCONFIRMED && + error == SCTP_HEARTBEAT_SUCCESS) + spc_state = SCTP_ADDR_CONFIRMED; + transport->state = SCTP_ACTIVE; break; @@ -814,19 +815,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to inactive state. Also, release the cached route since * there may be a better route next time. */ - if (transport->state != SCTP_UNCONFIRMED) + if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; - else { + spc_state = SCTP_ADDR_UNREACHABLE; + } else { sctp_transport_dst_release(transport); ulp_notify = false; } - - spc_state = SCTP_ADDR_UNREACHABLE; break; case SCTP_TRANSPORT_PF: transport->state = SCTP_PF; - ulp_notify = false; + if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) + ulp_notify = false; + else + spc_state = SCTP_ADDR_POTENTIALLY_FAILED; break; default: @@ -836,16 +839,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, /* Generate and send a SCTP_PEER_ADDR_CHANGE notification * to the user. */ - if (ulp_notify) { - memset(&addr, 0, sizeof(struct sockaddr_storage)); - memcpy(&addr, &transport->ipaddr, - transport->af_specific->sockaddr_len); - - event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, - 0, spc_state, error, GFP_ATOMIC); - if (event) - asoc->stream.si->enqueue_event(&asoc->ulpq, event); - } + if (ulp_notify) + sctp_ulpevent_nofity_peer_addr_change(transport, + spc_state, error); /* Select new active and retran paths. */ sctp_select_active_and_retran_path(asoc); @@ -1077,7 +1073,7 @@ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) /* Decrement the backlog value for a TCP-style socket. */ if (sctp_style(oldsk, TCP)) - oldsk->sk_ack_backlog--; + sk_acceptq_removed(oldsk); /* Release references to the old endpoint and the sock. */ sctp_endpoint_put(assoc->ep); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc0405c79dfc..cc3ce5d80b08 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; - int error = 0, notify; - - /* If we failed, we may need to notify. */ - notify = msg->send_failed ? -1 : 0; + int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); - /* Check whether we _really_ need to notify. */ - if (notify < 0) { - asoc = chunk->asoc; - if (msg->send_error) - error = msg->send_error; - else - error = asoc->outqueue.error; - - notify = sctp_ulpevent_type_enabled(asoc->subscribe, - SCTP_SEND_FAILED); + + if (!msg->send_failed) { + sctp_chunk_put(chunk); + continue; } - /* Generate a SEND FAILED event only if enabled. */ - if (notify > 0) { - int sent; - if (chunk->has_tsn) - sent = SCTP_DATA_SENT; - else - sent = SCTP_DATA_UNSENT; + asoc = chunk->asoc; + error = msg->send_error ?: asoc->outqueue.error; + sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED_EVENT)) { + ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, + sent, error, + GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_chunk_put(chunk); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 0851166b9175..8a15146faaeb 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -425,8 +425,8 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index ea53049d1db6..9d05b2e7bce2 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -164,7 +164,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog++; + sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 08d14d86ecfb..fbbf19128c2d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1217,9 +1217,15 @@ static int __net_init sctp_defaults_init(struct net *net) /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; + /* Disable of Primary Path Switchover by default */ + net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; + /* Enable pf state by default */ net->sctp.pf_enable = 1; + /* Ignore pf exposure feature by default */ + net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e52b2128e43b..acd737d4c0e0 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -567,6 +567,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, SCTP_FAILED_THRESHOLD); } + if (transport->error_count > transport->ps_retrans && + asoc->peer.primary_path == transport && + asoc->peer.active_path != transport) + sctp_assoc_set_primary(asoc, asoc->peer.active_path); + /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ffd3262b7a41..83e4ca1fabda 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3943,18 +3943,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, char __user *optval, - unsigned int optlen) + unsigned int optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int len; - if (optlen < sizeof(struct sctp_paddrthlds)) + len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (optlen < len) return -EINVAL; - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, - sizeof(struct sctp_paddrthlds))) + if (copy_from_user(&val, optval, len)) return -EFAULT; + if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + return -EINVAL; + if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); @@ -3963,6 +3967,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; return 0; @@ -3978,17 +3984,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, transports) { if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; } if (val.spt_pathmaxrxt) asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + asoc->ps_retrans = val.spt_pathcpthld; asoc->pf_retrans = val.spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val.spt_pathmaxrxt) sp->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + sp->ps_retrans = val.spt_pathcpthld; sp->pf_retrans = val.spt_pathpfthld; } @@ -4589,6 +4601,40 @@ out: return retval; } +static int sctp_setsockopt_pf_expose(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + if (asoc) + asoc->pf_expose = params.assoc_value; + else + sctp_sk(sk)->pf_expose = params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4744,7 +4790,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); @@ -4798,6 +4849,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5041,6 +5095,8 @@ static int sctp_init_sock(struct sock *sk) sp->hbinterval = net->sctp.hb_interval; sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; + sp->ps_retrans = net->sctp.ps_retrans; + sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; @@ -5521,8 +5577,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); - if (!transport) - return -EINVAL; + if (!transport) { + retval = -EINVAL; + goto out; + } + + if (transport->state == SCTP_PF && + transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { + retval = -EACCES; + goto out; + } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; @@ -7170,18 +7234,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, - int len, - int __user *optlen) + char __user *optval, int len, + int __user *optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int min; - if (len < sizeof(struct sctp_paddrthlds)) + min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (len < min) return -EINVAL; - len = sizeof(struct sctp_paddrthlds); - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len)) + len = min; + if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { @@ -7192,6 +7257,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; + val.spt_pathcpthld = trans->ps_retrans; goto out; } @@ -7204,11 +7270,13 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; + val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; + val.spt_pathcpthld = sp->ps_retrans; } out: @@ -7900,6 +7968,45 @@ out: return retval; } +static int sctp_getsockopt_pf_expose(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->pf_expose + : sctp_sk(sk)->pf_expose; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8049,7 +8156,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen); + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); @@ -8112,6 +8224,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -8376,7 +8491,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } } - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); return sctp_hash_endpoint(ep); } @@ -8430,7 +8545,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 238cf1737576..4740aa70e652 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ static int rto_alpha_min = 0; static int rto_beta_min = 0; static int rto_alpha_max = 1000; static int rto_beta_max = 1000; +static int pf_expose_max = SCTP_PF_EXPOSE_MAX; +static int ps_retrans_max = SCTP_PS_RETRANS_MAX; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -212,7 +214,16 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, + .extra2 = &init_net.sctp.ps_retrans, + }, + { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, }, { .procname = "sndbuf_policy", @@ -318,6 +329,15 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pf_expose", + .data = &init_net.sctp.pf_expose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &pf_expose_max, + }, { /* sentinel */ } }; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e0cc1edf49a0..c82dbdcf13f2 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -238,7 +238,7 @@ fail: * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( +static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) @@ -336,6 +336,22 @@ fail: return NULL; } +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error) +{ + struct sctp_association *asoc = transport->asoc; + struct sockaddr_storage addr; + struct sctp_ulpevent *event; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, + error, GFP_ATOMIC); + if (event) + asoc->stream.si->enqueue_event(&asoc->ulpq, event); +} + /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the @@ -511,6 +527,45 @@ fail: return NULL; } +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, gfp_t gfp) +{ + struct sctp_send_failed_event *ssf; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int len; + + skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); + if (!skb) + return NULL; + + len = ntohs(chunk->chunk_hdr->length); + len -= sctp_datachk_len(&asoc->stream); + + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + ssf = skb_push(skb, sizeof(*ssf)); + ssf->ssf_type = SCTP_SEND_FAILED_EVENT; + ssf->ssf_flags = flags; + ssf->ssf_length = sizeof(*ssf) + len; + skb_trim(skb, ssf->ssf_length); + ssf->ssf_error = error; + + ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; + ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; + ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; + ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; + ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; + + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + + return event; +} + /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 47946f489fd4..b7d9fd285c71 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -174,6 +174,7 @@ static int smc_release(struct socket *sock) if (!sk) goto out; + sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ @@ -196,6 +197,7 @@ static int smc_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -977,12 +979,14 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; __smc_release(smc); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } diff --git a/net/smc/smc.h b/net/smc/smc.h index 878313f8d6c1..be11ba41190f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,6 +188,7 @@ struct smc_connection { * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..7dc07ec2379b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (!conn->alert_token_local) + if (conn->killed) /* abnormal termination */ rc = -EPIPE; return rc; @@ -328,7 +328,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) struct smcd_cdc_msg cdc; struct smc_sock *smc; - if (!conn) + if (!conn || conn->killed) return; data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fc06720b53c1..d34e5adce2eb 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -13,6 +13,7 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_tx.h" @@ -65,8 +66,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - (sk->sk_err == ECONNABORTED) || - (sk->sk_err == ECONNRESET), + sk->sk_err == ECONNABORTED || + sk->sk_err == ECONNRESET || + smc->conn.killed, &wait); if (rc) break; @@ -95,11 +97,13 @@ static int smc_close_final(struct smc_connection *conn) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } -static int smc_close_abort(struct smc_connection *conn) +int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; @@ -109,19 +113,15 @@ static int smc_close_abort(struct smc_connection *conn) /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ -static void smc_close_active_abort(struct smc_sock *smc) +void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; - - struct smc_cdc_conn_state_flags *txflags = - &smc->conn.local_tx_ctrl.conn_state_flags; + bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); - } + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: @@ -129,35 +129,29 @@ static void smc_close_active_abort(struct smc_sock *smc) release_sock(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); + sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - if (!smc_cdc_rxed_any_close(&smc->conn)) - sk->sk_state = SMC_PEERABORTWAIT; - else - sk->sk_state = SMC_CLOSED; release_sock(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: - if (!txflags->peer_conn_closed) { - /* just SHUTDOWN_SEND done */ - sk->sk_state = SMC_PEERABORTWAIT; - } else { - sk->sk_state = SMC_CLOSED; - } + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_CLOSED; break; - case SMC_PEERFINCLOSEWAIT: - sock_put(sk); /* passive closing */ - break; case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: @@ -166,6 +160,12 @@ static void smc_close_active_abort(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -215,8 +215,6 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -229,8 +227,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } sk->sk_state = SMC_CLOSED; break; @@ -246,8 +242,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_final(conn); - if (rc) - break; if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; @@ -263,8 +257,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -272,10 +264,12 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - smc_close_abort(conn); + rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; @@ -344,12 +338,6 @@ static void smc_close_passive_work(struct work_struct *work) lock_sock(sk); old_state = sk->sk_state; - if (!conn->alert_token_local) { - /* abnormal termination */ - smc_close_active_abort(smc); - goto wakeup; - } - rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ @@ -451,8 +439,6 @@ again: goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: @@ -466,8 +452,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index e0e3b5df25d2..634fea2b7c95 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -24,5 +24,7 @@ int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2ba97ff325a5..0d92456729ab 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -42,20 +42,40 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - mod_delayed_work(system_wq, &lgr->free_work, - (!lgr->is_smcd && lgr->role == SMC_CLNT) ? - SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); + if (!lgr->freeing && !lgr->freefast) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { - mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); + if (!lgr->freeing && !lgr->freefast) { + lgr->freefast = 1; + mod_delayed_work(system_wq, &lgr->free_work, + SMC_LGR_FREE_DELAY_FAST); + } } /* Register connection's alert token in our lookup structure. @@ -134,6 +154,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } /* Send delete link, either as client to request the initiation @@ -157,48 +178,62 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); + spinlock_t *lgr_lock; + struct smc_link *lnk; bool conns; - spin_lock_bh(&smc_lgr_list.lock); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); return; } - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); /* remove from smc_lgr_list */ - spin_unlock_bh(&smc_lgr_list.lock); + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && !smc_link_send_delete(lnk)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); + spin_unlock_bh(lgr_lock); return; } } + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); - if (!delayed_work_pending(&lgr->free_work)) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); + smc_lgr_free(lgr); +} - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); - } +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); + + smc_lgr_terminate(lgr); } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; + struct list_head *lgr_list; struct smc_link *lnk; + spinlock_t *lgr_lock; u8 rndvec[3]; int rc = 0; int i; @@ -217,6 +252,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; + lgr->terminating = 0; + lgr->freefast = 0; + lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); @@ -228,13 +266,18 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ + get_device(&ini->ism_dev->dev); lgr->peer_gid = ini->ism_gid; lgr->smcd = ini->ism_dev; + lgr_list = &ini->ism_dev->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; } else { /* SMC-R specific settings */ + get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); @@ -245,6 +288,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; if (!ini->ib_dev->initialized) @@ -274,9 +319,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto destroy_qp; } smc->conn.lgr = lgr; - spin_lock_bh(&smc_lgr_list.lock); - list_add(&lgr->list, &smc_lgr_list.list); - spin_unlock_bh(&smc_lgr_list.lock); + spin_lock_bh(lgr_lock); + list_add(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); return 0; destroy_qp: @@ -309,7 +354,7 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd) { + if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], @@ -340,9 +385,10 @@ void smc_conn_free(struct smc_connection *conn) } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn, lgr); /* allow buffer reuse */ - conn->lgr = NULL; + if (!list_empty(&lgr->list)) { + smc_lgr_unregister_conn(conn); + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); @@ -433,23 +479,50 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) + if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - else + put_device(&lgr->smcd->dev); + } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + } kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + struct list_head *lgr_list; + spinlock_t *lgr_lock; + + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); +} + +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); } -/* terminate linkgroup abnormally */ +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_close_abort(conn); + conn->killed = 1; + smc_sk_wake_ups(smc); + smc_lgr_unregister_conn(conn); + smc->sk.sk_err = ECONNABORTED; + smc_close_active_abort(smc); +} + +/* terminate link group */ static void __smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; @@ -459,52 +532,65 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ lgr->terminating = 1; - if (!list_empty(&lgr->list)) /* forget lgr */ - list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - write_lock_bh(&lgr->conns_lock); + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { + read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); /* sock_put in close work */ - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - __smc_lgr_unregister_conn(conn); - conn->lgr = NULL; - write_unlock_bh(&lgr->conns_lock); - if (!schedule_work(&conn->close_work)) - sock_put(&smc->sk); - write_lock_bh(&lgr->conns_lock); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } - write_unlock_bh(&lgr->conns_lock); + read_unlock_bh(&lgr->conns_lock); if (!lgr->is_smcd) wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work(lgr); + smc_lgr_schedule_free_work_fast(lgr); } +/* unlink and terminate link group */ void smc_lgr_terminate(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->terminating) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); __smc_lgr_terminate(lgr); - spin_unlock_bh(&smc_lgr_list.lock); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); } spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr); + } } /* Called when SMC-D device is terminated or peer is lost */ @@ -514,20 +600,19 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd && lgr->smcd == dev && - (!peer_gid || lgr->peer_gid == peer_gid) && + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - __smc_lgr_terminate(lgr); list_move(&lgr->list, &lgr_free_list); } } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); + __smc_lgr_terminate(lgr); cancel_delayed_work_sync(&lgr->free_work); if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ smc_ism_signal_shutdown(lgr); @@ -607,10 +692,14 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; + spinlock_t *lgr_lock; int rc = 0; + lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->srv_first_contact) @@ -618,8 +707,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto create; /* determine if an existing link group can be reused */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry(lgr, &smc_lgr_list.list, list) { + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : @@ -639,7 +728,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } write_unlock_bh(&lgr->conns_lock); } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -1027,16 +1116,45 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return 0; } +static void smc_core_going_away(void) +{ + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; + + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); + } + spin_unlock(&smc_ib_devices.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; + } + spin_unlock(&smcd_dev_list.lock); +} + /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_freeing_list); + struct smcd_dev *smcd; + + smc_core_going_away(); spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); spin_unlock_bh(&smc_lgr_list.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + list_splice_init(&smcd->lgr_list, &lgr_freeing_list); + spin_unlock(&smcd_dev_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); if (!lgr->is_smcd) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c00ac61dc129..e6fd1ed42064 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -202,8 +202,11 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + u8 freefast : 1; /* free worker scheduled fast */ + u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ union { @@ -280,6 +283,12 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + if (!lgr->terminating) + schedule_work(&lgr->terminate_work); +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d14ca4af6f94..af05daeb0538 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -242,8 +242,12 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); - if (!smc_ib_port_active(smcibdev, port_idx + 1)) + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); smc_port_terminate(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + } } } @@ -259,8 +263,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: /* terminate all ports on device */ - for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); + } schedule_work(&smcibdev->port_event_work); break; case IB_EVENT_PORT_ERR: @@ -269,6 +275,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, port_idx = ibevent->element.port_num - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + if (ibevent->event == IB_EVENT_PORT_ERR) + set_bit(port_idx, smcibdev->ports_going_away); + else if (ibevent->event == IB_EVENT_PORT_ACTIVE) + clear_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -307,6 +317,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) port_idx = ibevent->element.qp->port - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index da60ab9e8d70..6a0069db6cae 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -47,6 +47,7 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e89e918b88e0..ee7340898cb4 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -286,7 +286,9 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) { @@ -313,6 +315,7 @@ void smcd_unregister_dev(struct smcd_dev *smcd) spin_lock(&smcd_dev_list.lock); list_del(&smcd->list); spin_unlock(&smcd_dev_list.lock); + smcd->going_away = 1; flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); @@ -342,6 +345,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) { struct smc_ism_event_work *wrk; + if (smcd->going_away) + return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4fd60c522802..e1918ffaf125 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_terminate_sched(lgr); } } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 571e6d84da3b..82dedf052d86 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -779,6 +779,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -818,6 +819,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -844,7 +846,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { - if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away) { ini->ism_dev = ismdev; break; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 97e8369002d7..39d7b34d06d2 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; int rc; @@ -210,7 +212,9 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || + cflags->peer_conn_abort || sk->sk_shutdown & RCV_SHUTDOWN || + conn->killed || fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); @@ -314,11 +318,13 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; + if (conn->killed) + break; + if (smc_rx_recvmsg_data_available(smc)) goto copy; - if (sk->sk_shutdown & RCV_SHUTDOWN || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) { + if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6c8f09c1ce51..824f096ee7de 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -86,6 +86,7 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; @@ -155,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; @@ -282,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + if (rc) smc_lgr_terminate(lgr); - } return rc; } @@ -495,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; rc = 0; - if (conn->alert_token_local) /* connection healthy */ - mod_delayed_work(system_wq, &conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } return rc; } @@ -547,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -573,9 +576,7 @@ void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); - if (smc->sk.sk_err || - !conn->alert_token_local || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + if (smc->sk.sk_err) goto out; rc = smc_tx_sndbuf_nonempty(conn); @@ -608,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && - conn->alert_token_local) { /* connection healthy */ + !conn->killed) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 253aa75dc2b6..50743dc56c86 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -101,7 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -191,7 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -247,7 +247,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } return rc; } @@ -272,7 +272,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -373,7 +373,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index b83e16ade4d2..716b61a701a8 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -35,6 +35,21 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y +config TIPC_CRYPTO + bool "TIPC encryption support" + depends on TIPC + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + help + Saying Y here will enable support for TIPC encryption. + All TIPC messages will be encrypted/decrypted by using the currently most + advanced algorithm: AEAD AES-GCM (like IPSec or TLS) before leaving/ + entering the TIPC stack. + Key setting from user-space is performed via netlink by a user program + (e.g. the iproute2 'tipc' tool). + bool + default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index c86aba0282af..11255e970dd4 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -16,6 +16,7 @@ CFLAGS_trace.o += -I$(src) tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o +tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o obj-$(CONFIG_TIPC_DIAG) += diag.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6ef1abdd525f..f41096a759fa 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -84,7 +84,7 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) */ int tipc_bcast_get_mtu(struct net *net) { - return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; + return tipc_link_mss(tipc_bc_sndlink(net)); } void tipc_bcast_disable_rcast(struct net *net) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 0214aa1c4427..d7ec26bd739d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "udp_media.h" #include "trace.h" +#include "crypto.h" #define MAX_ADDR_STR 60 @@ -315,6 +316,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->net_plane = bearer_id + 'A'; b->priority = prio; test_and_set_bit_lock(0, &b->up); + refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -351,6 +353,17 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +bool tipc_bearer_hold(struct tipc_bearer *b) +{ + return (b && refcount_inc_not_zero(&b->refcnt)); +} + +void tipc_bearer_put(struct tipc_bearer *b) +{ + if (b && refcount_dec_and_test(&b->refcnt)) + kfree_rcu(b, rcu); +} + /** * bearer_disable * @@ -369,7 +382,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) if (b->disc) tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); - kfree_rcu(b, rcu); + tipc_bearer_put(b); tipc_mon_delete(net, bearer_id); } @@ -504,10 +517,15 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, rcu_read_lock(); b = bearer_get(net, bearer_id); - if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) - b->media->send_msg(net, skb, b, dest); - else + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dest, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dest); + } else { kfree_skb(skb); + } rcu_read_unlock(); } @@ -515,7 +533,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, */ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst) + struct tipc_media_addr *dst, + struct tipc_node *__dnode) { struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -529,10 +548,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) - b->media->send_msg(net, skb, b, dst); - else + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); + } else { kfree_skb(skb); + } } rcu_read_unlock(); } @@ -543,6 +567,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq) { struct tipc_net *tn = tipc_net(net); + struct tipc_media_addr *dst; int net_id = tn->net_id; struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -557,7 +582,12 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, msg_set_non_seq(hdr, 1); msg_set_mc_netid(hdr, net_id); __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, &b->bcast_addr); + dst = &b->bcast_addr; +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } @@ -584,6 +614,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index ea0f3c49cbed..d0c79cc6c0c2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -165,6 +165,7 @@ struct tipc_bearer { struct tipc_discoverer *disc; char net_plane; unsigned long up; + refcount_t refcnt; }; struct tipc_bearer_names { @@ -210,6 +211,8 @@ int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); +bool tipc_bearer_hold(struct tipc_bearer *b); +void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); @@ -229,7 +232,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst); + struct tipc_media_addr *dst, + struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); diff --git a/net/tipc/core.c b/net/tipc/core.c index 23cb379a93d6..fc01a13d7462 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -44,6 +44,7 @@ #include "socket.h" #include "bcast.h" #include "node.h" +#include "crypto.h" #include <linux/module.h> @@ -68,6 +69,11 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); +#ifdef CONFIG_TIPC_CRYPTO + err = tipc_crypto_start(&tn->crypto_tx, net, NULL); + if (err) + goto out_crypto; +#endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -93,6 +99,11 @@ out_bclink: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tn->crypto_tx); +out_crypto: +#endif return err; } @@ -103,8 +114,20 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tipc_net(net)->crypto_tx); +#endif +} + +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); } +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -151,6 +174,10 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -158,6 +185,8 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -177,6 +206,7 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); diff --git a/net/tipc/core.h b/net/tipc/core.h index 60d829581068..775848a5f27e 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,7 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> +#include <net/netns/hash.h> struct tipc_node; struct tipc_bearer; @@ -67,6 +68,9 @@ struct tipc_link; struct tipc_name_table; struct tipc_topsrv; struct tipc_monitor; +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto; +#endif #define TIPC_MOD_VER "2.0.0" @@ -128,6 +132,11 @@ struct tipc_net { /* Tracing of node internal messages */ struct packet_type loopback_pt; + +#ifdef CONFIG_TIPC_CRYPTO + /* TX crypto handler */ + struct tipc_crypto *crypto_tx; +#endif }; static inline struct tipc_net *tipc_net(struct net *net) @@ -185,6 +194,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c new file mode 100644 index 000000000000..05f7ca76e8ce --- /dev/null +++ b/net/tipc/crypto.c @@ -0,0 +1,1986 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <crypto/aead.h> +#include <crypto/aes.h> +#include "crypto.h" + +#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */ +#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */ +#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ +#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */ +#define TIPC_MAX_TFMS_DEF 10 +#define TIPC_MAX_TFMS_LIM 1000 + +/** + * TIPC Key ids + */ +enum { + KEY_UNUSED = 0, + KEY_MIN, + KEY_1 = KEY_MIN, + KEY_2, + KEY_3, + KEY_MAX = KEY_3, +}; + +/** + * TIPC Crypto statistics + */ +enum { + STAT_OK, + STAT_NOK, + STAT_ASYNC, + STAT_ASYNC_OK, + STAT_ASYNC_NOK, + STAT_BADKEYS, /* tx only */ + STAT_BADMSGS = STAT_BADKEYS, /* rx only */ + STAT_NOKEYS, + STAT_SWITCHES, + + MAX_STATS, +}; + +/* TIPC crypto statistics' header */ +static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", + "async_nok", "badmsgs", "nokeys", + "switches"}; + +/* Max TFMs number per key */ +int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; + +/** + * struct tipc_key - TIPC keys' status indicator + * + * 7 6 5 4 3 2 1 0 + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * key: | (reserved)|passive idx| active idx|pending idx| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + */ +struct tipc_key { +#define KEY_BITS (2) +#define KEY_MASK ((1 << KEY_BITS) - 1) + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 pending:2, + active:2, + passive:2, /* rx only */ + reserved:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:2, + passive:2, /* rx only */ + active:2, + pending:2; +#else +#error "Please fix <asm/byteorder.h>" +#endif + } __packed; + u8 keys; + }; +}; + +/** + * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + */ +struct tipc_tfm { + struct crypto_aead *tfm; + struct list_head list; +}; + +/** + * struct tipc_aead - TIPC AEAD key structure + * @tfm_entry: per-cpu pointer to one entry in TFM list + * @crypto: TIPC crypto owns this key + * @cloned: reference to the source key in case cloning + * @users: the number of the key users (TX/RX) + * @salt: the key's SALT value + * @authsize: authentication tag size (max = 16) + * @mode: crypto mode is applied to the key + * @hint[]: a hint for user key + * @rcu: struct rcu_head + * @seqno: the key seqno (cluster scope) + * @refcnt: the key reference counter + */ +struct tipc_aead { +#define TIPC_AEAD_HINT_LEN (5) + struct tipc_tfm * __percpu *tfm_entry; + struct tipc_crypto *crypto; + struct tipc_aead *cloned; + atomic_t users; + u32 salt; + u8 authsize; + u8 mode; + char hint[TIPC_AEAD_HINT_LEN + 1]; + struct rcu_head rcu; + + atomic64_t seqno ____cacheline_aligned; + refcount_t refcnt ____cacheline_aligned; + +} ____cacheline_aligned; + +/** + * struct tipc_crypto_stats - TIPC Crypto statistics + */ +struct tipc_crypto_stats { + unsigned int stat[MAX_STATS]; +}; + +/** + * struct tipc_crypto - TIPC TX/RX crypto structure + * @net: struct net + * @node: TIPC node (RX) + * @aead: array of pointers to AEAD keys for encryption/decryption + * @peer_rx_active: replicated peer RX active key index + * @key: the key states + * @working: the crypto is working or not + * @stats: the crypto statistics + * @sndnxt: the per-peer sndnxt (TX) + * @timer1: general timer 1 (jiffies) + * @timer2: general timer 1 (jiffies) + * @lock: tipc_key lock + */ +struct tipc_crypto { + struct net *net; + struct tipc_node *node; + struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + atomic_t peer_rx_active; + struct tipc_key key; + u8 working:1; + struct tipc_crypto_stats __percpu *stats; + + atomic64_t sndnxt ____cacheline_aligned; + unsigned long timer1; + unsigned long timer2; + spinlock_t lock; /* crypto lock */ + +} ____cacheline_aligned; + +/* struct tipc_crypto_tx_ctx - TX context for callbacks */ +struct tipc_crypto_tx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; + struct tipc_media_addr dst; +}; + +/* struct tipc_crypto_rx_ctx - RX context for callbacks */ +struct tipc_crypto_rx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; +}; + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); +static inline void tipc_aead_put(struct tipc_aead *aead); +static void tipc_aead_free(struct rcu_head *rp); +static int tipc_aead_users(struct tipc_aead __rcu *aead); +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode); +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg); +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode); +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err); +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b); +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err); +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx); +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending); +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos); +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb); +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr); +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err); +static void tipc_crypto_do_cmd(struct net *net, int cmd); +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf); +#endif + +#define key_next(cur) ((cur) % KEY_MAX + 1) + +#define tipc_aead_rcu_ptr(rcu_ptr, lock) \ + rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ + rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ +do { \ + typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ + lockdep_is_held(lock)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + tipc_aead_put(__tmp); \ +} while (0) + +#define tipc_crypto_key_detach(rcu_ptr, lock) \ + tipc_aead_rcu_replace((rcu_ptr), NULL, lock) + +/** + * tipc_aead_key_validate - Validate a AEAD user key + */ +int tipc_aead_key_validate(struct tipc_aead_key *ukey) +{ + int keylen; + + /* Check if algorithm exists */ + if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { + pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name); + return -ENODEV; + } + + /* Currently, we only support the "gcm(aes)" cipher algorithm */ + if (strcmp(ukey->alg_name, "gcm(aes)")) + return -ENOTSUPP; + + /* Check if key size is correct */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && + keylen != TIPC_AES_GCM_KEY_SIZE_192 && + keylen != TIPC_AES_GCM_KEY_SIZE_256)) + return -EINVAL; + + return 0; +} + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) + tmp = NULL; + rcu_read_unlock(); + + return tmp; +} + +static inline void tipc_aead_put(struct tipc_aead *aead) +{ + if (aead && refcount_dec_and_test(&aead->refcnt)) + call_rcu(&aead->rcu, tipc_aead_free); +} + +/** + * tipc_aead_free - Release AEAD key incl. all the TFMs in the list + * @rp: rcu head pointer + */ +static void tipc_aead_free(struct rcu_head *rp) +{ + struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); + struct tipc_tfm *tfm_entry, *head, *tmp; + + if (aead->cloned) { + tipc_aead_put(aead->cloned); + } else { + head = *this_cpu_ptr(aead->tfm_entry); + list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { + crypto_free_aead(tfm_entry->tfm); + list_del(&tfm_entry->list); + kfree(tfm_entry); + } + /* Free the head */ + crypto_free_aead(head->tfm); + list_del(&head->list); + kfree(head); + } + free_percpu(aead->tfm_entry); + kfree(aead); +} + +static int tipc_aead_users(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + int users = 0; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + users = atomic_read(&tmp->users); + rcu_read_unlock(); + + return users; +} + +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&tmp->users, 1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) +{ + struct tipc_aead *tmp; + int cur; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) { + do { + cur = atomic_read(&tmp->users); + if (cur == val) + break; + } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); + } + rcu_read_unlock(); +} + +/** + * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + */ +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) +{ + struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry); + + *tfm_entry = list_next_entry(*tfm_entry, list); + return (*tfm_entry)->tfm; +} + +/** + * tipc_aead_init - Initiate TIPC AEAD + * @aead: returned new TIPC AEAD key handle pointer + * @ukey: pointer to user key data + * @mode: the key mode + * + * Allocate a (list of) new cipher transformation (TFM) with the specific user + * key data if valid. The number of the allocated TFMs can be set via the sysfs + * "net/tipc/max_tfms" first. + * Also, all the other AEAD data are also initialized. + * + * Return: 0 if the initiation is successful, otherwise: < 0 + */ +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_tfm *tfm_entry, *head; + struct crypto_aead *tfm; + struct tipc_aead *tmp; + int keylen, err, cpu; + int tfm_cnt = 0; + + if (unlikely(*aead)) + return -EEXIST; + + /* Allocate a new AEAD */ + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (unlikely(!tmp)) + return -ENOMEM; + + /* The key consists of two parts: [AES-KEY][SALT] */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + + /* Allocate per-cpu TFM entry pointer */ + tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); + if (!tmp->tfm_entry) { + kzfree(tmp); + return -ENOMEM; + } + + /* Make a list of TFMs with the user key data */ + do { + tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + break; + } + + if (unlikely(!tfm_cnt && + crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { + crypto_free_aead(tfm); + err = -ENOTSUPP; + break; + } + + err |= crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); + err |= crypto_aead_setkey(tfm, ukey->key, keylen); + if (unlikely(err)) { + crypto_free_aead(tfm); + break; + } + + tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); + if (unlikely(!tfm_entry)) { + crypto_free_aead(tfm); + err = -ENOMEM; + break; + } + INIT_LIST_HEAD(&tfm_entry->list); + tfm_entry->tfm = tfm; + + /* First entry? */ + if (!tfm_cnt) { + head = tfm_entry; + for_each_possible_cpu(cpu) { + *per_cpu_ptr(tmp->tfm_entry, cpu) = head; + } + } else { + list_add_tail(&tfm_entry->list, &head->list); + } + + } while (++tfm_cnt < sysctl_tipc_max_tfms); + + /* Not any TFM is allocated? */ + if (!tfm_cnt) { + free_percpu(tmp->tfm_entry); + kzfree(tmp); + return err; + } + + /* Copy some chars from the user key as a hint */ + memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN); + tmp->hint[TIPC_AEAD_HINT_LEN] = '\0'; + + /* Initialize the other data */ + tmp->mode = mode; + tmp->cloned = NULL; + tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); + atomic_set(&tmp->users, 0); + atomic64_set(&tmp->seqno, 0); + refcount_set(&tmp->refcnt, 1); + + *aead = tmp; + return 0; +} + +/** + * tipc_aead_clone - Clone a TIPC AEAD key + * @dst: dest key for the cloning + * @src: source key to clone from + * + * Make a "copy" of the source AEAD key data to the dest, the TFMs list is + * common for the keys. + * A reference to the source is hold in the "cloned" pointer for the later + * freeing purposes. + * + * Note: this must be done in cluster-key mode only! + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) +{ + struct tipc_aead *aead; + int cpu; + + if (!src) + return -ENOKEY; + + if (src->mode != CLUSTER_KEY) + return -EINVAL; + + if (unlikely(*dst)) + return -EEXIST; + + aead = kzalloc(sizeof(*aead), GFP_ATOMIC); + if (unlikely(!aead)) + return -ENOMEM; + + aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); + if (unlikely(!aead->tfm_entry)) { + kzfree(aead); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + *per_cpu_ptr(aead->tfm_entry, cpu) = + *per_cpu_ptr(src->tfm_entry, cpu); + } + + memcpy(aead->hint, src->hint, sizeof(src->hint)); + aead->mode = src->mode; + aead->salt = src->salt; + aead->authsize = src->authsize; + atomic_set(&aead->users, 0); + atomic64_set(&aead->seqno, 0); + refcount_set(&aead->refcnt, 1); + + WARN_ON(!refcount_inc_not_zero(&src->refcnt)); + aead->cloned = src; + + *dst = aead; + return 0; +} + +/** + * tipc_aead_mem_alloc - Allocate memory for AEAD request operations + * @tfm: cipher handle to be registered with the request + * @crypto_ctx_size: size of crypto context for callback + * @iv: returned pointer to IV data + * @req: returned pointer to AEAD request data + * @sg: returned pointer to SG lists + * @nsg: number of SG lists to be allocated + * + * Allocate memory to store the crypto context data, AEAD request, IV and SG + * lists, the memory layout is as follows: + * crypto_ctx || iv || aead_req || sg[] + * + * Return: the pointer to the memory areas in case of success, otherwise NULL + */ +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg) +{ + unsigned int iv_size, req_size; + unsigned int len; + u8 *mem; + + iv_size = crypto_aead_ivsize(tfm); + req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + + len = crypto_ctx_size; + len += iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += nsg * sizeof(**sg); + + mem = kmalloc(len, GFP_ATOMIC); + if (!mem) + return NULL; + + *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, + crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + + return (void *)mem; +} + +/** + * tipc_aead_encrypt - Encrypt a message + * @aead: TIPC AEAD key for the message encryption + * @skb: the input/output skb + * @b: TIPC bearer where the message will be delivered after the encryption + * @dst: the destination media address + * @__dnode: TIPC dest node if "known" + * + * Return: + * 0 : if the encryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the encryption has failed + */ +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct crypto_aead *tfm = tipc_aead_tfm_next(aead); + struct tipc_crypto_tx_ctx *tx_ctx; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, len, tailen, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + /* Make sure message len at least 4-byte aligned */ + len = ALIGN(skb->len, 4); + tailen = len - skb->len + aead->authsize; + + /* Expand skb tail for authentication tag: + * As for simplicity, we'd have made sure skb having enough tailroom + * for authentication tag @skb allocation. Even when skb is nonlinear + * but there is no frag_list, it should be still fine! + * Otherwise, we must cow it to be a writable buffer with the tailroom. + */ +#ifdef TIPC_CRYPTO_DEBUG + SKB_LINEAR_ASSERT(skb); + if (tailen > skb_tailroom(skb)) { + pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n", + skb_tailroom(skb), tailen); + } +#endif + + if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) { + nsg = 1; + trailer = skb; + } else { + /* TODO: We could avoid skb_cow_data() if skb has no frag_list + * e.g. by skb_fill_page_desc() to add another page to the skb + * with the wanted tailen... However, page skbs look not often, + * so take it easy now! + * Cloned skbs e.g. from link_xmit() seems no choice though :( + */ + nsg = skb_cow_data(skb, tailen, &trailer); + if (unlikely(nsg < 0)) { + pr_err("TX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + pskb_put(skb, trailer, tailen); + + /* Allocate memory for the AEAD operation */ + ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); + goto exit; + } + + /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] + * In case we're in cluster-key mode, SALT is varied by xor-ing with + * the source address (or w0 of id), otherwise with the dest address + * if dest is known. + */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (__dnode) + salt ^= tipc_node_get_addr(__dnode); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_encrypt_done, skb); + tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; + tx_ctx->aead = aead; + tx_ctx->bearer = b; + memcpy(&tx_ctx->dst, dst, sizeof(*dst)); + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do encrypt */ + rc = crypto_aead_encrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = tx_ctx->bearer; + struct tipc_aead *aead = tx_ctx->aead; + struct tipc_crypto *tx = aead->crypto; + struct net *net = tx->net; + + switch (err) { + case 0: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); + if (likely(test_bit(0, &b->up))) + b->media->send_msg(net, skb, b, &tx_ctx->dst); + else + kfree_skb(skb); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); + kfree_skb(skb); + break; + } + + kfree(tx_ctx); + tipc_bearer_put(b); + tipc_aead_put(aead); +} + +/** + * tipc_aead_decrypt - Decrypt an encrypted message + * @net: struct net + * @aead: TIPC AEAD for the message decryption + * @skb: the input/output skb + * @b: TIPC bearer where the message has been received + * + * Return: + * 0 : if the decryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the decryption has failed + */ +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b) +{ + struct tipc_crypto_rx_ctx *rx_ctx; + struct aead_request *req; + struct crypto_aead *tfm; + struct sk_buff *unused; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + if (unlikely(!aead)) + return -ENOKEY; + + /* Cow skb data if needed */ + if (likely(!skb_cloned(skb) && + (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { + nsg = 1 + skb_shinfo(skb)->nr_frags; + } else { + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + /* Allocate memory for the AEAD operation */ + tfm = tipc_aead_tfm_next(aead); + ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); + goto exit; + } + + /* Reconstruct IV: */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (ehdr->destined) + salt ^= tipc_own_addr(net); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_decrypt_done, skb); + rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; + rx_ctx->aead = aead; + rx_ctx->bearer = b; + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do decrypt */ + rc = crypto_aead_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = rx_ctx->bearer; + struct tipc_aead *aead = rx_ctx->aead; + struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; + struct net *net = aead->crypto->net; + + switch (err) { + case 0: + this_cpu_inc(stats->stat[STAT_ASYNC_OK]); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); + break; + } + + kfree(rx_ctx); + tipc_crypto_rcv_complete(net, aead, b, &skb, err); + if (likely(skb)) { + if (likely(test_bit(0, &b->up))) + tipc_rcv(net, skb, b); + else + kfree_skb(skb); + } + + tipc_bearer_put(b); +} + +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) +{ + return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; +} + +/** + * tipc_ehdr_validate - Validate an encryption message + * @skb: the message buffer + * + * Returns "true" if this is a valid encryption message, otherwise "false" + */ +bool tipc_ehdr_validate(struct sk_buff *skb) +{ + struct tipc_ehdr *ehdr; + int ehsz; + + if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) + return false; + + ehdr = (struct tipc_ehdr *)skb->data; + if (unlikely(ehdr->version != TIPC_EVERSION)) + return false; + ehsz = tipc_ehdr_size(ehdr); + if (unlikely(!pskb_may_pull(skb, ehsz))) + return false; + if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) + return false; + if (unlikely(!ehdr->tx_key)) + return false; + + return true; +} + +/** + * tipc_ehdr_build - Build TIPC encryption message header + * @net: struct net + * @aead: TX AEAD key to be used for the message encryption + * @tx_key: key id used for the message encryption + * @skb: input/output message skb + * @__rx: RX crypto handle if dest is "known" + * + * Return: the header size if the building is successful, otherwise < 0 + */ +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_ehdr *ehdr; + u32 user = msg_user(hdr); + u64 seqno; + int ehsz; + + /* Make room for encryption header */ + ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; + WARN_ON(skb_headroom(skb) < ehsz); + ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); + + /* Obtain a seqno first: + * Use the key seqno (= cluster wise) if dest is unknown or we're in + * cluster key mode, otherwise it's better for a per-peer seqno! + */ + if (!__rx || aead->mode == CLUSTER_KEY) + seqno = atomic64_inc_return(&aead->seqno); + else + seqno = atomic64_inc_return(&__rx->sndnxt); + + /* Revoke the key if seqno is wrapped around */ + if (unlikely(!seqno)) + return tipc_crypto_key_revoke(net, tx_key); + + /* Word 1-2 */ + ehdr->seqno = cpu_to_be64(seqno); + + /* Words 0, 3- */ + ehdr->version = TIPC_EVERSION; + ehdr->user = 0; + ehdr->keepalive = 0; + ehdr->tx_key = tx_key; + ehdr->destined = (__rx) ? 1 : 0; + ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->reserved_1 = 0; + ehdr->reserved_2 = 0; + + switch (user) { + case LINK_CONFIG: + ehdr->user = LINK_CONFIG; + memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); + break; + default: + if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { + ehdr->user = LINK_PROTOCOL; + ehdr->keepalive = msg_is_keepalive(hdr); + } + ehdr->addr = hdr->hdr[3]; + break; + } + + return ehsz; +} + +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending) +{ +#ifdef TIPC_CRYPTO_DEBUG + struct tipc_key old = c->key; + char buf[32]; +#endif + + c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | + ((new_active & KEY_MASK) << (KEY_BITS)) | + ((new_pending & KEY_MASK)); + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("%s(%s): key changing %s ::%pS\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + tipc_key_change_dump(old, c->key, buf), + __builtin_return_address(0)); +#endif +} + +/** + * tipc_crypto_key_init - Initiate a new user / AEAD key + * @c: TIPC crypto to which new key is attached + * @ukey: the user key + * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * + * A new TIPC AEAD key will be allocated and initiated with the specified user + * key, then attached to the TIPC crypto. + * + * Return: new key id in case of success, otherwise: < 0 + */ +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_aead *aead = NULL; + int rc = 0; + + /* Initiate with the new user key */ + rc = tipc_aead_init(&aead, ukey, mode); + + /* Attach it to the crypto */ + if (likely(!rc)) { + rc = tipc_crypto_key_attach(c, aead, 0); + if (rc < 0) + tipc_aead_free(&aead->rcu); + } + + pr_info("%s(%s): key initiating, rc %d!\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + rc); + + return rc; +} + +/** + * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto + * @c: TIPC crypto to which the new AEAD key is attached + * @aead: the new AEAD key pointer + * @pos: desired slot in the crypto key array, = 0 if any! + * + * Return: new key id in case of success, otherwise: -EBUSY + */ +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos) +{ + u8 new_pending, new_passive, new_key; + struct tipc_key key; + int rc = -EBUSY; + + spin_lock_bh(&c->lock); + key = c->key; + if (key.active && key.passive) + goto exit; + if (key.passive && !tipc_aead_users(c->aead[key.passive])) + goto exit; + if (key.pending) { + if (pos) + goto exit; + if (tipc_aead_users(c->aead[key.pending]) > 0) + goto exit; + /* Replace it */ + new_pending = key.pending; + new_passive = key.passive; + new_key = new_pending; + } else { + if (pos) { + if (key.active && pos != key_next(key.active)) { + new_pending = key.pending; + new_passive = pos; + new_key = new_passive; + goto attach; + } else if (!key.active && !key.passive) { + new_pending = pos; + new_passive = key.passive; + new_key = new_pending; + goto attach; + } + } + new_pending = key_next(key.active ?: key.passive); + new_passive = key.passive; + new_key = new_pending; + } + +attach: + aead->crypto = c; + tipc_crypto_key_set_state(c, new_passive, key.active, new_pending); + tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); + + c->working = 1; + c->timer1 = jiffies; + c->timer2 = jiffies; + rc = new_key; + +exit: + spin_unlock_bh(&c->lock); + return rc; +} + +void tipc_crypto_key_flush(struct tipc_crypto *c) +{ + int k; + + spin_lock_bh(&c->lock); + c->working = 0; + tipc_crypto_key_set_state(c, 0, 0, 0); + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_crypto_key_detach(c->aead[k], &c->lock); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + spin_unlock_bh(&c->lock); +} + +/** + * tipc_crypto_key_try_align - Align RX keys if possible + * @rx: RX crypto handle + * @new_pending: new pending slot if aligned (= TX key from peer) + * + * Peer has used an unknown key slot, this only happens when peer has left and + * rejoned, or we are newcomer. + * That means, there must be no active key but a pending key at unaligned slot. + * If so, we try to move the pending key to the new slot. + * Note: A potential passive key can exist, it will be shifted correspondingly! + * + * Return: "true" if key is successfully aligned, otherwise "false" + */ +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) +{ + struct tipc_aead *tmp1, *tmp2 = NULL; + struct tipc_key key; + bool aligned = false; + u8 new_passive = 0; + int x; + + spin_lock(&rx->lock); + key = rx->key; + if (key.pending == new_pending) { + aligned = true; + goto exit; + } + if (key.active) + goto exit; + if (!key.pending) + goto exit; + if (tipc_aead_users(rx->aead[key.pending]) > 0) + goto exit; + + /* Try to "isolate" this pending key first */ + tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); + if (!refcount_dec_if_one(&tmp1->refcnt)) + goto exit; + rcu_assign_pointer(rx->aead[key.pending], NULL); + + /* Move passive key if any */ + if (key.passive) { + tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); + x = (key.passive - key.pending + new_pending) % KEY_MAX; + new_passive = (x <= 0) ? x + KEY_MAX : x; + } + + /* Re-allocate the key(s) */ + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + rcu_assign_pointer(rx->aead[new_pending], tmp1); + if (new_passive) + rcu_assign_pointer(rx->aead[new_passive], tmp2); + refcount_set(&tmp1->refcnt, 1); + aligned = true; + pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node)); + +exit: + spin_unlock(&rx->lock); + return aligned; +} + +/** + * tipc_crypto_key_pick_tx - Pick one TX key for message decryption + * @tx: TX crypto handle + * @rx: RX crypto handle (can be NULL) + * @skb: the message skb which will be decrypted later + * + * This function looks up the existing TX keys and pick one which is suitable + * for the message decryption, that must be a cluster key and not used before + * on the same message (i.e. recursive). + * + * Return: the TX AEAD key handle in case of success, otherwise NULL + */ +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); + struct tipc_aead *aead = NULL; + struct tipc_key key = tx->key; + u8 k, i = 0; + + /* Initialize data if not yet */ + if (!skb_cb->tx_clone_deferred) { + skb_cb->tx_clone_deferred = 1; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + } + + skb_cb->tx_clone_ctx.rx = rx; + if (++skb_cb->tx_clone_ctx.recurs > 2) + return NULL; + + /* Pick one TX key */ + spin_lock(&tx->lock); + do { + k = (i == 0) ? key.pending : + ((i == 1) ? key.active : key.passive); + if (!k) + continue; + aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); + if (!aead) + continue; + if (aead->mode != CLUSTER_KEY || + aead == skb_cb->tx_clone_ctx.last) { + aead = NULL; + continue; + } + /* Ok, found one cluster key */ + skb_cb->tx_clone_ctx.last = aead; + WARN_ON(skb->next); + skb->next = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb->next)) + pr_warn("Failed to clone skb for next round if any\n"); + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); + break; + } while (++i < 3); + spin_unlock(&tx->lock); + + return aead; +} + +/** + * tipc_crypto_key_synch: Synch own key data according to peer key status + * @rx: RX crypto handle + * @new_rx_active: latest RX active key from peer + * @hdr: TIPCv2 message + * + * This function updates the peer node related data as the peer RX active key + * has changed, so the number of TX keys' users on this node are increased and + * decreased correspondingly. + * + * The "per-peer" sndnxt is also reset when the peer key has switched. + */ +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr) +{ + struct net *net = rx->net; + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + u8 cur_rx_active; + + /* TX might be even not ready yet */ + if (unlikely(!tx->key.active && !tx->key.pending)) + return; + + cur_rx_active = atomic_read(&rx->peer_rx_active); + if (likely(cur_rx_active == new_rx_active)) + return; + + /* Make sure this message destined for this node */ + if (unlikely(msg_short(hdr) || + msg_destnode(hdr) != tipc_own_addr(net))) + return; + + /* Peer RX active key has changed, try to update owns' & TX users */ + if (atomic_cmpxchg(&rx->peer_rx_active, + cur_rx_active, + new_rx_active) == cur_rx_active) { + if (new_rx_active) + tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX); + if (cur_rx_active) + tipc_aead_users_dec(tx->aead[cur_rx_active], 0); + + atomic64_set(&rx->sndnxt, 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n", + tipc_own_id_string(net), cur_rx_active, + new_rx_active, tipc_node_get_id_str(rx->node)); +#endif + } +} + +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_key key; + + spin_lock(&tx->lock); + key = tx->key; + WARN_ON(!key.active || tx_key != key.active); + + /* Free the active key */ + tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + spin_unlock(&tx->lock); + + pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net)); + return -EKEYREVOKED; +} + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node) +{ + struct tipc_crypto *c; + + if (*crypto) + return -EEXIST; + + /* Allocate crypto */ + c = kzalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return -ENOMEM; + + /* Allocate statistic structure */ + c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); + if (!c->stats) { + kzfree(c); + return -ENOMEM; + } + + c->working = 0; + c->net = net; + c->node = node; + tipc_crypto_key_set_state(c, 0, 0, 0); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + c->timer1 = jiffies; + c->timer2 = jiffies; + spin_lock_init(&c->lock); + *crypto = c; + + return 0; +} + +void tipc_crypto_stop(struct tipc_crypto **crypto) +{ + struct tipc_crypto *c, *tx, *rx; + bool is_rx; + u8 k; + + if (!*crypto) + return; + + rcu_read_lock(); + /* RX stopping? => decrease TX key users if any */ + is_rx = !!((*crypto)->node); + if (is_rx) { + rx = *crypto; + tx = tipc_net(rx->net)->crypto_tx; + k = atomic_read(&rx->peer_rx_active); + if (k) { + tipc_aead_users_dec(tx->aead[k], 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + } + } + + /* Release AEAD keys */ + c = *crypto; + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_aead_put(rcu_dereference(c->aead[k])); + rcu_read_unlock(); + + pr_warn("%s(%s) has been purged, node left!\n", + (is_rx) ? "RX" : "TX", + (is_rx) ? tipc_node_get_id_str((*crypto)->node) : + tipc_own_id_string((*crypto)->net)); + + /* Free this crypto statistics */ + free_percpu(c->stats); + + *crypto = NULL; + kzfree(c); +} + +void tipc_crypto_timeout(struct tipc_crypto *rx) +{ + struct tipc_net *tn = tipc_net(rx->net); + struct tipc_crypto *tx = tn->crypto_tx; + struct tipc_key key; + u8 new_pending, new_passive; + int cmd; + + /* TX key activating: + * The pending key (users > 0) -> active + * The active key if any (users == 0) -> free + */ + spin_lock(&tx->lock); + key = tx->key; + if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) + goto s1; + if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) + goto s1; + if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM)) + goto s1; + + tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); + if (key.active) + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); + pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net), + key.pending); + +s1: + spin_unlock(&tx->lock); + + /* RX key activating: + * The pending key (users > 0) -> active + * The active key if any -> passive, freed later + */ + spin_lock(&rx->lock); + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) + goto s2; + + new_pending = (key.passive && + !tipc_aead_users(rx->aead[key.passive])) ? + key.passive : 0; + new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive); + tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending); + this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); + pr_info("RX(%s): key %d is activated!\n", + tipc_node_get_id_str(rx->node), key.pending); + goto s5; + +s2: + /* RX key "faulty" switching: + * The faulty pending key (users < -30) -> passive + * The passive key (users = 0) -> pending + * Note: This only happens after RX deactivated - s3! + */ + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30) + goto s3; + if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0) + goto s3; + + new_pending = key.passive; + new_passive = key.pending; + tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending); + goto s5; + +s3: + /* RX key deactivating: + * The passive key if any -> pending + * The active key -> passive (users = 0) / pending + * The pending key if any -> passive (users = 0) + */ + key = rx->key; + if (!key.active) + goto s4; + if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM)) + goto s4; + + new_pending = (key.passive) ?: key.active; + new_passive = (key.passive) ? key.active : key.pending; + tipc_aead_users_set(rx->aead[new_pending], 0); + if (new_passive) + tipc_aead_users_set(rx->aead[new_passive], 0); + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + pr_info("RX(%s): key %d is deactivated!\n", + tipc_node_get_id_str(rx->node), key.active); + goto s5; + +s4: + /* RX key passive -> freed: */ + key = rx->key; + if (!key.passive || !tipc_aead_users(rx->aead[key.passive])) + goto s5; + if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM)) + goto s5; + + tipc_crypto_key_set_state(rx, 0, key.active, key.pending); + tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); + pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node), + key.passive); + +s5: + spin_unlock(&rx->lock); + + /* Limit max_tfms & do debug commands if needed */ + if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) + return; + + cmd = sysctl_tipc_max_tfms; + sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; + tipc_crypto_do_cmd(rx->net, cmd); +} + +/** + * tipc_crypto_xmit - Build & encrypt TIPC message for xmit + * @net: struct net + * @skb: input/output message skb pointer + * @b: bearer used for xmit later + * @dst: destination media address + * @__dnode: destination node for reference if any + * + * First, build an encryption message header on the top of the message, then + * encrypt the original TIPC message by using the active or pending TX key. + * If the encryption is successful, the encrypted skb is returned directly or + * via the callback. + * Otherwise, the skb is freed! + * + * Return: + * 0 : the encryption has succeeded (or no encryption) + * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * -ENOKEK : the encryption has failed due to no key + * -EKEYREVOKED : the encryption has failed due to key revoked + * -ENOMEM : the encryption has failed due to no memory + * < 0 : the encryption has failed due to other reasons + */ +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats = tx->stats; + struct tipc_key key = tx->key; + struct tipc_aead *aead = NULL; + struct sk_buff *probe; + int rc = -ENOKEY; + u8 tx_key; + + /* No encryption? */ + if (!tx->working) + return 0; + + /* Try with the pending key if available and: + * 1) This is the only choice (i.e. no active key) or; + * 2) Peer has switched to this key (unicast only) or; + * 3) It is time to do a pending key probe; + */ + if (unlikely(key.pending)) { + tx_key = key.pending; + if (!key.active) + goto encrypt; + if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) + goto encrypt; + if (TIPC_SKB_CB(*skb)->probe) + goto encrypt; + if (!__rx && + time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) { + tx->timer2 = jiffies; + probe = skb_clone(*skb, GFP_ATOMIC); + if (probe) { + TIPC_SKB_CB(probe)->probe = 1; + tipc_crypto_xmit(net, &probe, b, dst, __dnode); + if (probe) + b->media->send_msg(net, probe, b, dst); + } + } + } + /* Else, use the active key if any */ + if (likely(key.active)) { + tx_key = key.active; + goto encrypt; + } + goto exit; + +encrypt: + aead = tipc_aead_get(tx->aead[tx_key]); + if (unlikely(!aead)) + goto exit; + rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); + if (likely(rc > 0)) + rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); + +exit: + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) + this_cpu_inc(stats->stat[STAT_NOKEYS]); + else if (rc == -EKEYREVOKED) + this_cpu_inc(stats->stat[STAT_BADKEYS]); + kfree_skb(*skb); + *skb = NULL; + break; + } + + tipc_aead_put(aead); + return rc; +} + +/** + * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer + * @net: struct net + * @rx: RX crypto handle + * @skb: input/output message skb pointer + * @b: bearer where the message has been received + * + * If the decryption is successful, the decrypted skb is returned directly or + * as the callback, the encryption header and auth tag will be trimed out + * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). + * Otherwise, the skb will be freed! + * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX + * cluster key(s) can be taken for decryption (- recursive). + * + * Return: + * 0 : the decryption has successfully completed + * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * -ENOKEY : the decryption has failed due to no key + * -EBADMSG : the decryption has failed due to bad message + * -ENOMEM : the decryption has failed due to no memory + * < 0 : the decryption has failed due to other reasons + */ +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats; + struct tipc_aead *aead = NULL; + struct tipc_key key; + int rc = -ENOKEY; + u8 tx_key = 0; + + /* New peer? + * Let's try with TX key (i.e. cluster mode) & verify the skb first! + */ + if (unlikely(!rx)) + goto pick_tx; + + /* Pick RX key according to TX key, three cases are possible: + * 1) The current active key (likely) or; + * 2) The pending (new or deactivated) key (if any) or; + * 3) The passive or old active key (i.e. users > 0); + */ + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; + key = rx->key; + if (likely(tx_key == key.active)) + goto decrypt; + if (tx_key == key.pending) + goto decrypt; + if (tx_key == key.passive) { + rx->timer2 = jiffies; + if (tipc_aead_users(rx->aead[key.passive]) > 0) + goto decrypt; + } + + /* Unknown key, let's try to align RX key(s) */ + if (tipc_crypto_key_try_align(rx, tx_key)) + goto decrypt; + +pick_tx: + /* No key suitable? Try to pick one from TX... */ + aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + if (aead) + goto decrypt; + goto exit; + +decrypt: + rcu_read_lock(); + if (!aead) + aead = tipc_aead_get(rx->aead[tx_key]); + rc = tipc_aead_decrypt(net, aead, *skb, b); + rcu_read_unlock(); + +exit: + stats = ((rx) ?: tx)->stats; + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) { + kfree_skb(*skb); + *skb = NULL; + if (rx) + tipc_node_put(rx->node); + this_cpu_inc(stats->stat[STAT_NOKEYS]); + return rc; + } else if (rc == -EBADMSG) { + this_cpu_inc(stats->stat[STAT_BADMSGS]); + } + break; + } + + tipc_crypto_rcv_complete(net, aead, b, skb, rc); + return rc; +} + +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); + struct tipc_crypto *rx = aead->crypto; + struct tipc_aead *tmp = NULL; + struct tipc_ehdr *ehdr; + struct tipc_node *n; + u8 rx_key_active; + bool destined; + + /* Is this completed by TX? */ + if (unlikely(!rx->node)) { + rx = skb_cb->tx_clone_ctx.rx; +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", + (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, + (*skb)->next, skb_cb->flags); + pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", + skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, + aead->crypto->aead[1], aead->crypto->aead[2], + aead->crypto->aead[3]); +#endif + if (unlikely(err)) { + if (err == -EBADMSG && (*skb)->next) + tipc_rcv(net, (*skb)->next, b); + goto free_skb; + } + + if (likely((*skb)->next)) { + kfree_skb((*skb)->next); + (*skb)->next = NULL; + } + ehdr = (struct tipc_ehdr *)(*skb)->data; + if (!rx) { + WARN_ON(ehdr->user != LINK_CONFIG); + n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, + true); + rx = tipc_node_crypto_rx(n); + if (unlikely(!rx)) + goto free_skb; + } + + /* Skip cloning this time as we had a RX pending key */ + if (rx->key.pending) + goto rcv; + if (tipc_aead_clone(&tmp, aead) < 0) + goto rcv; + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + tipc_aead_free(&tmp->rcu); + goto rcv; + } + tipc_aead_put(aead); + aead = tipc_aead_get(tmp); + } + + if (unlikely(err)) { + tipc_aead_users_dec(aead, INT_MIN); + goto free_skb; + } + + /* Set the RX key's user */ + tipc_aead_users_set(aead, 1); + +rcv: + /* Mark this point, RX works */ + rx->timer1 = jiffies; + + /* Remove ehdr & auth. tag prior to tipc_rcv() */ + ehdr = (struct tipc_ehdr *)(*skb)->data; + destined = ehdr->destined; + rx_key_active = ehdr->rx_key_active; + skb_pull(*skb, tipc_ehdr_size(ehdr)); + pskb_trim(*skb, (*skb)->len - aead->authsize); + + /* Validate TIPCv2 message */ + if (unlikely(!tipc_msg_validate(skb))) { + pr_err_ratelimited("Packet dropped after decryption!\n"); + goto free_skb; + } + + /* Update peer RX active key & TX users */ + if (destined) + tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb)); + + /* Mark skb decrypted */ + skb_cb->decrypted = 1; + + /* Clear clone cxt if any */ + if (likely(!skb_cb->tx_clone_deferred)) + goto exit; + skb_cb->tx_clone_deferred = 0; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + goto exit; + +free_skb: + kfree_skb(*skb); + *skb = NULL; + +exit: + tipc_aead_put(aead); + if (rx) + tipc_node_put(rx->node); +} + +static void tipc_crypto_do_cmd(struct net *net, int cmd) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_crypto *tx = tn->crypto_tx, *rx; + struct list_head *p; + unsigned int stat; + int i, j, cpu; + char buf[200]; + + /* Currently only one command is supported */ + switch (cmd) { + case 0xfff1: + goto print_stats; + default: + return; + } + +print_stats: + /* Print a header */ + pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); + + /* Print key status */ + pr_info("Key status:\n"); + pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), + tipc_crypto_key_dump(tx, buf)); + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), + tipc_crypto_key_dump(rx, buf)); + } + rcu_read_unlock(); + + /* Print crypto statistics */ + for (i = 0, j = 0; i < MAX_STATS; i++) + j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); + pr_info("\nCounter %s", buf); + + memset(buf, '-', 115); + buf[115] = '\0'; + pr_info("%s\n", buf); + + j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + j = scnprintf(buf, 200, "RX(%7.7s) ", + tipc_node_get_id_str(rx->node)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", + stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + } + rcu_read_unlock(); + + pr_info("\n======================== Done ========================\n"); +} + +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) +{ + struct tipc_key key = c->key; + struct tipc_aead *aead; + int k, i = 0; + char *s; + + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); + + rcu_read_lock(); + aead = rcu_dereference(c->aead[k]); + if (aead) + i += scnprintf(buf + i, 200 - i, + "{\"%s...\", \"%s\"}/%d:%d", + aead->hint, + (aead->mode == CLUSTER_KEY) ? "c" : "p", + atomic_read(&aead->users), + refcount_read(&aead->refcnt)); + rcu_read_unlock(); + i += scnprintf(buf + i, 200 - i, "\n"); + } + + if (c->node) + i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", + atomic_read(&c->peer_rx_active)); + + return buf; +} + +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf) +{ + struct tipc_key *key = &old; + int k, i = 0; + char *s; + + /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ +again: + i += scnprintf(buf + i, 32 - i, "["); + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key->passive) + s = "pas"; + else if (k == key->active) + s = "act"; + else if (k == key->pending) + s = "pen"; + else + s = "-"; + i += scnprintf(buf + i, 32 - i, + (k != KEY_MAX) ? "%s " : "%s", s); + } + if (key != &new) { + i += scnprintf(buf + i, 32 - i, "] -> "); + key = &new; + goto again; + } + i += scnprintf(buf + i, 32 - i, "]"); + return buf; +} +#endif diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h new file mode 100644 index 000000000000..c3de769f49e8 --- /dev/null +++ b/net/tipc/crypto.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * net/tipc/crypto.h: Include file for TIPC crypto + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifdef CONFIG_TIPC_CRYPTO +#ifndef _TIPC_CRYPTO_H +#define _TIPC_CRYPTO_H + +#include "core.h" +#include "node.h" +#include "msg.h" +#include "bearer.h" + +#define TIPC_EVERSION 7 + +/* AEAD aes(gcm) */ +#define TIPC_AES_GCM_KEY_SIZE_128 16 +#define TIPC_AES_GCM_KEY_SIZE_192 24 +#define TIPC_AES_GCM_KEY_SIZE_256 32 + +#define TIPC_AES_GCM_SALT_SIZE 4 +#define TIPC_AES_GCM_IV_SIZE 12 +#define TIPC_AES_GCM_TAG_SIZE 16 + +/** + * TIPC crypto modes: + * - CLUSTER_KEY: + * One single key is used for both TX & RX in all nodes in the cluster. + * - PER_NODE_KEY: + * Each nodes in the cluster has one TX key, for RX a node needs to know + * its peers' TX key for the decryption of messages from those nodes. + */ +enum { + CLUSTER_KEY = 1, + PER_NODE_KEY = (1 << 1), +}; + +extern int sysctl_tipc_max_tfms __read_mostly; + +/** + * TIPC encryption message format: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 + * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w1:| Seqno | + * w2:| (8 octets) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w3:\ Prevnode \ + * / (4 or 16 octets) / + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Encrypted complete TIPC V2 header and user data / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * | AuthTag | + * | (16 octets) | + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Word0: + * Ver : = 7 i.e. TIPC encryption message version + * User : = 7 (for LINK_PROTOCOL); = 13 (for LINK_CONFIG) or = 0 + * D : The destined bit i.e. the message's destination node is + * "known" or not at the message encryption + * TX : TX key used for the message encryption + * RX : Currently RX active key corresponding to the destination + * node's TX key (when the "D" bit is set) + * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * Rsvd : Reserved bit, field + * Word1-2: + * Seqno : The 64-bit sequence number of the encrypted message, also + * part of the nonce used for the message encryption/decryption + * Word3-: + * Prevnode: The source node address, or ID in case LINK_CONFIG only + * AuthTag : The authentication tag for the message integrity checking + * generated by the message encryption + */ +struct tipc_ehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 destined:1, + user:4, + version:3; + __u8 reserved_1:3, + keepalive:1, + rx_key_active:2, + tx_key:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:3, + user:4, + destined:1; + __u8 tx_key:2, + rx_key_active:2, + keepalive:1, + reserved_1:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __be16 reserved_2; + } __packed; + __be32 w0; + }; + __be64 seqno; + union { + __be32 addr; + __u8 id[NODE_ID_LEN]; /* For a LINK_CONFIG message only! */ + }; +#define EHDR_SIZE (offsetof(struct tipc_ehdr, addr) + sizeof(__be32)) +#define EHDR_CFG_SIZE (sizeof(struct tipc_ehdr)) +#define EHDR_MIN_SIZE (EHDR_SIZE) +#define EHDR_MAX_SIZE (EHDR_CFG_SIZE) +#define EMSG_OVERHEAD (EHDR_SIZE + TIPC_AES_GCM_TAG_SIZE) +} __packed; + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node); +void tipc_crypto_stop(struct tipc_crypto **crypto); +void tipc_crypto_timeout(struct tipc_crypto *rx); +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode); +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b); +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode); +void tipc_crypto_key_flush(struct tipc_crypto *c); +int tipc_aead_key_validate(struct tipc_aead_key *ukey); +bool tipc_ehdr_validate(struct sk_buff *skb); + +#endif /* _TIPC_CRYPTO_H */ +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..b043e8c6397a 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -242,7 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, - &maddr, &respond, &dupl_addr); + msg_peer_net_hash(hdr), &maddr, &respond, + &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) diff --git a/net/tipc/link.c b/net/tipc/link.c index 999eab592de8..fb72031228c9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "monitor.h" #include "trace.h" +#include "crypto.h" #include <linux/pkt_sched.h> @@ -397,6 +398,15 @@ int tipc_link_mtu(struct tipc_link *l) return l->mtu; } +int tipc_link_mss(struct tipc_link *l) +{ +#ifdef CONFIG_TIPC_CRYPTO + return l->mtu - INT_H_SIZE - EMSG_OVERHEAD; +#else + return l->mtu - INT_H_SIZE; +#endif +} + u16 tipc_link_rcv_nxt(struct tipc_link *l) { return l->rcv_nxt; @@ -940,16 +950,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); - unsigned int maxwin = l->window; - int imp = msg_importance(hdr); - unsigned int mtu = l->mtu; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff *skb, *_skb; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - struct sk_buff_head *transmq = &l->transmq; - struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, **tskb; int pkt_cnt = skb_queue_len(list); + int imp = msg_importance(hdr); + unsigned int mss = tipc_link_mss(l); + unsigned int maxwin = l->window; + unsigned int mtu = l->mtu; + bool new_bundle; int rc = 0; if (unlikely(msg_size(hdr) > mtu)) { @@ -975,20 +987,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, } /* Prepare each packet for sending, and add to relevant queue: */ - while (skb_queue_len(list)) { - skb = skb_peek(list); - hdr = buf_msg(skb); - msg_set_seqno(hdr, seqno); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - + while ((skb = __skb_dequeue(list))) { if (likely(skb_queue_len(transmq) < maxwin)) { + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { + kfree_skb(skb); __skb_queue_purge(list); return -ENOBUFS; } - __skb_dequeue(list); __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) @@ -1000,22 +1010,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - tskb = &l->backlog[imp].target_bskb; - if (tipc_msg_bundle(*tskb, hdr, mtu)) { - kfree_skb(__skb_dequeue(list)); - l->stats.sent_bundled++; - continue; - } - if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) { - kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, *tskb); - l->backlog[imp].len++; - l->stats.sent_bundled++; - l->stats.sent_bundles++; + if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb, + mss, l->addr, &new_bundle)) { + if (skb) { + /* Keep a ref. to the skb for next try */ + l->backlog[imp].target_bskb = skb; + l->backlog[imp].len++; + __skb_queue_tail(backlogq, skb); + } else { + if (new_bundle) { + l->stats.sent_bundles++; + l->stats.sent_bundled++; + } + l->stats.sent_bundled++; + } continue; } l->backlog[imp].target_bskb = NULL; - l->backlog[imp].len += skb_queue_len(list); + l->backlog[imp].len += (1 + skb_queue_len(list)); + __skb_queue_tail(backlogq, skb); skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; @@ -1084,7 +1097,7 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, return false; if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + - msecs_to_jiffies(r->tolerance))) + msecs_to_jiffies(r->tolerance * 10))) return false; hdr = buf_msg(skb); @@ -1151,7 +1164,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1427,8 +1440,7 @@ next_gap_ack: if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, - GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); @@ -1728,21 +1740,6 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; __skb_queue_head_init(&tnlq); - __skb_queue_head_init(&tmpxq); - __skb_queue_head_init(&frags); - - /* At least one packet required for safe algorithm => add dummy */ - skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), - 0, 0, TIPC_ERR_NO_PORT); - if (!skb) { - pr_warn("%sunable to create tunnel packet\n", link_co_err); - return; - } - __skb_queue_tail(&tnlq, skb); - tipc_link_xmit(l, &tnlq, &tmpxq); - __skb_queue_purge(&tmpxq); - /* Link Synching: * From now on, send only one single ("dummy") SYNCH message * to peer. The SYNCH message does not contain any data, just @@ -1768,6 +1765,20 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; } + __skb_queue_head_init(&tmpxq); + __skb_queue_head_init(&frags); + /* At least one packet required for safe algorithm => add dummy */ + skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), + 0, 0, TIPC_ERR_NO_PORT); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, &tmpxq); + __skb_queue_purge(&tmpxq); + /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); @@ -1873,7 +1884,7 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, tipc_link_create_dummy_tnl_msg(tnl, xmitq); - /* This failover link enpoint was never established before, + /* This failover link endpoint was never established before, * so it has not received anything from peer. * Otherwise, it must be a normal failover situation or the * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes diff --git a/net/tipc/link.h b/net/tipc/link.h index adcad65e761c..c09e9d49d0a3 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -141,6 +141,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); +int tipc_link_mss(struct tipc_link *l); void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 922d262e153f..0d515d20b056 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -39,10 +39,16 @@ #include "msg.h" #include "addr.h" #include "name_table.h" +#include "crypto.h" #define MAX_FORWARD_SIZE 1024 +#ifdef CONFIG_TIPC_CRYPTO +#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) +#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_TAILROOM 16 +#endif static unsigned int align(unsigned int i) { @@ -61,7 +67,11 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; +#ifdef CONFIG_TIPC_CRYPTO + unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; +#else unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; +#endif skb = alloc_skb_fclone(buf_size, gfp); if (skb) { @@ -173,7 +183,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { - TIPC_SKB_CB(head)->validated = false; + TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; @@ -190,6 +200,59 @@ err: return 0; } +/** + * tipc_msg_append(): Append data to tail of an existing buffer queue + * @hdr: header to be used + * @m: the data to be appended + * @mss: max allowable size of buffer + * @dlen: size of data to be appended + * @txq: queue to appand to + * Returns the number og 1k blocks appended or errno value + */ +int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq) +{ + struct sk_buff *skb, *prev; + int accounted, total, curr; + int mlen, cpy, rem = dlen; + struct tipc_msg *hdr; + + skb = skb_peek_tail(txq); + accounted = skb ? msg_blocks(buf_msg(skb)) : 0; + total = accounted; + + while (rem) { + if (!skb || skb->len >= mss) { + prev = skb; + skb = tipc_buf_acquire(mss, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + skb_orphan(skb); + skb_trim(skb, MIN_H_SIZE); + hdr = buf_msg(skb); + skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); + msg_set_hdr_sz(hdr, MIN_H_SIZE); + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; + if (prev) + msg_set_ack_required(buf_msg(prev), 0); + msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); + cpy = min_t(int, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; + } + return total - accounted; +} + /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least @@ -218,6 +281,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; @@ -239,7 +303,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(skb->len < msz)) return false; - TIPC_SKB_CB(skb)->validated = true; + TIPC_SKB_CB(skb)->validated = 1; return true; } @@ -419,48 +483,98 @@ error: } /** - * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @skb: the buffer to append to ("bundle") - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer - * Consumes buffer if successful - * Returns true if bundling could be performed, otherwise false + * tipc_msg_bundle - Append contents of a buffer to tail of an existing one + * @bskb: the bundle buffer to append to + * @msg: message to be appended + * @max: max allowable size for the bundle buffer + * + * Returns "true" if bundling has been performed, otherwise "false" */ -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) +static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, + u32 max) { - struct tipc_msg *bmsg; - unsigned int bsz; - unsigned int msz = msg_size(msg); - u32 start, pad; - u32 max = mtu - INT_H_SIZE; + struct tipc_msg *bmsg = buf_msg(bskb); + u32 msz, bsz, offset, pad; - if (likely(msg_user(msg) == MSG_FRAGMENTER)) - return false; - if (!skb) - return false; - bmsg = buf_msg(skb); + msz = msg_size(msg); bsz = msg_size(bmsg); - start = align(bsz); - pad = start - bsz; + offset = align(bsz); + pad = offset - bsz; - if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) + if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; - if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + if (unlikely(max < (offset + msz))) return false; - if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) + + skb_put(bskb, pad + msz); + skb_copy_to_linear_data_offset(bskb, offset, msg, msz); + msg_set_size(bmsg, offset + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + return true; +} + +/** + * tipc_msg_try_bundle - Try to bundle a new message to the last one + * @tskb: the last/target message to which the new one will be appended + * @skb: the new message skb pointer + * @mss: max message size (header inclusive) + * @dnode: destination node for the message + * @new_bundle: if this call made a new bundle or not + * + * Return: "true" if the new message skb is potential for bundling this time or + * later, in the case a bundling has been done this time, the skb is consumed + * (the skb pointer = NULL). + * Otherwise, "false" if the skb cannot be bundled at all. + */ +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle) +{ + struct tipc_msg *msg, *inner, *outer; + u32 tsz; + + /* First, check if the new buffer is suitable for bundling */ + msg = buf_msg(*skb); + if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (unlikely(skb_tailroom(skb) < (pad + msz))) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; - if (unlikely(max < (start + msz))) + if (msg_user(msg) == BCAST_PROTOCOL) return false; - if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && - (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + if (mss <= INT_H_SIZE + msg_size(msg)) return false; - skb_put(skb, pad + msz); - skb_copy_to_linear_data_offset(skb, start, msg, msz); - msg_set_size(bmsg, start + msz); - msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + /* Ok, but the last/target buffer can be empty? */ + if (unlikely(!tskb)) + return true; + + /* Is it a bundle already? Try to bundle the new message to it */ + if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { + *new_bundle = false; + goto bundle; + } + + /* Make a new bundle of the two messages if possible */ + tsz = msg_size(buf_msg(tskb)); + if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + return true; + if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, + GFP_ATOMIC))) + return true; + inner = buf_msg(tskb); + skb_push(tskb, INT_H_SIZE); + outer = buf_msg(tskb); + tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, + dnode); + msg_set_importance(outer, msg_importance(inner)); + msg_set_size(outer, INT_H_SIZE + tsz); + msg_set_msgcnt(outer, 1); + *new_bundle = true; + +bundle: + if (likely(tipc_msg_bundle(tskb, msg, mss))) { + consume_skb(*skb); + *skb = NULL; + } return true; } @@ -510,49 +624,6 @@ none: } /** - * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain, where head is the buffer to replace/append - * @skb: buffer to be created, appended to and returned in case of success - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer, inclusive header - * @dnode: destination node for message. (Not always present in header) - * Returns true if success, otherwise false - */ -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode) -{ - struct sk_buff *_skb; - struct tipc_msg *bmsg; - u32 msz = msg_size(msg); - u32 max = mtu - INT_H_SIZE; - - if (msg_user(msg) == MSG_FRAGMENTER) - return false; - if (msg_user(msg) == TUNNEL_PROTOCOL) - return false; - if (msg_user(msg) == BCAST_PROTOCOL) - return false; - if (msz > (max / 2)) - return false; - - _skb = tipc_buf_acquire(max, GFP_ATOMIC); - if (!_skb) - return false; - - skb_trim(_skb, INT_H_SIZE); - bmsg = buf_msg(_skb); - tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, - INT_H_SIZE, dnode); - msg_set_importance(bmsg, msg_importance(msg)); - msg_set_seqno(bmsg, msg_seqno(msg)); - msg_set_ack(bmsg, msg_ack(msg)); - msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(_skb, msg, mtu); - *skb = _skb; - return true; -} - -/** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0daa6f04ca81..6d466ebdb64f 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -102,16 +102,42 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { - struct sk_buff *tail; - unsigned long nxt_retr; - unsigned long retr_stamp; - u32 bytes_read; - u32 orig_member; - u16 chain_imp; - u16 ackers; - u16 retr_cnt; - bool validated; -}; + union { + struct { + struct sk_buff *tail; + unsigned long nxt_retr; + unsigned long retr_stamp; + u32 bytes_read; + u32 orig_member; + u16 chain_imp; + u16 ackers; + u16 retr_cnt; + } __packed; +#ifdef CONFIG_TIPC_CRYPTO + struct { + struct tipc_crypto *rx; + struct tipc_aead *last; + u8 recurs; + } tx_clone_ctx __packed; +#endif + } __packed; + union { + struct { + u8 validated:1; +#ifdef CONFIG_TIPC_CRYPTO + u8 encrypted:1; + u8 decrypted:1; + u8 probe:1; + u8 tx_clone_deferred:1; +#endif + }; + u8 flags; + }; + u8 reserved; +#ifdef CONFIG_TIPC_CRYPTO + void *crypto_ctx; +#endif +} __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) @@ -290,6 +316,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline int msg_ack_required(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); @@ -1026,6 +1062,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); @@ -1057,14 +1107,15 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode); +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); +int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 836e629e8f4a..5feaf3b67380 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index d6165ad384c0..e53231bd23b4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -102,7 +102,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_ID] = { .type = NLA_BINARY, + .len = TIPC_NODEID_LEN}, + [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, + .len = TIPC_AEAD_KEY_SIZE_MAX}, }; /* Properties valid for media, bearer and link */ @@ -176,7 +180,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_PUBL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_publ_dump, }, { @@ -239,7 +244,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_MON_PEER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_node_dump_monitor_peer, }, { @@ -250,10 +256,23 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #ifdef CONFIG_TIPC_MEDIA_UDP { .cmd = TIPC_NL_UDP_GET_REMOTEIP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_udp_nl_dump_remoteip, }, #endif +#ifdef CONFIG_TIPC_CRYPTO + { + .cmd = TIPC_NL_KEY_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_set_key, + }, + { + .cmd = TIPC_NL_KEY_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_flush_key, + }, +#endif }; struct genl_family tipc_genl_family __ro_after_init = { @@ -268,18 +287,6 @@ struct genl_family tipc_genl_family __ro_after_init = { .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), }; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) -{ - u32 maxattr = tipc_genl_family.maxattr; - - *attr = genl_family_attrbuf(&tipc_genl_family); - if (!*attr) - return -EOPNOTSUPP; - - return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr, - tipc_nl_policy, NULL); -} - int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 4ba0ad422110..7cf777723e3e 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -38,7 +38,6 @@ #include <net/netlink.h> extern struct genl_family tipc_genl_family; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); struct tipc_nl_msg { struct sk_buff *skb; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e135d4e11231..17a529739f8d 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -181,15 +181,18 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) { + struct genl_dumpit_info info; int len = 0; int err; struct sk_buff *buf; struct nlmsghdr *nlmsg; struct netlink_callback cb; + struct nlattr **attrbuf; memset(&cb, 0, sizeof(cb)); cb.nlh = (struct nlmsghdr *)arg->data; cb.skb = arg; + cb.data = &info; buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!buf) @@ -201,19 +204,35 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) { + err = -ENOMEM; + goto err_out; + } + + info.attrs = attrbuf; + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + do { int rem; len = (*cmd->dumpit)(buf, &cb); nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) { - struct nlattr **attrs; - - err = tipc_nlmsg_parse(nlmsg, &attrs); + err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN, + attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, + NULL); if (err) goto err_out; - err = (*cmd->format)(msg, attrs); + err = (*cmd->format)(msg, attrbuf); if (err) goto err_out; @@ -231,6 +250,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, err = 0; err_out: + kfree(attrbuf); tipc_dump_done(&cb); kfree_skb(buf); diff --git a/net/tipc/node.c b/net/tipc/node.c index c8f6177dd5a2..aaf595613e6e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -44,6 +44,7 @@ #include "discover.h" #include "netlink.h" #include "trace.h" +#include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 @@ -89,6 +90,7 @@ struct tipc_bclink_entry { * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node + * @preliminary: a preliminary node or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -99,6 +101,7 @@ struct tipc_bclink_entry { * @publ_list: list of publications * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; @@ -112,6 +115,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; @@ -120,12 +124,18 @@ struct tipc_node { u32 signature; u32 link_id; u8 peer_id[16]; + char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_crypto *crypto_rx; +#endif }; /* Node FSM states and events: @@ -163,7 +173,6 @@ static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); -static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); @@ -184,7 +193,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +203,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -235,15 +252,51 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } +u32 tipc_node_get_addr(struct tipc_node *node) +{ + return (node) ? node->addr : 0; +} + +char *tipc_node_get_id_str(struct tipc_node *node) +{ + return node->peer_id_string; +} + +#ifdef CONFIG_TIPC_CRYPTO +/** + * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * Note: node ref counter must be held first! + */ +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) +{ + return (__n) ? __n->crypto_rx : NULL; +} + +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) +{ + return container_of(pos, struct tipc_node, list)->crypto_rx; +} +#endif + +void tipc_node_free(struct rcu_head *rp) +{ + struct tipc_node *n = container_of(rp, struct tipc_node, rcu); + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&n->crypto_rx); +#endif + kfree(n); +} + static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); - kfree_rcu(n, rcu); + call_rcu(&n->rcu, tipc_node_free); } -static void tipc_node_put(struct tipc_node *node) +void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } @@ -264,7 +317,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { - if (node->addr != addr) + if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; @@ -360,18 +413,71 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; struct tipc_link *l; + unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); - n = tipc_node_find(net, addr); + n = tipc_node_find(net, addr) ?: + tipc_node_find_by_id(net, peer_id); if (n) { + if (!n->preliminary) + goto update; + if (preliminary) + goto exit; + /* A preliminary node becomes "real" now, refresh its data */ + tipc_node_write_lock(n); + n->preliminary = false; + n->addr = addr; + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, + &tn->node_htable[tipc_hashfn(addr)]); + list_del_rcu(&n->list); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); + tipc_node_write_unlock_fast(n); + +update: + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +495,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -396,9 +503,23 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, pr_warn("Node creation failed, no memory\n"); goto exit; } + tipc_nodeid2string(n->peer_id_string, peer_id); +#ifdef CONFIG_TIPC_CRYPTO + if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { + pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); + kfree(n); + n = NULL; + goto exit; + } +#endif n->addr = addr; + n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -417,22 +538,14 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, - tipc_link_window(tipc_bc_sndlink(net)), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, - tipc_bc_sndlink(net), - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no memory\n"); - kfree(n); - n = NULL; - goto exit; - } + n->bc_entry.link = NULL; tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); - n->keepalive_intv = U32_MAX; + /* Start a slow timer anyway, crypto needs it */ + n->keepalive_intv = 10000; + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) + tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) @@ -617,6 +730,11 @@ static bool tipc_node_cleanup(struct tipc_node *peer) } tipc_node_write_unlock(peer); + if (!deleted) { + spin_unlock_bh(&tn->node_list_lock); + return deleted; + } + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { @@ -645,6 +763,10 @@ static void tipc_node_timeout(struct timer_list *t) return; } +#ifdef CONFIG_TIPC_CRYPTO + /* Take any crypto key related actions first */ + tipc_crypto_timeout(n->crypto_rx); +#endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be @@ -665,7 +787,7 @@ static void tipc_node_timeout(struct timer_list *t) remains--; } tipc_node_read_unlock(n); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } @@ -697,7 +819,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE; + n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); @@ -751,7 +873,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } @@ -906,7 +1028,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } @@ -950,6 +1072,8 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; + bool preliminary; + u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); @@ -965,9 +1089,11 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { - addr = n->addr; + sugg_addr = n->addr; + preliminary = n->preliminary; tipc_node_put(n); - return addr; + if (!preliminary) + return sugg_addr; } /* Even this node may be in conflict */ @@ -979,12 +1105,12 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l; + struct tipc_link *l, *snd_l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -998,11 +1124,27 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, + false); if (!n) return; tipc_node_write_lock(n); + if (unlikely(!n->bc_entry.link)) { + snd_l = tipc_bc_sndlink(net); + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, + tipc_link_window(snd_l), + n->capabilities, + &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, + &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no mem\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + return; + } + } le = &n->links[b->identity]; @@ -1017,6 +1159,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (sign_match && addr_match && link_up) { /* All is fine. Do nothing. */ reset = false; + /* Peer node is not a container/local namespace */ + if (!n->peer_hash_mix) + n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -1342,7 +1487,8 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; - + n->peer_net = NULL; + n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1570,56 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,6 +1635,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; int bearer_id; int rc; @@ -1456,6 +1653,17 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, } tipc_node_read_lock(n); + node_up = node_is_up(n); + if (node_up && n->peer_net && check_net(n->peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(n->peer_net, list); + if (likely(skb_queue_empty(list))) { + tipc_node_read_unlock(n); + tipc_node_put(n); + return 0; + } + } + bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); @@ -1474,7 +1682,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); @@ -1622,7 +1830,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id } if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); @@ -1800,20 +2008,38 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; - struct tipc_node *n; + struct tipc_link_entry *le; struct tipc_msg *hdr; + struct tipc_node *n; int bearer_id = b->identity; - struct tipc_link_entry *le; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_ehdr *ehdr; - __skb_queue_head_init(&xmitq); + /* Check if message must be decrypted first */ + if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) + goto rcv; + + ehdr = (struct tipc_ehdr *)skb->data; + if (likely(ehdr->user != LINK_CONFIG)) { + n = tipc_node_find(net, ntohl(ehdr->addr)); + if (unlikely(!n)) + goto discard; + } else { + n = tipc_node_find_by_id(net, ehdr->id); + } + tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); + if (!skb) + return; +rcv: +#endif /* Ensure message is well-formed before touching the header */ - TIPC_SKB_CB(skb)->validated = false; if (unlikely(!tipc_msg_validate(&skb))) goto discard; + __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); @@ -1884,7 +2110,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); discard: @@ -1915,7 +2141,7 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, tipc_link_set_mtu(e->link, b->mtu); } tipc_node_write_unlock(n); - tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); @@ -1926,7 +2152,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; - struct tipc_node *peer; + struct tipc_node *peer, *temp_node; u32 addr; int err; @@ -1967,6 +2193,11 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) tipc_node_write_unlock(peer); tipc_node_delete(peer); + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } err = 0; err_out: tipc_node_put(peer); @@ -2011,6 +2242,8 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry_rcu(node, &tn->node_list, list) { + if (node->preliminary) + continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; @@ -2150,7 +2383,8 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, + NULL); return res; } @@ -2484,13 +2718,9 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_MON]) return -EINVAL; @@ -2530,11 +2760,141 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, return skb->len; } -u32 tipc_node_get_addr(struct tipc_node *node) +#ifdef CONFIG_TIPC_CRYPTO +static int tipc_nl_retrieve_key(struct nlattr **attrs, + struct tipc_aead_key **key) { - return (node) ? node->addr : 0; + struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; + + if (!attr) + return -ENODATA; + + *key = (struct tipc_aead_key *)nla_data(attr); + if (nla_len(attr) < tipc_aead_key_size(*key)) + return -EINVAL; + + return 0; +} + +static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; + + if (!attr) + return -ENODATA; + + if (nla_len(attr) < TIPC_NODEID_LEN) + return -EINVAL; + + *node_id = (u8 *)nla_data(attr); + return 0; } +int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n = NULL; + struct tipc_aead_key *ukey; + struct tipc_crypto *c; + u8 *id, *own_id; + int rc = 0; + + if (!info->attrs[TIPC_NLA_NODE]) + return -EINVAL; + + rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, + info->attrs[TIPC_NLA_NODE], + tipc_nl_node_policy, info->extack); + if (rc) + goto exit; + + own_id = tipc_own_id(net); + if (!own_id) { + rc = -EPERM; + goto exit; + } + + rc = tipc_nl_retrieve_key(attrs, &ukey); + if (rc) + goto exit; + + rc = tipc_aead_key_validate(ukey); + if (rc) + goto exit; + + rc = tipc_nl_retrieve_nodeid(attrs, &id); + switch (rc) { + case -ENODATA: + /* Cluster key mode */ + rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY); + break; + case 0: + /* Per-node key mode */ + if (!memcmp(id, own_id, NODE_ID_LEN)) { + c = tn->crypto_tx; + } else { + n = tipc_node_find_by_id(net, id) ?: + tipc_node_create(net, 0, id, 0xffffu, 0, true); + if (unlikely(!n)) { + rc = -ENOMEM; + break; + } + c = n->crypto_rx; + } + + rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY); + if (n) + tipc_node_put(n); + break; + default: + break; + } + +exit: + return (rc < 0) ? rc : 0; +} + +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_set_key(skb, info); + rtnl_unlock(); + + return err; +} + +int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + tipc_crypto_key_flush(tn->crypto_tx); + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) + tipc_crypto_key_flush(n->crypto_rx); + rcu_read_unlock(); + + pr_info("All keys are flushed!\n"); + return 0; +} + +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_flush_key(skb, info); + rtnl_unlock(); + + return err; +} +#endif + /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped @@ -2591,3 +2951,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 291d0ecd4101..a6803b449a2c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -54,7 +54,8 @@ enum { TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), - TIPC_TUNNEL_ENHANCED = (1 << 9) + TIPC_TUNNEL_ENHANCED = (1 << 9), + TIPC_NAGLE = (1 << 10) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -66,16 +67,27 @@ enum { TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ - TIPC_TUNNEL_ENHANCED) + TIPC_TUNNEL_ENHANCED | \ + TIPC_NAGLE) + #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_get_addr(struct tipc_node *node); +char *tipc_node_get_id_str(struct tipc_node *node); +void tipc_node_put(struct tipc_node *node); +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary); +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n); +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos); +#endif u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -92,7 +104,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -107,4 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +#ifdef CONFIG_TIPC_CRYPTO +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info); +#endif +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 4b92b196cfa6..5d7859aac78e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -75,6 +75,7 @@ struct sockaddr_pair { * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links @@ -97,6 +98,7 @@ struct tipc_sock { u32 conn_instance; int published; u32 max_pkt; + u32 maxnagle; u32 portid; struct tipc_msg phdr; struct list_head cong_links; @@ -116,6 +118,10 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + u32 oneway; + u16 snd_backlog; + bool expect_ack; + bool nodelay; bool group_is_open; }; @@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +static void tipc_sk_push_backlog(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) return 1; } +/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle + */ +static void tsk_set_nagle(struct tipc_sock *tsk) +{ + struct sock *sk = &tsk->sk; + + tsk->maxnagle = 0; + if (sk->sk_type != SOCK_STREAM) + return; + if (tsk->nodelay) + return; + if (!(tsk->peer_caps & TIPC_NAGLE)) + return; + /* Limit node local buffer size to avoid receive queue overflow */ + if (tsk->max_pkt == MAX_MSG_SIZE) + tsk->maxnagle = 1500; + else + tsk->maxnagle = tsk->max_pkt; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -512,8 +540,12 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Remove any pending SYN message */ - __skb_queue_purge(&sk->sk_write_queue); + /* Push out unsent messages or remove if pending SYN */ + skb = skb_peek(&sk->sk_write_queue); + if (skb && !msg_is_syn(buf_msg(skb))) + tipc_sk_push_backlog(tsk); + else + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -854,7 +886,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1208,6 +1240,27 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_sk_rcv(net, inputq); } +/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +static void tipc_sk_push_backlog(struct tipc_sock *tsk) +{ + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); + int rc; + + if (skb_queue_empty(txq) || tsk->cong_link_cnt) + return; + + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; + tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +} + /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket @@ -1221,7 +1274,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); - bool conn_cong; + bool was_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) { @@ -1254,11 +1307,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { - conn_cong = tsk_conn_cong(tsk); + was_cong = tsk_conn_cong(tsk); + tsk->expect_ack = false; + tipc_sk_push_backlog(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); - if (conn_cong) + if (was_cong && !tsk_conn_cong(tsk)) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); @@ -1388,7 +1443,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return rc; __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1437,15 +1492,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct sk_buff_head *txq = &sk->sk_write_queue; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; int send, sent = 0; - int rc = 0; - - __skb_queue_head_init(&pkts); + int blocks, rc = 0; if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; @@ -1467,21 +1522,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tipc_sk_connected(sk))); if (unlikely(rc)) break; - send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); - if (unlikely(rc != send)) - break; - - trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + blocks = tsk->snd_backlog; + if (tsk->oneway++ >= 4 && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; + } + tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) + break; + blocks += tsk_inc(tsk, send + MIN_H_SIZE); + } + trace_tipc_sk_sendstream(sk, skb_peek(txq), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + tsk->snt_unacked += blocks; + tsk->snd_backlog = 0; sent += send; } } while (sent < dlen && !rc); @@ -1526,8 +1595,9 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + tsk_set_nagle(tsk); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1848,6 +1918,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; + bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1892,6 +1963,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { + ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -1919,7 +1991,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -1990,6 +2062,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; + tipc_sk_push_backlog(tsk); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2029,6 +2102,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (unlikely(msg_mcast(hdr))) return false; + tsk->oneway = 0; switch (sk->sk_state) { case TIPC_CONNECTING: @@ -2074,6 +2148,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; return false; case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) + tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) return true; @@ -2959,6 +3035,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: + case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) @@ -3007,6 +3084,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; + case TIPC_NODELAY: + tsk->nodelay = !!value; + tsk_set_nagle(tsk); + break; default: res = -EINVAL; } @@ -3588,13 +3669,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 6159d327db76..58ab3d6dcdce 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -35,6 +35,7 @@ #include "core.h" #include "trace.h" +#include "crypto.h" #include <linux/sysctl.h> @@ -64,6 +65,16 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_TIPC_CRYPTO + { + .procname = "max_tfms", + .data = &sysctl_tipc_max_tfms, + .maxlen = sizeof(sysctl_tipc_max_tfms), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif {} }; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 287df68721df..86aaa4d3e781 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -372,6 +372,7 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto out; if (b && test_bit(0, &b->up)) { + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } @@ -448,15 +449,11 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; - struct nlattr **attrs; char *bname; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index e4328b3b72eb..61ec78521a60 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -26,3 +26,13 @@ config TLS_DEVICE Enable kernel support for HW offload of the TLS protocol. If unsure, say N. + +config TLS_TOE + bool "Transport Layer Security TCP stack bypass" + depends on TLS + default n + help + Enable kernel support for legacy HW offload of the TLS protocol, + which is incompatible with the Linux networking stack semantics. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index ef0dc74ce8f9..f1ffbfe8968d 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -3,8 +3,11 @@ # Makefile for the TLS subsystem. # +CFLAGS_trace.o := -I$(src) + obj-$(CONFIG_TLS) += tls.o -tls-y := tls_main.o tls_sw.o +tls-y := tls_main.o tls_sw.o tls_proc.o trace.o +tls-$(CONFIG_TLS_TOE) += tls_toe.o tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 683d00837693..0683788bbef0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -38,6 +38,8 @@ #include <net/tcp.h> #include <net/tls.h> +#include "trace.h" + /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ @@ -202,6 +204,15 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); + WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); +} +EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); + static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { @@ -216,6 +227,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, rcd_sn = tls_ctx->tx.rec_seq; + trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = tls_ctx->netdev; if (netdev) @@ -419,7 +431,7 @@ static int tls_push_data(struct sock *sk, ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) return -ENOTSUPP; - if (sk->sk_err) + if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; @@ -440,9 +452,8 @@ static int tls_push_data(struct sock *sk, max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { - rc = tls_do_allocation(sk, ctx, pfrag, - prot->prepend_size); - if (rc) { + rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); + if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; @@ -645,15 +656,19 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) return; + + trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); netdev = READ_ONCE(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) @@ -661,8 +676,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; - u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -691,8 +706,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ - if (tcp_inq(sk) > rcd_len) + sock_data = tcp_inq(sk); + if (sock_data > rcd_len) { + trace_tls_device_rx_resync_nh_delay(sk, sock_data, + rcd_len); return; + } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; @@ -736,6 +755,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { + trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -834,9 +854,9 @@ free_buf: return err; } -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { - struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); int is_decrypted = skb->decrypted; int is_encrypted = !is_decrypted; @@ -848,6 +868,10 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) is_encrypted &= !skb_iter->decrypted; } + trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, + tls_ctx->rx.rec_seq, rxm->full_len, + is_encrypted, is_decrypted); + ctx->sw.decrypted |= is_decrypted; /* Return immediately if the record is either entirely plaintext or @@ -1021,6 +1045,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, + tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; @@ -1057,6 +1083,7 @@ free_marker_record: int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { + struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; @@ -1104,6 +1131,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); + info = (void *)&ctx->crypto_recv.info; + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, + tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 0775ae40fcfb..5bb93dd5762b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -41,7 +41,9 @@ #include <linux/inetdevice.h> #include <linux/inet_diag.h> +#include <net/snmp.h> #include <net/tls.h> +#include <net/tls_toe.h> MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); @@ -58,14 +60,12 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto *saved_tcpv4_prot; static DEFINE_MUTEX(tcpv4_prot_mutex); -static LIST_HEAD(device_list); -static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base); -static void update_sk_prot(struct sock *sk, struct tls_context *ctx) +void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; @@ -287,14 +287,19 @@ static void tls_sk_proto_cleanup(struct sock *sk, kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_release_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } - if (ctx->rx_conf == TLS_SW) + if (ctx->rx_conf == TLS_SW) { tls_sw_release_resources_rx(sk); - else if (ctx->rx_conf == TLS_HW) + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } else if (ctx->rx_conf == TLS_HW) { tls_device_offload_cleanup_rx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } } static void tls_sk_proto_close(struct sock *sk, long timeout) @@ -535,19 +540,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, if (tx) { rc = tls_set_device_offload(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 1); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); conf = TLS_SW; } } else { rc = tls_set_device_offload_rx(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); conf = TLS_SW; } tls_sw_strparser_arm(sk, ctx); @@ -604,7 +619,7 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } -static struct tls_context *create_ctx(struct sock *sk) +struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; @@ -645,90 +660,6 @@ static void tls_build_proto(struct sock *sk) } } -static void tls_hw_sk_destruct(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - ctx->sk_destruct(sk); - /* Free ctx */ - rcu_assign_pointer(icsk->icsk_ulp_data, NULL); - tls_ctx_free(sk, ctx); -} - -static int tls_hw_prot(struct sock *sk) -{ - struct tls_context *ctx; - struct tls_device *dev; - int rc = 0; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->feature && dev->feature(dev)) { - ctx = create_ctx(sk); - if (!ctx) - goto out; - - spin_unlock_bh(&device_spinlock); - tls_build_proto(sk); - ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_hw_sk_destruct; - ctx->rx_conf = TLS_HW_RECORD; - ctx->tx_conf = TLS_HW_RECORD; - update_sk_prot(sk, ctx); - spin_lock_bh(&device_spinlock); - rc = 1; - break; - } - } -out: - spin_unlock_bh(&device_spinlock); - return rc; -} - -static void tls_hw_unhash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->unhash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - dev->unhash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - ctx->sk_proto->unhash(sk); -} - -static int tls_hw_hash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - int err; - - err = ctx->sk_proto->hash(sk); - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->hash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - err |= dev->hash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - - if (err) - tls_hw_unhash(sk); - return err; -} - static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base) { @@ -766,10 +697,11 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif - +#ifdef CONFIG_TLS_TOE prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash; +#endif } static int tls_init(struct sock *sk) @@ -777,8 +709,12 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; - if (tls_hw_prot(sk)) + tls_build_proto(sk); + +#ifdef CONFIG_TLS_TOE + if (tls_toe_bypass(sk)) return 0; +#endif /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. @@ -789,11 +725,9 @@ static int tls_init(struct sock *sk) if (sk->sk_state != TCP_ESTABLISHED) return -ENOTSUPP; - tls_build_proto(sk); - /* allocate tls context */ write_lock_bh(&sk->sk_callback_lock); - ctx = create_ctx(sk); + ctx = tls_ctx_create(sk); if (!ctx) { rc = -ENOMEM; goto out; @@ -879,21 +813,34 @@ static size_t tls_get_info_size(const struct sock *sk) return size; } -void tls_register_device(struct tls_device *device) +static int __net_init tls_init_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_add_tail(&device->dev_list, &device_list); - spin_unlock_bh(&device_spinlock); + int err; + + net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib); + if (!net->mib.tls_statistics) + return -ENOMEM; + + err = tls_proc_init(net); + if (err) + goto err_free_stats; + + return 0; +err_free_stats: + free_percpu(net->mib.tls_statistics); + return err; } -EXPORT_SYMBOL(tls_register_device); -void tls_unregister_device(struct tls_device *device) +static void __net_exit tls_exit_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_del(&device->dev_list); - spin_unlock_bh(&device_spinlock); + tls_proc_fini(net); + free_percpu(net->mib.tls_statistics); } -EXPORT_SYMBOL(tls_unregister_device); + +static struct pernet_operations tls_proc_ops = { + .init = tls_init_net, + .exit = tls_exit_net, +}; static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", @@ -906,6 +853,12 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init tls_register(void) { + int err; + + err = register_pernet_subsys(&tls_proc_ops); + if (err) + return err; + tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; @@ -919,6 +872,7 @@ static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); tls_device_cleanup(); + unregister_pernet_subsys(&tls_proc_ops); } module_init(tls_register); diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c new file mode 100644 index 000000000000..83d9c80a684e --- /dev/null +++ b/net/tls/tls_proc.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/snmp.h> +#include <net/tls.h> + +static const struct snmp_mib tls_mib_list[] = { + SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), + SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), + SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), + SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), + SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), + SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), + SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), + SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), + SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), + SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_SENTINEL +}; + +static int tls_statistics_seq_show(struct seq_file *seq, void *v) +{ + unsigned long buf[LINUX_MIB_TLSMAX] = {}; + struct net *net = seq->private; + int i; + + snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); + for (i = 0; tls_mib_list[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); + + return 0; +} + +int __net_init tls_proc_init(struct net *net) +{ + if (!proc_create_net_single("tls_stat", 0444, net->proc_net, + tls_statistics_seq_show, NULL)) + return -ENOMEM; + return 0; +} + +void __net_exit tls_proc_fini(struct net *net) +{ + remove_proc_entry("tls_stat", net->proc_net); +} diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 446f23c1f3ce..141da093ff04 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) /* Propagate if there was an err */ if (err) { + if (err == -EBADMSG) + TLS_INC_STATS(sock_net(skb->sk), + LINUX_MIB_TLSDECRYPTERROR); ctx->async_wait.err = err; tls_err_abort(skb->sk, err); } else { @@ -253,6 +256,8 @@ static int tls_do_decryption(struct sock *sk, return ret; ret = crypto_wait_req(ret, &ctx->async_wait); + } else if (ret == -EBADMSG) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); } if (async) @@ -1481,7 +1486,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (!ctx->decrypted) { if (tls_ctx->rx_conf == TLS_HW) { - err = tls_device_decrypted(sk, skb); + err = tls_device_decrypted(sk, tls_ctx, skb, rxm); if (err < 0) return err; } @@ -1509,7 +1514,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - ctx->decrypted = true; + ctx->decrypted = 1; ctx->saved_data_ready(sk); } else { *zc = false; @@ -1919,7 +1924,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, tls_err_abort(sk, EBADMSG); goto splice_read_end; } - ctx->decrypted = true; + ctx->decrypted = 1; } rxm = strp_msg(skb); @@ -2020,7 +2025,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - ctx->decrypted = false; + ctx->decrypted = 0; ctx->recv_pkt = skb; strp_pause(strp); @@ -2376,10 +2381,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = false; + sw_ctx_rx->async_capable = 0; else sw_ctx_rx->async_capable = - tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + !!(tfm->__crt_alg->cra_flags & + CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c new file mode 100644 index 000000000000..7e1330f19165 --- /dev/null +++ b/net/tls/tls_toe.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <net/inet_connection_sock.h> +#include <net/tls.h> +#include <net/tls_toe.h> + +static LIST_HEAD(device_list); +static DEFINE_SPINLOCK(device_spinlock); + +static void tls_toe_sk_destruct(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx = tls_get_ctx(sk); + + ctx->sk_destruct(sk); + /* Free ctx */ + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tls_ctx_free(sk, ctx); +} + +int tls_toe_bypass(struct sock *sk) +{ + struct tls_toe_device *dev; + struct tls_context *ctx; + int rc = 0; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = tls_ctx_create(sk); + if (!ctx) + goto out; + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_toe_sk_destruct; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + spin_unlock_bh(&device_spinlock); + return rc; +} + +void tls_toe_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + dev->unhash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + ctx->sk_proto->unhash(sk); +} + +int tls_toe_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + int err; + + err = ctx->sk_proto->hash(sk); + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + err |= dev->hash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + + if (err) + tls_toe_unhash(sk); + return err; +} + +void tls_toe_register_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_add_tail(&device->dev_list, &device_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_register_device); + +void tls_toe_unregister_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_del(&device->dev_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_unregister_device); diff --git a/net/tls/trace.c b/net/tls/trace.c new file mode 100644 index 000000000000..e374913cf9c9 --- /dev/null +++ b/net/tls/trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/tls/trace.h b/net/tls/trace.h new file mode 100644 index 000000000000..9ba5f600ea43 --- /dev/null +++ b/net/tls/trace.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tls + +#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _TLS_TRACE_H_ + +#include <asm/unaligned.h> +#include <linux/tracepoint.h> + +struct sock; + +TRACE_EVENT(tls_device_offload_set, + + TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret), + + TP_ARGS(sk, dir, tcp_seq, rec_no, ret), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( int, dir ) + __field( u32, tcp_seq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->dir = dir; + __entry->tcp_seq = tcp_seq; + __entry->ret = ret; + ), + + TP_printk( + "sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d", + __entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no, + __entry->ret + ) +); + +TRACE_EVENT(tls_device_decrypted, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, u32 rec_len, + bool encrypted, bool decrypted), + + TP_ARGS(sk, tcp_seq, rec_no, rec_len, encrypted, decrypted), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( u32, rec_len ) + __field( bool, encrypted ) + __field( bool, decrypted ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->rec_len = rec_len; + __entry->encrypted = encrypted; + __entry->decrypted = decrypted; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu len=%u encrypted=%d decrypted=%d", + __entry->sk, __entry->tcp_seq, + __entry->rec_no, __entry->rec_len, + __entry->encrypted, __entry->decrypted + ) +); + +TRACE_EVENT(tls_device_rx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type), + + TP_ARGS(sk, tcp_seq, rec_no, sync_type), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( int, sync_type ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->sync_type = sync_type; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu sync_type=%d", + __entry->sk, __entry->tcp_seq, __entry->rec_no, + __entry->sync_type + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_schedule, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + ), + + TP_fast_assign( + __entry->sk = sk; + ), + + TP_printk( + "sk=%p", __entry->sk + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_delay, + + TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len), + + TP_ARGS(sk, sock_data, rec_len), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, sock_data ) + __field( u32, rec_len ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->sock_data = sock_data; + __entry->rec_len = rec_len; + ), + + TP_printk( + "sk=%p sock_data=%u rec_len=%u", + __entry->sk, __entry->sock_data, __entry->rec_len + ) +); + +TRACE_EVENT(tls_device_tx_resync_req, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq), + + TP_ARGS(sk, tcp_seq, exp_tcp_seq), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, tcp_seq ) + __field( u32, exp_tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->tcp_seq = tcp_seq; + __entry->exp_tcp_seq = exp_tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u exp_tcp_seq=%u", + __entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq + ) +); + +TRACE_EVENT(tls_device_tx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no), + + TP_ARGS(sk, tcp_seq, rec_no), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu", + __entry->sk, __entry->tcp_seq, __entry->rec_no + ) +); + +#endif /* _TLS_TRACE_H_ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include <trace/define_trace.h> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0d8da809bea2..193cba2d777b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -284,11 +284,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) - goto found; + return s; } - s = NULL; -found: - return s; + return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 582a3e4dfce2..1f4fde4711b6 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -439,7 +439,7 @@ static void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -641,7 +641,6 @@ EXPORT_SYMBOL_GPL(__vsock_create); static void __vsock_release(struct sock *sk, int level) { if (sk) { - struct sk_buff *skb; struct sock *pending; struct vsock_sock *vsk; @@ -662,8 +661,7 @@ static void __vsock_release(struct sock *sk, int level) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; - while ((skb = skb_dequeue(&sk->sk_receive_queue))) - kfree_skb(skb); + skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { @@ -1301,7 +1299,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, err = -listener->sk_err; if (connected) { - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index c443db7af8d4..7fa09c5e4625 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -13,15 +13,16 @@ #include <linux/hyperv.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <asm/hyperv-tlfs.h> /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some - * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer - * hosts don't have this limitation; but, keep the defaults the same for compat. + * stricter requirements on the hv_sock ring buffer size of six 4K pages. + * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this + * limitation; but, keep the defaults the same for compat. */ -#define PAGE_SIZE_4K 4096 -#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) +#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -54,7 +55,8 @@ struct hvs_recv_buf { * ringbuffer APIs that allow us to directly copy data from userspace buffer * to VMBus ringbuffer. */ -#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) +#define HVS_SEND_BUF_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) struct hvs_send_buf { /* The header before the payload data */ @@ -393,10 +395,10 @@ static void hvs_open_connection(struct vmbus_channel *chan) } else { sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); - sndbuf = ALIGN(sndbuf, PAGE_SIZE); + sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); - rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); } ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, @@ -426,7 +428,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (conn_from_host) { new->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); hvs_addr_init(&vnew->local_addr, if_type); hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); @@ -670,7 +672,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, ssize_t ret = 0; ssize_t bytes_written = 0; - BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); + BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); if (!send_buf) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index fb2060dffb0a..828edd88488c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -268,6 +268,55 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, } static ssize_t +virtio_transport_stream_do_peek(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0, off; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + + list_for_each_entry(pkt, &vvs->rx_queue, list) { + off = pkt->off; + + if (total == len) + break; + + while (total < len && off < pkt->len) { + bytes = len - total; + if (bytes > pkt->len - off) + bytes = pkt->len - off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + off += bytes; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; + +out: + if (total) + err = total; + return err; +} + +static ssize_t virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) @@ -339,9 +388,9 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, size_t len, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); + return virtio_transport_stream_do_peek(vsk, msg, len); + else + return virtio_transport_stream_do_dequeue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); @@ -1019,7 +1068,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return -ENOMEM; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); lock_sock_nested(child, SINGLE_DEPTH_NESTING); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8c9c4ed90fa7..6ba98a1efe2e 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1098,7 +1098,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } vsock_add_pending(sk, pending); - sk->sk_ack_backlog++; + sk_acceptq_added(sk); pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b72286922f7..da5262b2298b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -624,6 +624,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), + [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), }; /* policy for the key attributes */ @@ -3940,6 +3941,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) key.type != NL80211_KEYTYPE_GROUP) return -EINVAL; + if (key.type == NL80211_KEYTYPE_GROUP && + info->attrs[NL80211_ATTR_VLAN_ID]) + key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (!rdev->ops->add_key) return -EOPNOTSUPP; @@ -5711,6 +5716,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_STA_AID]) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); @@ -5856,6 +5864,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { params.support_p2p_ps = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); @@ -8265,10 +8276,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, /* leave request id zero for legacy request * or if driver does not support multi-scheduled scan */ - if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { - while (!sched_scan_req->reqid) - sched_scan_req->reqid = cfg80211_assign_cookie(rdev); - } + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) + sched_scan_req->reqid = cfg80211_assign_cookie(rdev); err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) diff --git a/net/wireless/reg.h b/net/wireless/reg.h index dc8f689bd469..f9e83031a40a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, u8 country_ie_len); /** - * regulatory_hint_disconnect - informs all devices have been disconneted + * regulatory_hint_disconnect - informs all devices have been disconnected * * Regulotory rules can be enhanced further upon scanning and upon * connection to an AP. These rules become stale if we disconnect diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6aee9f5e8e71..c34f7d077604 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: @@ -1062,7 +1062,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); x25_insert_socket(make); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9044073fbf22..6040bc2b0088 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -196,7 +196,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len; @@ -212,7 +212,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } -void xsk_flush(struct xdp_sock *xs) +static void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); xs->sk.sk_data_ready(&xs->sk); @@ -264,6 +264,35 @@ out_unlock: return err; } +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del_clearprev(&xs->flush_node); + } +} + void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { xskq_produce_flush_addr_n(umem->cq, nb_entries); |